//===-- PPCISelLowering.cpp - PPC DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements the PPCISelLowering class.
//
//===----------------------------------------------------------------------===//

#include "PPCISelLowering.h"
#include "MCTargetDesc/PPCPredicates.h"
#include "PPCMachineFunctionInfo.h"
#include "PPCPerfectShuffle.h"
#include "PPCTargetMachine.h"
#include "PPCTargetObjectFile.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Triple.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetOptions.h"
using namespace llvm;

static cl::opt<bool> DisablePPCPreinc("disable-ppc-preinc",
cl::desc("disable preincrement load/store generation on PPC"), cl::Hidden);

static cl::opt<bool> DisableILPPref("disable-ppc-ilp-pref",
cl::desc("disable setting the node scheduling preference to ILP on PPC"), cl::Hidden);

static cl::opt<bool> DisablePPCUnaligned("disable-ppc-unaligned",
cl::desc("disable unaligned load/store generation on PPC"), cl::Hidden);

// FIXME: Remove this once the bug has been fixed!
extern cl::opt<bool> ANDIGlueBug;

static TargetLoweringObjectFile *createTLOF(const Triple &TT) {
  // If it isn't a Mach-O file then it's going to be a Linux ELF
  // object file.
  if (TT.isOSDarwin())
    return new TargetLoweringObjectFileMachO();

  return new PPC64LinuxTargetObjectFile();
}

PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
    : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))),
      Subtarget(*TM.getSubtargetImpl()) {
  setPow2DivIsCheap();

  // Use _setjmp/_longjmp instead of setjmp/longjmp.
  setUseUnderscoreSetJmp(true);
  setUseUnderscoreLongJmp(true);

  // On PPC32/64, arguments smaller than 4/8 bytes are extended, so all
  // arguments are at least 4/8 bytes aligned.
  bool isPPC64 = Subtarget.isPPC64();
  setMinStackArgumentAlignment(isPPC64 ? 8 : 4);

  // Set up the register classes.
  addRegisterClass(MVT::i32, &PPC::GPRCRegClass);
  addRegisterClass(MVT::f32, &PPC::F4RCRegClass);
  addRegisterClass(MVT::f64, &PPC::F8RCRegClass);

  // PowerPC has an i16 but no i8 (or i1) SEXTLOAD.
  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
  setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Expand);

  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  // PowerPC has pre-inc loads and stores.
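  // (These select to the update-form instructions, e.g. lwzu/stwu, which
  // write the computed effective address back into the base register.)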
  setIndexedLoadAction(ISD::PRE_INC, MVT::i1, Legal);
  setIndexedLoadAction(ISD::PRE_INC, MVT::i8, Legal);
  setIndexedLoadAction(ISD::PRE_INC, MVT::i16, Legal);
  setIndexedLoadAction(ISD::PRE_INC, MVT::i32, Legal);
  setIndexedLoadAction(ISD::PRE_INC, MVT::i64, Legal);
  setIndexedStoreAction(ISD::PRE_INC, MVT::i1, Legal);
  setIndexedStoreAction(ISD::PRE_INC, MVT::i8, Legal);
  setIndexedStoreAction(ISD::PRE_INC, MVT::i16, Legal);
  setIndexedStoreAction(ISD::PRE_INC, MVT::i32, Legal);
  setIndexedStoreAction(ISD::PRE_INC, MVT::i64, Legal);
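
  // When i1 values are tracked in condition-register bits (the default at
  // -O2 and above), each CR bit acts as a separate 1-bit register with
  // native and/or/xor support, avoiding costly CR <-> GPR moves.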
  if (Subtarget.useCRBits()) {
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);

    if (isPPC64 || Subtarget.hasFPCVT()) {
      setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
      AddPromotedToType (ISD::SINT_TO_FP, MVT::i1,
                         isPPC64 ? MVT::i64 : MVT::i32);
      setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
      AddPromotedToType (ISD::UINT_TO_FP, MVT::i1,
                         isPPC64 ? MVT::i64 : MVT::i32);
    } else {
      setOperationAction(ISD::SINT_TO_FP, MVT::i1, Custom);
      setOperationAction(ISD::UINT_TO_FP, MVT::i1, Custom);
    }

    // PowerPC does not support direct load / store of condition registers.
    setOperationAction(ISD::LOAD, MVT::i1, Custom);
    setOperationAction(ISD::STORE, MVT::i1, Custom);

    // FIXME: Remove this once the ANDI glue bug is fixed:
    if (ANDIGlueBug)
      setOperationAction(ISD::TRUNCATE, MVT::i1, Custom);

    setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
    setTruncStoreAction(MVT::i64, MVT::i1, Expand);
    setTruncStoreAction(MVT::i32, MVT::i1, Expand);
    setTruncStoreAction(MVT::i16, MVT::i1, Expand);
    setTruncStoreAction(MVT::i8, MVT::i1, Expand);

    addRegisterClass(MVT::i1, &PPC::CRBITRCRegClass);
  }

  // This is used in the ppcf128->int sequence. Note it has different semantics
  // from FP_ROUND: that rounds to nearest, this rounds to zero.
  setOperationAction(ISD::FP_ROUND_INREG, MVT::ppcf128, Custom);

  // We do not currently implement these libm ops for PowerPC.
  setOperationAction(ISD::FFLOOR, MVT::ppcf128, Expand);
  setOperationAction(ISD::FCEIL, MVT::ppcf128, Expand);
  setOperationAction(ISD::FTRUNC, MVT::ppcf128, Expand);
  setOperationAction(ISD::FRINT, MVT::ppcf128, Expand);
  setOperationAction(ISD::FNEARBYINT, MVT::ppcf128, Expand);
  setOperationAction(ISD::FREM, MVT::ppcf128, Expand);

  // PowerPC has no SREM/UREM instructions.
  setOperationAction(ISD::SREM, MVT::i32, Expand);
  setOperationAction(ISD::UREM, MVT::i32, Expand);
  setOperationAction(ISD::SREM, MVT::i64, Expand);
  setOperationAction(ISD::UREM, MVT::i64, Expand);
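  // (With SDIVREM/UDIVREM also expanded below, the legalizer lowers a
  // remainder as roughly x - (x / y) * y, e.g. divw/mullw/subf for i32.)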

  // Don't use SMUL_LOHI/UMUL_LOHI or SDIVREM/UDIVREM to lower SREM/UREM.
  setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
  setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
  setOperationAction(ISD::SDIVREM, MVT::i64, Expand);

  // We don't support sin/cos/sqrt/fmod/pow.
  setOperationAction(ISD::FSIN, MVT::f64, Expand);
  setOperationAction(ISD::FCOS, MVT::f64, Expand);
  setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FMA, MVT::f64, Legal);
  setOperationAction(ISD::FSIN, MVT::f32, Expand);
  setOperationAction(ISD::FCOS, MVT::f32, Expand);
  setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FMA, MVT::f32, Legal);

  setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);

  // If we're enabling GP optimizations, use hardware square root.
  if (!Subtarget.hasFSQRT() &&
      !(TM.Options.UnsafeFPMath &&
        Subtarget.hasFRSQRTE() && Subtarget.hasFRE()))
    setOperationAction(ISD::FSQRT, MVT::f64, Expand);

  if (!Subtarget.hasFSQRT() &&
      !(TM.Options.UnsafeFPMath &&
        Subtarget.hasFRSQRTES() && Subtarget.hasFRES()))
    setOperationAction(ISD::FSQRT, MVT::f32, Expand);
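  // (FRSQRTE/FRE produce only estimates of 1/sqrt(x) and 1/x, which must be
  // refined by Newton-Raphson iterations; the result is not correctly
  // rounded, hence the UnsafeFPMath gate.)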

  if (Subtarget.hasFCPSGN()) {
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Legal);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Legal);
  } else {
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
  }

  if (Subtarget.hasFPRND()) {
    setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
    setOperationAction(ISD::FCEIL,  MVT::f64, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
    setOperationAction(ISD::FROUND, MVT::f64, Legal);

    setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
    setOperationAction(ISD::FCEIL,  MVT::f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
    setOperationAction(ISD::FROUND, MVT::f32, Legal);
  }

  // PowerPC does not have BSWAP, CTPOP or CTTZ.
  setOperationAction(ISD::BSWAP, MVT::i32, Expand);
  setOperationAction(ISD::CTTZ,  MVT::i32, Expand);
  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
  setOperationAction(ISD::BSWAP, MVT::i64, Expand);
  setOperationAction(ISD::CTTZ,  MVT::i64, Expand);
  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);

  if (Subtarget.hasPOPCNTD()) {
    setOperationAction(ISD::CTPOP, MVT::i32, Legal);
    setOperationAction(ISD::CTPOP, MVT::i64, Legal);
  } else {
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);
    setOperationAction(ISD::CTPOP, MVT::i64, Expand);
  }

  // PowerPC does not have ROTR.
  setOperationAction(ISD::ROTR, MVT::i32, Expand);
  setOperationAction(ISD::ROTR, MVT::i64, Expand);

  if (!Subtarget.useCRBits()) {
    // PowerPC does not have Select.
    setOperationAction(ISD::SELECT, MVT::i32, Expand);
    setOperationAction(ISD::SELECT, MVT::i64, Expand);
    setOperationAction(ISD::SELECT, MVT::f32, Expand);
    setOperationAction(ISD::SELECT, MVT::f64, Expand);
  }

  // PowerPC wants to turn select_cc of FP into fsel when possible.
  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
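  // (fsel picks between two FP registers based on whether a third is >= 0.0,
  // so a select_cc like (a >= b) ? x : y can map to fsel on (a - b) without
  // a branch, modulo NaN caveats.)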

  // PowerPC wants to optimize integer setcc a bit.
  if (!Subtarget.useCRBits())
    setOperationAction(ISD::SETCC, MVT::i32, Custom);
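  // (For example, (i32 seteq X, 13) can be generated as
  //   addi r2, r3, -13; cntlzw r2, r2; srwi r3, r2, 5
  // rather than cmpwi/mfcr/rlwinm.)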

  // PowerPC does not have BRCOND, which requires SetCC.
  if (!Subtarget.useCRBits())
    setOperationAction(ISD::BRCOND, MVT::Other, Expand);

  setOperationAction(ISD::BR_JT, MVT::Other, Expand);

  // PowerPC turns FP_TO_SINT into FCTIWZ and some load/stores.
  setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
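  // (fctiwz leaves the converted integer in an FPR, so the lowering stores
  // it to a stack slot and reloads the low word into a GPR.)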

  // PowerPC does not have [U|S]INT_TO_FP.
  setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand);
  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);

  setOperationAction(ISD::BITCAST, MVT::f32, Expand);
  setOperationAction(ISD::BITCAST, MVT::i32, Expand);
  setOperationAction(ISD::BITCAST, MVT::i64, Expand);
  setOperationAction(ISD::BITCAST, MVT::f64, Expand);

  // We cannot sextinreg(i1). Expand to shifts.
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
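  // (i.e. a sign_extend_inreg from i1 in an i32 becomes a shl by 31
  // followed by an arithmetic shr by 31.)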

  // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
  // SjLj exception handling but a light-weight setjmp/longjmp replacement to
  // support continuations, user-level threading, etc. As a result, no other
  // SjLj exception interfaces are implemented, so please don't build your
  // own exception handling on top of them.
  // LLVM/Clang supports zero-cost DWARF exception handling.
  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);

  // We want to legalize GlobalAddress and ConstantPool nodes into the
  // appropriate instructions to materialize the address.
  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
  setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
  setOperationAction(ISD::ConstantPool, MVT::i32, Custom);
  setOperationAction(ISD::JumpTable, MVT::i32, Custom);
  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
  setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
  setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
  setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
  setOperationAction(ISD::JumpTable, MVT::i64, Custom);

  // TRAP is legal.
  setOperationAction(ISD::TRAP, MVT::Other, Legal);

  // TRAMPOLINE is custom lowered.
  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex.
  setOperationAction(ISD::VASTART, MVT::Other, Custom);

  if (Subtarget.isSVR4ABI()) {
    if (isPPC64) {
      // VAARG always uses double-word chunks, so promote anything smaller.
      setOperationAction(ISD::VAARG, MVT::i1, Promote);
      AddPromotedToType (ISD::VAARG, MVT::i1, MVT::i64);
      setOperationAction(ISD::VAARG, MVT::i8, Promote);
      AddPromotedToType (ISD::VAARG, MVT::i8, MVT::i64);
      setOperationAction(ISD::VAARG, MVT::i16, Promote);
      AddPromotedToType (ISD::VAARG, MVT::i16, MVT::i64);
      setOperationAction(ISD::VAARG, MVT::i32, Promote);
      AddPromotedToType (ISD::VAARG, MVT::i32, MVT::i64);
      setOperationAction(ISD::VAARG, MVT::Other, Expand);
    } else {
      // VAARG is custom lowered with the 32-bit SVR4 ABI.
      setOperationAction(ISD::VAARG, MVT::Other, Custom);
      setOperationAction(ISD::VAARG, MVT::i64, Custom);
    }
  } else
    setOperationAction(ISD::VAARG, MVT::Other, Expand);

  if (Subtarget.isSVR4ABI() && !isPPC64)
    // VACOPY is custom lowered with the 32-bit SVR4 ABI.
    setOperationAction(ISD::VACOPY, MVT::Other, Custom);
  else
    setOperationAction(ISD::VACOPY, MVT::Other, Expand);

  // Use the default implementation.
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Custom);
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);

  // We want to custom lower some of our intrinsics.
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

  // To handle counter-based loop conditions.
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i1, Custom);
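  // (A late IR-level pass inserts these intrinsics; they are ultimately
  // lowered to the CTR-based decrement-and-branch instructions.)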

  // Comparisons that require checking two conditions.
  setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETULT, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUGT, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOGE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOGE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOLE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETONE, MVT::f64, Expand);

  if (Subtarget.has64BitSupport()) {
    // They also have instructions for converting between i64 and fp.
    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
    setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
    // This is just the low 32 bits of a (signed) fp->i64 conversion.
    // We cannot do this with Promote because i64 is not a legal type.
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);

    if (Subtarget.hasLFIWAX() || Subtarget.isPPC64())
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
  } else {
    // PowerPC does not have FP_TO_UINT on 32-bit implementations.
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
  }

  // With the instructions enabled under FPCVT, we can do everything.
  if (Subtarget.hasFPCVT()) {
    if (Subtarget.has64BitSupport()) {
      setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
      setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
      setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
    }

    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  }

  if (Subtarget.use64BitRegs()) {
    // 64-bit PowerPC implementations can support i64 types directly.
    addRegisterClass(MVT::i64, &PPC::G8RCRegClass);
    // BUILD_PAIR can't be handled natively, and should be expanded to shl/or.
    setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
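    // (i.e. build_pair(lo, hi) becomes roughly
    //  or(shl(zext(hi), 32), zext(lo)).)
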
    // 64-bit PowerPC wants to expand i128 shifts itself.
    setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
    setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
  } else {
    // 32-bit PowerPC wants to expand i64 shifts itself.
    setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
    setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
    setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
  }

  if (Subtarget.hasAltivec()) {
    // First set operation action for all vector types to expand. Then we
    // will selectively turn on ones that can be effectively codegen'd.
    for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
         i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) {
      MVT::SimpleValueType VT = (MVT::SimpleValueType)i;

      // add/sub are legal for all supported vector VT's.
      setOperationAction(ISD::ADD, VT, Legal);
      setOperationAction(ISD::SUB, VT, Legal);

      // We promote all shuffles to v16i8.
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Promote);
      AddPromotedToType (ISD::VECTOR_SHUFFLE, VT, MVT::v16i8);
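      // (Altivec's vperm shuffles at byte granularity, selecting each result
      // byte from the 32 bytes of two source registers, so v16i8 is the
      // natural shuffle type.)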

      // We promote all non-typed operations to v4i32.
      setOperationAction(ISD::AND, VT, Promote);
      AddPromotedToType (ISD::AND, VT, MVT::v4i32);
      setOperationAction(ISD::OR , VT, Promote);
      AddPromotedToType (ISD::OR , VT, MVT::v4i32);
      setOperationAction(ISD::XOR, VT, Promote);
      AddPromotedToType (ISD::XOR, VT, MVT::v4i32);
      setOperationAction(ISD::LOAD, VT, Promote);
      AddPromotedToType (ISD::LOAD, VT, MVT::v4i32);
      setOperationAction(ISD::SELECT, VT, Promote);
      AddPromotedToType (ISD::SELECT, VT, MVT::v4i32);
      setOperationAction(ISD::STORE, VT, Promote);
      AddPromotedToType (ISD::STORE, VT, MVT::v4i32);

      // No other operations are legal.
      setOperationAction(ISD::MUL, VT, Expand);
      setOperationAction(ISD::SDIV, VT, Expand);
      setOperationAction(ISD::SREM, VT, Expand);
      setOperationAction(ISD::UDIV, VT, Expand);
      setOperationAction(ISD::UREM, VT, Expand);
      setOperationAction(ISD::FDIV, VT, Expand);
      setOperationAction(ISD::FREM, VT, Expand);
      setOperationAction(ISD::FNEG, VT, Expand);
      setOperationAction(ISD::FSQRT, VT, Expand);
      setOperationAction(ISD::FLOG, VT, Expand);
      setOperationAction(ISD::FLOG10, VT, Expand);
      setOperationAction(ISD::FLOG2, VT, Expand);
      setOperationAction(ISD::FEXP, VT, Expand);
      setOperationAction(ISD::FEXP2, VT, Expand);
      setOperationAction(ISD::FSIN, VT, Expand);
      setOperationAction(ISD::FCOS, VT, Expand);
      setOperationAction(ISD::FABS, VT, Expand);
      setOperationAction(ISD::FPOWI, VT, Expand);
      setOperationAction(ISD::FFLOOR, VT, Expand);
      setOperationAction(ISD::FCEIL, VT, Expand);
      setOperationAction(ISD::FTRUNC, VT, Expand);
      setOperationAction(ISD::FRINT, VT, Expand);
      setOperationAction(ISD::FNEARBYINT, VT, Expand);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
      setOperationAction(ISD::BUILD_VECTOR, VT, Expand);
      setOperationAction(ISD::UMUL_LOHI, VT, Expand);
      setOperationAction(ISD::SMUL_LOHI, VT, Expand);
      setOperationAction(ISD::UDIVREM, VT, Expand);
      setOperationAction(ISD::SDIVREM, VT, Expand);
      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand);
      setOperationAction(ISD::FPOW, VT, Expand);
      setOperationAction(ISD::BSWAP, VT, Expand);
      setOperationAction(ISD::CTPOP, VT, Expand);
      setOperationAction(ISD::CTLZ, VT, Expand);
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
      setOperationAction(ISD::CTTZ, VT, Expand);
      setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
      setOperationAction(ISD::VSELECT, VT, Expand);
      setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);

      for (unsigned j = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
           j <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++j) {
        MVT::SimpleValueType InnerVT = (MVT::SimpleValueType)j;
        setTruncStoreAction(VT, InnerVT, Expand);
      }
      setLoadExtAction(ISD::SEXTLOAD, VT, Expand);
      setLoadExtAction(ISD::ZEXTLOAD, VT, Expand);
      setLoadExtAction(ISD::EXTLOAD, VT, Expand);
    }

    // We can custom expand all VECTOR_SHUFFLEs to VPERM, others we can handle
    // with merges, splats, etc.
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i8, Custom);

    setOperationAction(ISD::AND, MVT::v4i32, Legal);
    setOperationAction(ISD::OR , MVT::v4i32, Legal);
    setOperationAction(ISD::XOR, MVT::v4i32, Legal);
    setOperationAction(ISD::LOAD, MVT::v4i32, Legal);
    setOperationAction(ISD::SELECT, MVT::v4i32,
                       Subtarget.useCRBits() ? Legal : Expand);
    setOperationAction(ISD::STORE, MVT::v4i32, Legal);
    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
    setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
    setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
    setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
    setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);

    addRegisterClass(MVT::v4f32, &PPC::VRRCRegClass);
    addRegisterClass(MVT::v4i32, &PPC::VRRCRegClass);
    addRegisterClass(MVT::v8i16, &PPC::VRRCRegClass);
    addRegisterClass(MVT::v16i8, &PPC::VRRCRegClass);

    setOperationAction(ISD::MUL, MVT::v4f32, Legal);
    setOperationAction(ISD::FMA, MVT::v4f32, Legal);

    if (TM.Options.UnsafeFPMath || Subtarget.hasVSX()) {
      setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
      setOperationAction(ISD::FSQRT, MVT::v4f32, Legal);
    }

    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v8i16, Custom);
    setOperationAction(ISD::MUL, MVT::v16i8, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Custom);

    setOperationAction(ISD::BUILD_VECTOR, MVT::v16i8, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);

    // Altivec does not contain unordered floating-point compare instructions.
    setCondCodeAction(ISD::SETUO, MVT::v4f32, Expand);
    setCondCodeAction(ISD::SETUEQ, MVT::v4f32, Expand);
    setCondCodeAction(ISD::SETUGT, MVT::v4f32, Expand);
    setCondCodeAction(ISD::SETUGE, MVT::v4f32, Expand);
    setCondCodeAction(ISD::SETULT, MVT::v4f32, Expand);
    setCondCodeAction(ISD::SETULE, MVT::v4f32, Expand);

    setCondCodeAction(ISD::SETO,   MVT::v4f32, Expand);
    setCondCodeAction(ISD::SETONE, MVT::v4f32, Expand);
|
[PowerPC] Initial support for the VSX instruction set
VSX is an ISA extension supported on the POWER7 and later cores that enhances
floating-point vector and scalar capabilities. Among other things, this adds
<2 x double> support and generally helps to reduce register pressure.
The interesting part of this ISA feature is the register configuration: there
are 64 new 128-bit vector registers, the first 32 of which are super-registers of the
existing 32 scalar floating-point registers, and the second 32 of which overlap
with the 32 Altivec vector registers. This makes things like vector insertion
and extraction tricky: this can be free but only if we force a restriction to
the right register subclass when needed. A new "minipass" PPCVSXCopy takes care
of this (although it could do a more-optimal job of it; see the comment about
unnecessary copies below).
Please note that, currently, VSX is not enabled by default when targeting
anything because it is not yet ready for that. The assembler and disassembler
are fully implemented and tested. However:
- CodeGen support causes miscompiles; test-suite runtime failures:
MultiSource/Benchmarks/FreeBench/distray/distray
MultiSource/Benchmarks/McCat/08-main/main
MultiSource/Benchmarks/Olden/voronoi/voronoi
MultiSource/Benchmarks/mafft/pairlocalalign
MultiSource/Benchmarks/tramp3d-v4/tramp3d-v4
SingleSource/Benchmarks/CoyoteBench/almabench
SingleSource/Benchmarks/Misc/matmul_f64_4x4
- The lowering currently falls back to using Altivec instructions far more
than it should. Worse, there are some things that are scalarized through the
stack that shouldn't be.
- A lot of unnecessary copies make it past the optimizers, and this needs to
be fixed.
- Many more regression tests are needed.
Normally, I'd fix these things prior to committing, but there are some
students and other contributors who would like to work on this, and so it makes
sense to move this development process upstream where it can be subject to the
regular code-review procedures.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@203768 91177308-0d34-0410-b5e6-96231b3b80d8
2014-03-13 07:58:58 +00:00
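The "restriction to the right register subclass" mentioned above can be
pictured with a single call; this is an illustrative simplification with a
hypothetical helper name, not the actual PPCVSXCopy code:

// Sketch only: the classic FPRs overlap the low half of the VSX register
// file, so a scalar f64 can often be kept in a VSX-capable class for free.
static bool tryKeepInVSXScalarClass(unsigned Reg, MachineRegisterInfo &MRI) {
  // constrainRegClass() returns null if Reg cannot live in VSFRC, in which
  // case a real cross-file copy would be needed.
  return MRI.constrainRegClass(Reg, &PPC::VSFRCRegClass) != nullptr;
}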
|
|
|
|
2014-06-12 22:38:18 +00:00
|
|
|
if (Subtarget.hasVSX()) {
|
2014-03-13 07:58:58 +00:00
|
|
|
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal);
|
2014-03-27 22:22:48 +00:00
|
|
|
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Legal);
|
2014-03-13 07:58:58 +00:00
|
|
|
|
|
|
|
setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal);
|
|
|
|
setOperationAction(ISD::FCEIL, MVT::v2f64, Legal);
|
|
|
|
setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal);
|
|
|
|
setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal);
|
|
|
|
setOperationAction(ISD::FROUND, MVT::v2f64, Legal);
|
|
|
|
|
|
|
|
setOperationAction(ISD::FROUND, MVT::v4f32, Legal);
|
|
|
|
|
|
|
|
setOperationAction(ISD::MUL, MVT::v2f64, Legal);
|
|
|
|
setOperationAction(ISD::FMA, MVT::v2f64, Legal);
|
|
|
|
|
|
|
|
setOperationAction(ISD::FDIV, MVT::v2f64, Legal);
|
|
|
|
setOperationAction(ISD::FSQRT, MVT::v2f64, Legal);
|
|
|
|
|
2014-03-26 12:49:28 +00:00
|
|
|
setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
|
|
|
|
setOperationAction(ISD::VSELECT, MVT::v8i16, Legal);
|
|
|
|
setOperationAction(ISD::VSELECT, MVT::v4i32, Legal);
|
|
|
|
setOperationAction(ISD::VSELECT, MVT::v4f32, Legal);
|
|
|
|
setOperationAction(ISD::VSELECT, MVT::v2f64, Legal);
|
|
|
|
|
2014-03-13 07:58:58 +00:00
|
|
|
// Share the Altivec comparison restrictions.
|
|
|
|
setCondCodeAction(ISD::SETUO, MVT::v2f64, Expand);
|
|
|
|
setCondCodeAction(ISD::SETUEQ, MVT::v2f64, Expand);
|
|
|
|
setCondCodeAction(ISD::SETUGT, MVT::v2f64, Expand);
|
|
|
|
setCondCodeAction(ISD::SETUGE, MVT::v2f64, Expand);
|
|
|
|
setCondCodeAction(ISD::SETULT, MVT::v2f64, Expand);
|
|
|
|
setCondCodeAction(ISD::SETULE, MVT::v2f64, Expand);
|
|
|
|
|
|
|
|
setCondCodeAction(ISD::SETO, MVT::v2f64, Expand);
|
|
|
|
setCondCodeAction(ISD::SETONE, MVT::v2f64, Expand);
|
|
|
|
|
2014-03-26 18:26:30 +00:00
|
|
|
setOperationAction(ISD::LOAD, MVT::v2f64, Legal);
|
|
|
|
setOperationAction(ISD::STORE, MVT::v2f64, Legal);
|
|
|
|
|
2014-03-26 22:58:37 +00:00
|
|
|
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Legal);
|
|
|
|
|
2014-03-29 05:29:01 +00:00
|
|
|
addRegisterClass(MVT::f64, &PPC::VSFRCRegClass);
|
2014-03-13 07:58:58 +00:00
|
|
|
|
|
|
|
addRegisterClass(MVT::v4f32, &PPC::VSRCRegClass);
|
|
|
|
addRegisterClass(MVT::v2f64, &PPC::VSRCRegClass);
|
2014-03-26 16:12:58 +00:00
|
|
|
|
|
|
|
// VSX v2i64 only supports non-arithmetic operations.
|
|
|
|
setOperationAction(ISD::ADD, MVT::v2i64, Expand);
|
|
|
|
setOperationAction(ISD::SUB, MVT::v2i64, Expand);
|
|
|
|
|
2014-03-27 21:26:33 +00:00
|
|
|
setOperationAction(ISD::SHL, MVT::v2i64, Expand);
|
|
|
|
setOperationAction(ISD::SRA, MVT::v2i64, Expand);
|
|
|
|
setOperationAction(ISD::SRL, MVT::v2i64, Expand);
|
|
|
|
|
2014-03-29 16:04:40 +00:00
|
|
|
setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
|
|
|
|
|
2014-03-26 18:26:30 +00:00
|
|
|
setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
|
|
|
|
AddPromotedToType (ISD::LOAD, MVT::v2i64, MVT::v2f64);
|
|
|
|
setOperationAction(ISD::STORE, MVT::v2i64, Promote);
|
|
|
|
AddPromotedToType (ISD::STORE, MVT::v2i64, MVT::v2f64);
|
|
|
|
|
2014-03-26 22:58:37 +00:00
|
|
|
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Legal);
|
|
|
|
|
2014-03-26 19:13:54 +00:00
|
|
|
setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal);
|
|
|
|
setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal);
|
|
|
|
setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal);
|
|
|
|
setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal);
|
|
|
|
|
2014-03-30 13:22:59 +00:00
|
|
|
// Vector operation legalization checks the result type of
|
|
|
|
// SIGN_EXTEND_INREG, while overall legalization checks the inner type.
|
|
|
|
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i64, Legal);
|
|
|
|
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Legal);
|
|
|
|
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
|
|
|
|
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
|
|
|
|
|
2014-03-26 16:12:58 +00:00
|
|
|
addRegisterClass(MVT::v2i64, &PPC::VSRCRegClass);
|
2014-03-13 07:58:58 +00:00
|
|
|
}
|
2005-11-29 08:17:20 +00:00
|
|
|
}
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2014-06-12 22:38:18 +00:00
|
|
|
if (Subtarget.has64BitSupport()) {
|
2012-04-01 20:08:17 +00:00
|
|
|
setOperationAction(ISD::PREFETCH, MVT::Other, Legal);
|
2012-08-04 14:10:46 +00:00
|
|
|
setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
|
|
|
|
}
|
2012-04-01 20:08:17 +00:00
|
|
|
|
2011-08-29 18:23:02 +00:00
|
|
|
setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Expand);
|
|
|
|
setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Expand);
|
2012-12-25 17:22:53 +00:00
|
|
|
setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Expand);
|
|
|
|
setOperationAction(ISD::ATOMIC_STORE, MVT::i64, Expand);
|
2011-08-29 18:23:02 +00:00
|
|
|
|
2008-11-23 15:47:28 +00:00
|
|
|
setBooleanContents(ZeroOrOneBooleanContent);
|
2013-04-23 18:49:44 +00:00
|
|
|
// Altivec instructions set fields to all zeros or all ones.
|
|
|
|
setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2012-07-02 22:39:56 +00:00
|
|
|
if (isPPC64) {
|
2006-10-18 01:20:43 +00:00
|
|
|
setStackPointerRegisterToSaveRestore(PPC::X1);
|
2007-02-22 14:56:36 +00:00
|
|
|
setExceptionPointerRegister(PPC::X3);
|
|
|
|
setExceptionSelectorRegister(PPC::X4);
|
|
|
|
} else {
|
2006-10-18 01:20:43 +00:00
|
|
|
setStackPointerRegisterToSaveRestore(PPC::R1);
|
2007-02-22 14:56:36 +00:00
|
|
|
setExceptionPointerRegister(PPC::R3);
|
|
|
|
setExceptionSelectorRegister(PPC::R4);
|
|
|
|
}
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2006-03-01 04:57:39 +00:00
|
|
|
// We have target-specific dag combine patterns for the following nodes:
|
|
|
|
setTargetDAGCombine(ISD::SINT_TO_FP);
|
2013-05-24 23:00:14 +00:00
|
|
|
setTargetDAGCombine(ISD::LOAD);
|
2006-03-01 05:50:56 +00:00
|
|
|
setTargetDAGCombine(ISD::STORE);
|
Implement an important entry from README_ALTIVEC:
If an Altivec predicate compare is used immediately by a branch, don't
use a (serializing) MFCR instruction to read the CR6 register, which then
requires a compare to get the value back into a CR. Instead, just branch on
CR6 directly. :)
For example, for:
void foo2(vector float *A, vector float *B) {
if (!vec_any_eq(*A, *B))
*B = (vector float){0,0,0,0};
}
We now generate:
_foo2:
mfspr r2, 256
oris r5, r2, 12288
mtspr 256, r5
lvx v2, 0, r4
lvx v3, 0, r3
vcmpeqfp. v2, v3, v2
bne cr6, LBB1_2 ; UnifiedReturnBlock
LBB1_1: ; cond_true
vxor v2, v2, v2
stvx v2, 0, r4
mtspr 256, r2
blr
LBB1_2: ; UnifiedReturnBlock
mtspr 256, r2
blr
instead of:
_foo2:
mfspr r2, 256
oris r5, r2, 12288
mtspr 256, r5
lvx v2, 0, r4
lvx v3, 0, r3
vcmpeqfp. v2, v3, v2
mfcr r3, 2
rlwinm r3, r3, 27, 31, 31
cmpwi cr0, r3, 0
beq cr0, LBB1_2 ; UnifiedReturnBlock
LBB1_1: ; cond_true
vxor v2, v2, v2
stvx v2, 0, r4
mtspr 256, r2
blr
LBB1_2: ; UnifiedReturnBlock
mtspr 256, r2
blr
This implements CodeGen/PowerPC/vec_br_cmp.ll.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@27804 91177308-0d34-0410-b5e6-96231b3b80d8
2006-04-18 17:59:36 +00:00
|
|
|
setTargetDAGCombine(ISD::BR_CC);
|
2014-06-12 22:38:18 +00:00
|
|
|
if (Subtarget.useCRBits())
|
2014-02-28 00:27:01 +00:00
|
|
|
setTargetDAGCombine(ISD::BRCOND);
|
2006-07-10 20:56:58 +00:00
|
|
|
setTargetDAGCombine(ISD::BSWAP);
|
2013-05-25 04:05:05 +00:00
|
|
|
setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
|
2009-02-17 22:15:04 +00:00
|
|
|
|
Remove extra truncs/exts around i32 bit operations on PPC64
This generalizes the code to eliminate extra truncs/exts around i1 bit
operations to also do the same on PPC64 for i32 bit operations. This eliminates
a fairly prevalent code wart:
int foo(int a) {
return a == 5 ? 7 : 8;
}
On PPC64, because of the extension implied by the ABI, this would generate:
cmplwi 0, 3, 5
li 12, 8
li 4, 7
isel 3, 4, 12, 2
rldicl 3, 3, 0, 32
blr
where the 'rldicl 3, 3, 0, 32', the extension, is completely unnecessary. At
least for the single-BB case (which is all that the DAG combine mechanism can
handle), this unnecessary extension is no longer generated.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@202600 91177308-0d34-0410-b5e6-96231b3b80d8
2014-03-01 21:36:57 +00:00
|
|
|
setTargetDAGCombine(ISD::SIGN_EXTEND);
|
|
|
|
setTargetDAGCombine(ISD::ZERO_EXTEND);
|
|
|
|
setTargetDAGCombine(ISD::ANY_EXTEND);
|
|
|
|
|
2014-06-12 22:38:18 +00:00
|
|
|
if (Subtarget.useCRBits()) {
|
2014-02-28 00:27:01 +00:00
|
|
|
setTargetDAGCombine(ISD::TRUNCATE);
|
|
|
|
setTargetDAGCombine(ISD::SETCC);
|
|
|
|
setTargetDAGCombine(ISD::SELECT_CC);
|
|
|
|
}
|
|
|
|
|
2013-04-03 04:01:11 +00:00
|
|
|
// Use reciprocal estimates.
|
|
|
|
if (TM.Options.UnsafeFPMath) {
|
|
|
|
setTargetDAGCombine(ISD::FDIV);
|
|
|
|
setTargetDAGCombine(ISD::FSQRT);
|
|
|
|
}
|
|
|
|
|
2007-10-19 00:59:18 +00:00
|
|
|
// Darwin long double math library functions have $LDBL128 appended.
|
2014-06-12 22:38:18 +00:00
|
|
|
if (Subtarget.isDarwin()) {
|
2008-01-10 10:28:30 +00:00
|
|
|
setLibcallName(RTLIB::COS_PPCF128, "cosl$LDBL128");
|
2007-10-19 00:59:18 +00:00
|
|
|
setLibcallName(RTLIB::POW_PPCF128, "powl$LDBL128");
|
|
|
|
setLibcallName(RTLIB::REM_PPCF128, "fmodl$LDBL128");
|
2008-01-10 10:28:30 +00:00
|
|
|
setLibcallName(RTLIB::SIN_PPCF128, "sinl$LDBL128");
|
|
|
|
setLibcallName(RTLIB::SQRT_PPCF128, "sqrtl$LDBL128");
|
2008-09-04 00:47:13 +00:00
|
|
|
setLibcallName(RTLIB::LOG_PPCF128, "logl$LDBL128");
|
|
|
|
setLibcallName(RTLIB::LOG2_PPCF128, "log2l$LDBL128");
|
|
|
|
setLibcallName(RTLIB::LOG10_PPCF128, "log10l$LDBL128");
|
|
|
|
setLibcallName(RTLIB::EXP_PPCF128, "expl$LDBL128");
|
|
|
|
setLibcallName(RTLIB::EXP2_PPCF128, "exp2l$LDBL128");
|
2007-10-19 00:59:18 +00:00
|
|
|
}
|
|
|
|
|
2014-02-28 00:27:01 +00:00
|
|
|
// With 32 condition bits, we don't need to sink (and duplicate) compares
|
|
|
|
// aggressively in CodeGenPrep.
|
2014-06-12 22:38:18 +00:00
|
|
|
if (Subtarget.useCRBits())
|
2014-02-28 00:27:01 +00:00
|
|
|
setHasMultipleConditionRegisters();
|
|
|
|
|
2011-10-17 18:53:03 +00:00
|
|
|
setMinFunctionAlignment(2);
|
2014-06-12 22:38:18 +00:00
|
|
|
if (Subtarget.isDarwin())
|
2011-10-17 18:53:03 +00:00
|
|
|
setPrefFunctionAlignment(4);
|
2011-05-06 20:34:06 +00:00
|
|
|
|
2014-06-12 22:38:18 +00:00
|
|
|
if (isPPC64 && Subtarget.isJITCodeModel())
|
2012-07-02 22:39:56 +00:00
|
|
|
// Temporary workaround for the inability of PPC64 JIT to handle jump
|
|
|
|
// tables.
|
|
|
|
setSupportJumpTables(false);
|
|
|
|
|
2011-08-03 21:06:02 +00:00
|
|
|
setInsertFencesForAtomic(true);
|
|
|
|
|
2014-06-12 22:38:18 +00:00
|
|
|
if (Subtarget.enableMachineScheduler())
|
2013-09-11 23:05:25 +00:00
|
|
|
setSchedulingPreference(Sched::Source);
|
|
|
|
else
|
|
|
|
setSchedulingPreference(Sched::Hybrid);
|
2011-11-22 16:21:04 +00:00
|
|
|
|
2005-08-16 17:14:42 +00:00
|
|
|
computeRegisterProperties();
|
2012-08-28 16:12:39 +00:00
|
|
|
|
|
|
|
// The Freescale cores do better with aggressive inlining of memcpy and
|
|
|
|
// friends. GCC uses the same threshold of 128 bytes (= 32 word stores).
|
2014-06-12 22:38:18 +00:00
|
|
|
if (Subtarget.getDarwinDirective() == PPC::DIR_E500mc ||
|
|
|
|
Subtarget.getDarwinDirective() == PPC::DIR_E5500) {
|
2013-02-20 21:13:59 +00:00
|
|
|
MaxStoresPerMemset = 32;
|
|
|
|
MaxStoresPerMemsetOptSize = 16;
|
|
|
|
MaxStoresPerMemcpy = 32;
|
|
|
|
MaxStoresPerMemcpyOptSize = 8;
|
|
|
|
MaxStoresPerMemmove = 32;
|
|
|
|
MaxStoresPerMemmoveOptSize = 8;
|
2012-08-28 16:12:39 +00:00
|
|
|
|
|
|
|
setPrefFunctionAlignment(4);
|
|
|
|
}
|
2005-08-16 17:14:42 +00:00
|
|
|
}
|
|
|
|
|
2013-09-12 23:20:06 +00:00
|
|
|
/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
|
|
|
|
/// the desired ByVal argument alignment.
|
|
|
|
static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign,
|
|
|
|
unsigned MaxMaxAlign) {
|
|
|
|
if (MaxAlign == MaxMaxAlign)
|
|
|
|
return;
|
|
|
|
if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
|
|
|
|
if (MaxMaxAlign >= 32 && VTy->getBitWidth() >= 256)
|
|
|
|
MaxAlign = 32;
|
|
|
|
else if (VTy->getBitWidth() >= 128 && MaxAlign < 16)
|
|
|
|
MaxAlign = 16;
|
|
|
|
} else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
|
|
|
|
unsigned EltAlign = 0;
|
|
|
|
getMaxByValAlign(ATy->getElementType(), EltAlign, MaxMaxAlign);
|
|
|
|
if (EltAlign > MaxAlign)
|
|
|
|
MaxAlign = EltAlign;
|
|
|
|
} else if (StructType *STy = dyn_cast<StructType>(Ty)) {
|
|
|
|
for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
|
|
|
|
unsigned EltAlign = 0;
|
|
|
|
getMaxByValAlign(STy->getElementType(i), EltAlign, MaxMaxAlign);
|
|
|
|
if (EltAlign > MaxAlign)
|
|
|
|
MaxAlign = EltAlign;
|
|
|
|
if (MaxAlign == MaxMaxAlign)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2008-02-28 22:31:51 +00:00
|
|
|
/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
|
|
|
|
/// function arguments in the caller parameter area.
|
2011-07-18 04:54:35 +00:00
|
|
|
unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty) const {
|
2008-02-28 22:31:51 +00:00
|
|
|
// Darwin passes everything on 4 byte boundary.
|
2014-06-12 22:38:18 +00:00
|
|
|
if (Subtarget.isDarwin())
|
2008-02-28 22:31:51 +00:00
|
|
|
return 4;
|
2012-04-02 15:49:30 +00:00
|
|
|
|
|
|
|
// 16-byte and wider vectors are passed on a 16-byte boundary.
|
|
|
|
// The rest are passed on an 8-byte (PPC64) or 4-byte (PPC32) boundary.
|
2014-06-12 22:38:18 +00:00
|
|
|
unsigned Align = Subtarget.isPPC64() ? 8 : 4;
|
|
|
|
if (Subtarget.hasAltivec() || Subtarget.hasQPX())
|
|
|
|
getMaxByValAlign(Ty, Align, Subtarget.hasQPX() ? 32 : 16);
|
2013-09-12 23:20:06 +00:00
|
|
|
return Align;
|
2008-02-28 22:31:51 +00:00
|
|
|
}
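// A few worked results of the policy implemented above (derived from the
// code, for illustration):
//   Darwin, any type                      -> 4
//   PPC64 ELF, i32 or double              -> 8
//   PPC32 ELF, i32                        -> 4
//   Altivec, aggregate holding <4 x f32>  -> 16
//   QPX, aggregate holding a 256-bit vec  -> 32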
|
|
|
|
|
2006-01-09 23:52:17 +00:00
|
|
|
const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
|
|
|
|
switch (Opcode) {
|
2014-04-25 05:30:21 +00:00
|
|
|
default: return nullptr;
|
2008-07-12 02:23:19 +00:00
|
|
|
case PPCISD::FSEL: return "PPCISD::FSEL";
|
|
|
|
case PPCISD::FCFID: return "PPCISD::FCFID";
|
|
|
|
case PPCISD::FCTIDZ: return "PPCISD::FCTIDZ";
|
|
|
|
case PPCISD::FCTIWZ: return "PPCISD::FCTIWZ";
|
2013-04-03 04:01:11 +00:00
|
|
|
case PPCISD::FRE: return "PPCISD::FRE";
|
|
|
|
case PPCISD::FRSQRTE: return "PPCISD::FRSQRTE";
|
2008-07-12 02:23:19 +00:00
|
|
|
case PPCISD::STFIWX: return "PPCISD::STFIWX";
|
|
|
|
case PPCISD::VMADDFP: return "PPCISD::VMADDFP";
|
|
|
|
case PPCISD::VNMSUBFP: return "PPCISD::VNMSUBFP";
|
|
|
|
case PPCISD::VPERM: return "PPCISD::VPERM";
|
|
|
|
case PPCISD::Hi: return "PPCISD::Hi";
|
|
|
|
case PPCISD::Lo: return "PPCISD::Lo";
|
2009-08-15 11:54:46 +00:00
|
|
|
case PPCISD::TOC_ENTRY: return "PPCISD::TOC_ENTRY";
|
2009-12-18 13:00:15 +00:00
|
|
|
case PPCISD::LOAD: return "PPCISD::LOAD";
|
|
|
|
case PPCISD::LOAD_TOC: return "PPCISD::LOAD_TOC";
|
2008-07-12 02:23:19 +00:00
|
|
|
case PPCISD::DYNALLOC: return "PPCISD::DYNALLOC";
|
|
|
|
case PPCISD::GlobalBaseReg: return "PPCISD::GlobalBaseReg";
|
|
|
|
case PPCISD::SRL: return "PPCISD::SRL";
|
|
|
|
case PPCISD::SRA: return "PPCISD::SRA";
|
|
|
|
case PPCISD::SHL: return "PPCISD::SHL";
|
2013-03-22 15:24:13 +00:00
|
|
|
case PPCISD::CALL: return "PPCISD::CALL";
|
|
|
|
case PPCISD::CALL_NOP: return "PPCISD::CALL_NOP";
|
2008-07-12 02:23:19 +00:00
|
|
|
case PPCISD::MTCTR: return "PPCISD::MTCTR";
|
2013-03-22 15:24:13 +00:00
|
|
|
case PPCISD::BCTRL: return "PPCISD::BCTRL";
|
2008-07-12 02:23:19 +00:00
|
|
|
case PPCISD::RET_FLAG: return "PPCISD::RET_FLAG";
|
2013-03-21 21:37:52 +00:00
|
|
|
case PPCISD::EH_SJLJ_SETJMP: return "PPCISD::EH_SJLJ_SETJMP";
|
|
|
|
case PPCISD::EH_SJLJ_LONGJMP: return "PPCISD::EH_SJLJ_LONGJMP";
|
[PowerPC] Always use mfocrf if available
When accessing just a single CR register, it is always preferable to
use mfocrf instead of mfcr, if the former is available on the CPU.
Current code makes that distinction in many, but not all places
where a single CR register value is retrieved. One missing
location is PPCRegisterInfo::lowerCRSpilling.
To fix this and make this simpler in the future, this patch changes
the bulk of the back-end to always assume mfocrf is available and
simply generate it when needed.
On machines that actually do not support mfocrf, the instruction
is replaced by mfcr at the very end, in EmitInstruction.
This has the additional benefit that we no longer need the
MFCRpseud hack, since before EmitInstruction we always have
a MFOCRF instruction pattern, which already models data flow
as required.
The patch also adds the MFOCRF8 version of the instruction,
which was missing so far.
Except for the PPCRegisterInfo::lowerCRSpilling case, no change
in generated code intended.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@185556 91177308-0d34-0410-b5e6-96231b3b80d8
2013-07-03 17:05:42 +00:00
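The late replacement the message describes amounts to a small rewrite at
emission time; a minimal sketch, assuming an MCInst-level rewrite with a
hypothetical helper (the in-tree version lives in the asm printer's
EmitInstruction):

// Sketch only: degrade mfocrf to a plain mfcr on cores without mfocrf.
static MCInst lowerMFOCRF(const MCInst &Inst, bool HasMFOCRF) {
  if (Inst.getOpcode() != PPC::MFOCRF || HasMFOCRF)
    return Inst;
  MCInst New;
  New.setOpcode(PPC::MFCR);
  New.addOperand(Inst.getOperand(0)); // keep the GPR; drop the CR-field operand
  return New;
}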
|
|
|
case PPCISD::MFOCRF: return "PPCISD::MFOCRF";
|
2008-07-12 02:23:19 +00:00
|
|
|
case PPCISD::VCMP: return "PPCISD::VCMP";
|
|
|
|
case PPCISD::VCMPo: return "PPCISD::VCMPo";
|
|
|
|
case PPCISD::LBRX: return "PPCISD::LBRX";
|
|
|
|
case PPCISD::STBRX: return "PPCISD::STBRX";
|
|
|
|
case PPCISD::LARX: return "PPCISD::LARX";
|
|
|
|
case PPCISD::STCX: return "PPCISD::STCX";
|
|
|
|
case PPCISD::COND_BRANCH: return "PPCISD::COND_BRANCH";
|
Implement PPC counter loops as a late IR-level pass
The old PPCCTRLoops pass, like the Hexagon pass version from which it was
derived, could only handle some simple loops in canonical form. We cannot
directly adapt the new Hexagon hardware loops pass, however, because the
Hexagon pass contains a fundamental assumption that non-constant-trip-count
loops will contain a guard, and this is not always true (the result being that
incorrect negative counts can be generated). With this commit, we replace the
pass with a late IR-level pass which makes use of SE to calculate the
backedge-taken counts and safely generate the loop-count expressions (including
any necessary max() parts). This IR level pass inserts custom intrinsics that
are lowered into the desired decrement-and-branch instructions.
The most fragile part of this new implementation is that interfering uses of
the counter register must be detected on the IR level (and, on PPC, this also
includes any indirect branches in addition to function calls). Also, to make
all of this work, we need a variant of the mtctr instruction that is marked
as having side effects. Without this, machine-code level CSE, DCE, etc.
illegally transform the resulting code. Hopefully, this can be improved
in the future.
This new pass is smaller than the original (and much smaller than the new
Hexagon hardware loops pass), and can handle many additional cases correctly.
In addition, the preheader-creation code has been copied from LoopSimplify, and
after we decide on where it belongs, this code will be refactored so that it
can be explicitly shared (making this implementation even smaller).
The new test-case files ctrloop-{le,lt,ne}.ll have been adapted from tests for
the new Hexagon pass. There are a few classes of loops that this pass does not
transform (noted by FIXMEs in the files), but these deficiencies can be
addressed within the SE infrastructure (thus helping many other passes as well).
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@181927 91177308-0d34-0410-b5e6-96231b3b80d8
2013-05-15 21:37:41 +00:00
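The inserted intrinsics can be pictured as follows; a sketch of the resulting
IR shape, written here as comments (intrinsic names per the pass; the exact
signatures and operands are illustrative):

// Sketch of the IR shape after the counter-loops pass runs:
//   preheader:
//     call void @llvm.ppc.mtctr.i64(i64 %backedge.count)
//   loop.latch:
//     %ctr.nz = call i1 @llvm.ppc.is.decremented.ctr.nonzero()
//     br i1 %ctr.nz, label %loop.body, label %exit
// The two calls are then matched to mtctr and bdnz during instruction
// selection.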
|
|
|
case PPCISD::BDNZ: return "PPCISD::BDNZ";
|
|
|
|
case PPCISD::BDZ: return "PPCISD::BDZ";
|
2008-07-12 02:23:19 +00:00
|
|
|
case PPCISD::MFFS: return "PPCISD::MFFS";
|
|
|
|
case PPCISD::FADDRTZ: return "PPCISD::FADDRTZ";
|
|
|
|
case PPCISD::TC_RETURN: return "PPCISD::TC_RETURN";
|
2012-08-28 02:10:27 +00:00
|
|
|
case PPCISD::CR6SET: return "PPCISD::CR6SET";
|
|
|
|
case PPCISD::CR6UNSET: return "PPCISD::CR6UNSET";
|
This patch implements medium code model support for 64-bit PowerPC.
The default for 64-bit PowerPC is small code model, in which TOC entries
must be addressable using a 16-bit offset from the TOC pointer. Additionally,
only TOC entries are addressed via the TOC pointer.
With medium code model, TOC entries and data sections can all be addressed
via the TOC pointer using a 32-bit offset. Cooperation with the linker
allows 16-bit offsets to be used when these are sufficient, reducing the
number of extra instructions that need to be executed. Medium code model
also does not generate explicit TOC entries in ".section toc" for variables
that are wholly internal to the compilation unit.
Consider a load of an external 4-byte integer. With small code model, the
compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
With medium model, it instead generates:
addis 3, 2, .LC1@toc@ha
ld 3, .LC1@toc@l(3)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc ei[TC],ei
Here .LC1@toc@ha is a relocation requesting the upper 16 bits of the
32-bit offset of ei's TOC entry from the TOC base pointer. Similarly,
.LC1@toc@l is a relocation requesting the lower 16 bits. Note that if
the linker determines that ei's TOC entry is within a 16-bit offset of
the TOC base pointer, it will replace the "addis" with a "nop", and
replace the "ld" with the identical "ld" instruction from the small
code model example.
Consider next a load of a function-scope static integer. For small code
model, the compiler generates:
ld 3, .LC1@toc(2)
lwz 4, 0(3)
.section .toc,"aw",@progbits
.LC1:
.tc test_fn_static.si[TC],test_fn_static.si
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
For medium code model, the compiler generates:
addis 3, 2, test_fn_static.si@toc@ha
addi 3, 3, test_fn_static.si@toc@l
lwz 4, 0(3)
.type test_fn_static.si,@object
.local test_fn_static.si
.comm test_fn_static.si,4,4
Again, the linker may replace the "addis" with a "nop", calculating only
a 16-bit offset when this is sufficient.
Note that it would be more efficient for the compiler to generate:
addis 3, 2, test_fn_static.si@toc@ha
lwz 4, test_fn_static.si@toc@l(3)
The current patch does not perform this optimization yet. This will be
addressed as a peephole optimization in a later patch.
For the moment, the default code model for 64-bit PowerPC will remain the
small code model. We plan to eventually change the default to medium code
model, which matches current upstream GCC behavior. Note that the different
code models are ABI-compatible, so code compiled with different models will
be linked and execute correctly.
I've tested the regression suite and the application/benchmark test suite in
two ways: Once with the patch as submitted here, and once with additional
logic to force medium code model as the default. The tests all compile
cleanly, with one exception. The mandel-2 application test fails due to an
unrelated ABI incompatibility in passing complex numbers. It just so happens
that small code model was incredibly lucky, in that temporary values in
floating-point registers held the expected values needed by the external
library routine that was called incorrectly. My current thought is to correct
the ABI problems with _Complex before making medium code model the default,
to avoid introducing this "regression."
Here are a few comments on how the patch works, since the selection code
can be difficult to follow:
The existing logic for small code model defines three pseudo-instructions:
LDtoc for most uses, LDtocJTI for jump table addresses, and LDtocCPT for
constant pool addresses. These are expanded by SelectCodeCommon(). The
pseudo-instruction approach doesn't work for medium code model, because
we need to generate two instructions when we match the same pattern.
Instead, new logic in PPCDAGToDAGISel::Select() intercepts the TOC_ENTRY
node for medium code model, and generates an ADDIStocHA followed by either
a LDtocL or an ADDItocL. These new node types correspond naturally to
the sequences described above.
The addis/ld sequence is generated for the following cases:
* Jump table addresses
* Function addresses
* External global variables
* Tentative definitions of global variables (common linkage)
The addis/addi sequence is generated for the following cases:
* Constant pool entries
* File-scope static global variables
* Function-scope static variables
Expanding to the two-instruction sequences at select time exposes the
instructions to subsequent optimization, particularly scheduling.
The rest of the processing occurs at assembly time, in
PPCAsmPrinter::EmitInstruction. Each of the instructions is converted to
a "real" PowerPC instruction. When a TOC entry needs to be created, this
is done here in the same manner as for the existing LDtoc, LDtocJTI, and
LDtocCPT pseudo-instructions (I factored out a new routine to handle this).
I had originally thought that if a TOC entry was needed for LDtocL or
ADDItocL, it would already have been generated for the previous ADDIStocHA.
However, at higher optimization levels, the ADDIStocHA may appear in a
different block, which may be assembled textually following the block
containing the LDtocL or ADDItocL. So it is necessary to include the
possibility of creating a new TOC entry for those two instructions.
Note that for LDtocL, we generate a new form of LD called LDrs. This
allows specifying the @toc@l relocation for the offset field of the LD
instruction (i.e., the offset is replaced by a SymbolLo relocation).
When the peephole optimization described above is added, we will need
to do similar things for all immediate-form load and store operations.
The seven "mcm-n.ll" test cases are kept separate because otherwise the
intermingling of various TOC entries and so forth makes the tests fragile
and hard to understand.
The above assumes use of an external assembler. For use of the
integrated assembler, new relocations are added and used by
PPCELFObjectWriter. Testing is done with "mcm-obj.ll", which tests for
proper generation of the various relocations for the same sequences
tested with the external assembler.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@168708 91177308-0d34-0410-b5e6-96231b3b80d8
2012-11-27 17:35:46 +00:00
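A minimal sketch of the select-time interception described above, as a
hypothetical free function mirroring the logic (the in-tree code sits in
PPCDAGToDAGISel::Select and also chooses ADDItocL for directly addressed
symbols):

// Sketch only: expand a medium-code-model TOC_ENTRY into addis+ld at select
// time so both instructions are visible to the scheduler.
static SDNode *selectMediumModelTOCEntry(SDNode *N, SelectionDAG *CurDAG,
                                         SDLoc dl) {
  SDValue GA = N->getOperand(0);      // the TOC symbol operand
  SDValue TOCbase = N->getOperand(1); // the TOC pointer (X2)
  SDNode *Hi =
      CurDAG->getMachineNode(PPC::ADDIStocHA, dl, MVT::i64, TOCbase, GA);
  // TOC-indirect symbols finish with ld; directly addressed symbols would
  // use ADDItocL instead.
  return CurDAG->getMachineNode(PPC::LDtocL, dl, MVT::i64, GA,
                                SDValue(Hi, 0));
}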
|
|
|
case PPCISD::ADDIS_TOC_HA: return "PPCISD::ADDIS_TOC_HA";
|
|
|
|
case PPCISD::LD_TOC_L: return "PPCISD::LD_TOC_L";
|
|
|
|
case PPCISD::ADDI_TOC_L: return "PPCISD::ADDI_TOC_L";
|
2013-12-20 18:08:54 +00:00
|
|
|
case PPCISD::PPC32_GOT: return "PPCISD::PPC32_GOT";
|
This patch improves the 64-bit PowerPC InitialExec TLS support by providing
for a wider range of GOT entries that can hold thread-relative offsets.
This matches the behavior of GCC, which was not documented in the PPC64 TLS
ABI. The ABI will be updated with the new code sequence.
Former sequence:
ld 9,x@got@tprel(2)
add 9,9,x@tls
New sequence:
addis 9,2,x@got@tprel@ha
ld 9,x@got@tprel@l(9)
add 9,9,x@tls
Note that a linker optimization exists to transform the new sequence into
the shorter sequence when appropriate, by replacing the addis with a nop
and modifying the base register and relocation type of the ld.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@170209 91177308-0d34-0410-b5e6-96231b3b80d8
2012-12-14 17:02:38 +00:00
|
|
|
case PPCISD::ADDIS_GOT_TPREL_HA: return "PPCISD::ADDIS_GOT_TPREL_HA";
|
|
|
|
case PPCISD::LD_GOT_TPREL_L: return "PPCISD::LD_GOT_TPREL_L";
|
2012-12-04 16:18:08 +00:00
|
|
|
case PPCISD::ADD_TLS: return "PPCISD::ADD_TLS";
|
This patch implements the general dynamic TLS model for 64-bit PowerPC.
Given a thread-local symbol x with global-dynamic access, the generated
code to obtain x's address is:
Instruction                  Relocation              Symbol
addis ra,r2,x@got@tlsgd@ha   R_PPC64_GOT_TLSGD16_HA  x
addi r3,ra,x@got@tlsgd@l     R_PPC64_GOT_TLSGD16_L   x
bl __tls_get_addr(x@tlsgd)   R_PPC64_TLSGD           x
                             R_PPC64_REL24           __tls_get_addr
nop
<use address in r3>
The implementation borrows from the medium code model work for introducing
special forms of ADDIS and ADDI into the DAG representation. This is made
slightly more complicated by having to introduce a call to the external
function __tls_get_addr. Using the full call machinery is overkill and,
more importantly, makes it difficult to add a special relocation. So I've
introduced another opcode GET_TLS_ADDR to represent the function call, and
surrounded it with register copies to set up the parameter and return value.
Most of the code is pretty straightforward. I ran into one peculiarity
when I introduced a new PPC opcode BL8_NOP_ELF_TLSGD, which is just like
BL8_NOP_ELF except that it takes another parameter to represent the symbol
("x" above) that requires a relocation on the call. Something in the
TblGen machinery causes BL8_NOP_ELF and BL8_NOP_ELF_TLSGD to be treated
identically during the emit phase, so this second operand was never
visited to generate relocations. This is the reason for the slightly
messy workaround in PPCMCCodeEmitter.cpp:getDirectBrEncoding().
Two new tests are included to demonstrate correct external assembly and
correct generation of relocations using the integrated assembler.
Comments welcome!
Thanks,
Bill
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169910 91177308-0d34-0410-b5e6-96231b3b80d8
2012-12-11 20:30:11 +00:00
|
|
|
case PPCISD::ADDIS_TLSGD_HA: return "PPCISD::ADDIS_TLSGD_HA";
|
|
|
|
case PPCISD::ADDI_TLSGD_L: return "PPCISD::ADDI_TLSGD_L";
|
|
|
|
case PPCISD::GET_TLS_ADDR: return "PPCISD::GET_TLS_ADDR";
|
2012-12-12 19:29:35 +00:00
|
|
|
case PPCISD::ADDIS_TLSLD_HA: return "PPCISD::ADDIS_TLSLD_HA";
|
|
|
|
case PPCISD::ADDI_TLSLD_L: return "PPCISD::ADDI_TLSLD_L";
|
|
|
|
case PPCISD::GET_TLSLD_ADDR: return "PPCISD::GET_TLSLD_ADDR";
|
|
|
|
case PPCISD::ADDIS_DTPREL_HA: return "PPCISD::ADDIS_DTPREL_HA";
|
|
|
|
case PPCISD::ADDI_DTPREL_L: return "PPCISD::ADDI_DTPREL_L";
|
2013-02-20 15:50:31 +00:00
|
|
|
case PPCISD::VADD_SPLAT: return "PPCISD::VADD_SPLAT";
|
2013-05-14 19:35:45 +00:00
|
|
|
case PPCISD::SC: return "PPCISD::SC";
|
2006-01-09 23:52:17 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2013-05-18 00:21:46 +00:00
|
|
|
EVT PPCTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
|
2012-10-08 18:59:53 +00:00
|
|
|
if (!VT.isVector())
|
2014-06-12 22:38:18 +00:00
|
|
|
return Subtarget.useCRBits() ? MVT::i1 : MVT::i32;
|
2012-10-08 18:59:53 +00:00
|
|
|
return VT.changeVectorElementTypeToInteger();
|
2008-03-10 15:42:14 +00:00
|
|
|
}
|
|
|
|
|
2006-04-14 06:01:58 +00:00
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
// Node matching predicates, for use by the tblgen matching code.
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
2005-08-26 17:36:52 +00:00
|
|
|
/// isFloatingPointZero - Return true if this is 0.0 or -0.0.
|
2008-07-27 21:46:04 +00:00
|
|
|
static bool isFloatingPointZero(SDValue Op) {
|
2005-08-26 17:36:52 +00:00
|
|
|
if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))
|
2007-08-31 04:03:46 +00:00
|
|
|
return CFP->getValueAPF().isZero();
|
2008-08-28 21:40:38 +00:00
|
|
|
else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
|
2005-08-26 17:36:52 +00:00
|
|
|
// Maybe this has already been legalized into the constant pool?
|
|
|
|
if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(Op.getOperand(1)))
|
2010-04-15 01:51:59 +00:00
|
|
|
if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
|
2007-08-31 04:03:46 +00:00
|
|
|
return CFP->getValueAPF().isZero();
|
2005-08-26 17:36:52 +00:00
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2006-04-06 17:23:16 +00:00
|
|
|
/// isConstantOrUndef - Op is either an undef node or a ConstantSDNode. Return
|
|
|
|
/// true if Op is undef or if it matches the specified value.
|
2009-04-27 18:41:29 +00:00
|
|
|
static bool isConstantOrUndef(int Op, int Val) {
|
|
|
|
return Op < 0 || Op == Val;
|
2006-04-06 17:23:16 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a
|
|
|
|
/// VPKUHUM instruction.
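///
/// Illustrative sketch (not from the original source): with the two v16i8
/// inputs concatenated into a 32-byte value, the binary big-endian form
/// accepted below is the mask
///   <1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31>
/// (the low-order byte of every halfword); on little-endian targets the
/// low-order byte comes first, giving <0, 2, 4, ..., 28, 30>.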
|
2014-06-10 14:35:01 +00:00
|
|
|
bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary,
|
|
|
|
SelectionDAG &DAG) {
|
|
|
|
unsigned j = DAG.getTarget().getDataLayout()->isLittleEndian() ? 0 : 1;
|
2006-04-06 22:28:36 +00:00
|
|
|
if (!isUnary) {
|
|
|
|
for (unsigned i = 0; i != 16; ++i)
|
2014-06-10 14:35:01 +00:00
|
|
|
if (!isConstantOrUndef(N->getMaskElt(i), i*2+j))
|
2006-04-06 22:28:36 +00:00
|
|
|
return false;
|
|
|
|
} else {
|
|
|
|
for (unsigned i = 0; i != 8; ++i)
|
2014-06-10 14:35:01 +00:00
|
|
|
if (!isConstantOrUndef(N->getMaskElt(i), i*2+j) ||
|
|
|
|
!isConstantOrUndef(N->getMaskElt(i+8), i*2+j))
|
2006-04-06 22:28:36 +00:00
|
|
|
return false;
|
|
|
|
}
|
2006-04-06 18:26:28 +00:00
|
|
|
return true;
|
2006-04-06 17:23:16 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/// isVPKUWUMShuffleMask - Return true if this is the shuffle mask for a
|
|
|
|
/// VPKUWUM instruction.
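///
/// Illustrative sketch (not from the original source): in the binary
/// big-endian form accepted below, each halfword of the result is the
/// low-order halfword of one word of the concatenated inputs, i.e. the mask
///   <2,3, 6,7, 10,11, 14,15, 18,19, 22,23, 26,27, 30,31>;
/// little-endian targets select bytes 0-1 of each word instead.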
|
2014-06-10 14:35:01 +00:00
|
|
|
bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary,
|
|
|
|
SelectionDAG &DAG) {
|
|
|
|
unsigned j, k;
|
|
|
|
if (DAG.getTarget().getDataLayout()->isLittleEndian()) {
|
|
|
|
j = 0;
|
|
|
|
k = 1;
|
|
|
|
} else {
|
|
|
|
j = 2;
|
|
|
|
k = 3;
|
|
|
|
}
|
2006-04-06 22:28:36 +00:00
|
|
|
if (!isUnary) {
|
|
|
|
for (unsigned i = 0; i != 16; i += 2)
|
2014-06-10 14:35:01 +00:00
|
|
|
if (!isConstantOrUndef(N->getMaskElt(i ), i*2+j) ||
|
|
|
|
!isConstantOrUndef(N->getMaskElt(i+1), i*2+k))
|
2006-04-06 22:28:36 +00:00
|
|
|
return false;
|
|
|
|
} else {
|
|
|
|
for (unsigned i = 0; i != 8; i += 2)
|
2014-06-10 14:35:01 +00:00
|
|
|
if (!isConstantOrUndef(N->getMaskElt(i ), i*2+j) ||
|
|
|
|
!isConstantOrUndef(N->getMaskElt(i+1), i*2+k) ||
|
|
|
|
!isConstantOrUndef(N->getMaskElt(i+8), i*2+j) ||
|
|
|
|
!isConstantOrUndef(N->getMaskElt(i+9), i*2+k))
|
2006-04-06 22:28:36 +00:00
|
|
|
return false;
|
|
|
|
}
|
2006-04-06 18:26:28 +00:00
|
|
|
return true;
|
2006-04-06 17:23:16 +00:00
|
|
|
}
|
|
|
|
|
2006-04-06 22:02:42 +00:00
|
|
|
/// isVMerge - Common function, used to match vmrg* shuffles.
|
|
|
|
///
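/// Illustrative sketch (not from the original source): for UnitSize == 1,
/// LHSStart == 8 and RHSStart == 24 (the big-endian VMRGLB pattern), the
/// mask matched below interleaves one unit from each input at a time:
///   <8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>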
|
2009-04-27 18:41:29 +00:00
|
|
|
static bool isVMerge(ShuffleVectorSDNode *N, unsigned UnitSize,
|
2006-04-06 22:02:42 +00:00
|
|
|
unsigned LHSStart, unsigned RHSStart) {
|
2014-03-26 22:58:37 +00:00
|
|
|
if (N->getValueType(0) != MVT::v16i8)
|
|
|
|
return false;
|
2006-04-06 21:11:54 +00:00
|
|
|
assert((UnitSize == 1 || UnitSize == 2 || UnitSize == 4) &&
|
|
|
|
"Unsupported merge size!");
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2006-04-06 21:11:54 +00:00
|
|
|
for (unsigned i = 0; i != 8/UnitSize; ++i) // Step over units
|
|
|
|
for (unsigned j = 0; j != UnitSize; ++j) { // Step over bytes within unit
|
2009-04-27 18:41:29 +00:00
|
|
|
if (!isConstantOrUndef(N->getMaskElt(i*UnitSize*2+j),
|
2006-04-06 22:02:42 +00:00
|
|
|
LHSStart+j+i*UnitSize) ||
|
2009-04-27 18:41:29 +00:00
|
|
|
!isConstantOrUndef(N->getMaskElt(i*UnitSize*2+UnitSize+j),
|
2006-04-06 22:02:42 +00:00
|
|
|
RHSStart+j+i*UnitSize))
|
2006-04-06 21:11:54 +00:00
|
|
|
return false;
|
|
|
|
}
|
2009-04-27 18:41:29 +00:00
|
|
|
return true;
|
2006-04-06 22:02:42 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for
|
2014-06-10 14:35:01 +00:00
|
|
|
/// a VMRGL* instruction with the specified unit size (1,2 or 4 bytes).
|
2010-11-23 03:31:01 +00:00
|
|
|
bool PPC::isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
|
2014-06-10 14:35:01 +00:00
|
|
|
bool isUnary, SelectionDAG &DAG) {
|
|
|
|
if (DAG.getTarget().getDataLayout()->isLittleEndian()) {
|
|
|
|
if (!isUnary)
|
|
|
|
return isVMerge(N, UnitSize, 0, 16);
|
|
|
|
return isVMerge(N, UnitSize, 0, 0);
|
|
|
|
} else {
|
|
|
|
if (!isUnary)
|
|
|
|
return isVMerge(N, UnitSize, 8, 24);
|
|
|
|
return isVMerge(N, UnitSize, 8, 8);
|
|
|
|
}
|
2006-04-06 21:11:54 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for
|
2014-06-10 14:35:01 +00:00
|
|
|
/// a VMRGH* instruction with the specified unit size (1,2 or 4 bytes).
|
2010-11-23 03:31:01 +00:00
|
|
|
bool PPC::isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
|
2014-06-10 14:35:01 +00:00
|
|
|
bool isUnary, SelectionDAG &DAG) {
|
|
|
|
if (DAG.getTarget().getDataLayout()->isLittleEndian()) {
|
|
|
|
if (!isUnary)
|
|
|
|
return isVMerge(N, UnitSize, 8, 24);
|
|
|
|
return isVMerge(N, UnitSize, 8, 8);
|
|
|
|
} else {
|
|
|
|
if (!isUnary)
|
|
|
|
return isVMerge(N, UnitSize, 0, 16);
|
|
|
|
return isVMerge(N, UnitSize, 0, 0);
|
|
|
|
}
|
2006-04-06 21:11:54 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2006-04-06 18:26:28 +00:00
|
|
|
/// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the shift
|
|
|
|
/// amount, otherwise return -1.
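///
/// Illustrative sketch (not from the original source): on a big-endian
/// target, the binary form below matches masks of shape <N, N+1, ..., N+15>
/// (a left shift of the 32-byte concatenated input by N bytes), so e.g.
/// <3, 4, ..., 18> yields a shift amount of 3.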
|
2014-06-10 14:35:01 +00:00
|
|
|
int PPC::isVSLDOIShuffleMask(SDNode *N, bool isUnary, SelectionDAG &DAG) {
|
2014-03-26 22:58:37 +00:00
|
|
|
if (N->getValueType(0) != MVT::v16i8)
|
2014-04-08 19:00:27 +00:00
|
|
|
return -1;
|
2009-04-27 18:41:29 +00:00
|
|
|
|
|
|
|
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
|
2010-11-23 03:31:01 +00:00
|
|
|
|
2006-04-06 18:26:28 +00:00
|
|
|
// Find the first non-undef value in the shuffle mask.
|
|
|
|
unsigned i;
|
2009-04-27 18:41:29 +00:00
|
|
|
for (i = 0; i != 16 && SVOp->getMaskElt(i) < 0; ++i)
|
2006-04-06 18:26:28 +00:00
|
|
|
/*search*/;
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2006-04-06 18:26:28 +00:00
|
|
|
if (i == 16) return -1; // all undef.
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2009-04-27 18:41:29 +00:00
|
|
|
// Otherwise, check to see if the rest of the elements are consecutively
|
2006-04-06 18:26:28 +00:00
|
|
|
// numbered from this value.
|
2009-04-27 18:41:29 +00:00
|
|
|
unsigned ShiftAmt = SVOp->getMaskElt(i);
|
2006-04-06 18:26:28 +00:00
|
|
|
if (ShiftAmt < i) return -1;
|
|
|
|
|
2014-06-10 14:35:01 +00:00
|
|
|
if (DAG.getTarget().getDataLayout()->isLittleEndian()) {
|
|
|
|
|
|
|
|
ShiftAmt += i;
|
|
|
|
|
|
|
|
if (!isUnary) {
|
|
|
|
// Check the rest of the elements to see if they are consecutive.
|
|
|
|
for (++i; i != 16; ++i)
|
|
|
|
if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt - i))
|
|
|
|
return -1;
|
|
|
|
} else {
|
|
|
|
// Check the rest of the elements to see if they are consecutive.
|
|
|
|
for (++i; i != 16; ++i)
|
|
|
|
if (!isConstantOrUndef(SVOp->getMaskElt(i), (ShiftAmt - i) & 15))
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
|
|
|
|
} else { // Big Endian
|
|
|
|
|
|
|
|
ShiftAmt -= i;
|
|
|
|
|
|
|
|
if (!isUnary) {
|
|
|
|
// Check the rest of the elements to see if they are consecutive.
|
|
|
|
for (++i; i != 16; ++i)
|
|
|
|
if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i))
|
|
|
|
return -1;
|
|
|
|
} else {
|
|
|
|
// Check the rest of the elements to see if they are consecutive.
|
|
|
|
for (++i; i != 16; ++i)
|
|
|
|
if (!isConstantOrUndef(SVOp->getMaskElt(i), (ShiftAmt+i) & 15))
|
|
|
|
return -1;
|
|
|
|
}
|
2006-04-06 22:28:36 +00:00
|
|
|
}
|
2006-04-06 18:26:28 +00:00
|
|
|
return ShiftAmt;
|
|
|
|
}
|
2006-03-20 06:33:01 +00:00
|
|
|
|
|
|
|
/// isSplatShuffleMask - Return true if the specified VECTOR_SHUFFLE operand
|
|
|
|
/// specifies a splat of a single element that is suitable for input to
|
|
|
|
/// VSPLTB/VSPLTH/VSPLTW.
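///
/// Illustrative sketch (not from the original source): with EltSize == 4,
/// the mask <4,5,6,7, 4,5,6,7, 4,5,6,7, 4,5,6,7> is a valid splat of word
/// element 1 and would be accepted here.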
|
2009-04-27 18:41:29 +00:00
|
|
|
bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) {
|
2009-08-11 20:47:22 +00:00
|
|
|
assert(N->getValueType(0) == MVT::v16i8 &&
|
2006-04-04 17:25:31 +00:00
|
|
|
(EltSize == 1 || EltSize == 2 || EltSize == 4));
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2006-03-20 06:37:44 +00:00
|
|
|
// This is a splat operation if each element of the permute is the same, and
|
|
|
|
// if the value doesn't reference the second vector.
|
2009-04-27 18:41:29 +00:00
|
|
|
unsigned ElementBase = N->getMaskElt(0);
|
2010-11-23 03:31:01 +00:00
|
|
|
|
2009-04-27 18:41:29 +00:00
|
|
|
// FIXME: Handle UNDEF elements too!
|
|
|
|
if (ElementBase >= 16)
|
2006-04-04 17:25:31 +00:00
|
|
|
return false;
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2009-04-27 18:41:29 +00:00
|
|
|
// Check that the indices are consecutive, in the case of a multi-byte element
|
|
|
|
// splatted with a v16i8 mask.
|
|
|
|
for (unsigned i = 1; i != EltSize; ++i)
|
|
|
|
if (N->getMaskElt(i) < 0 || N->getMaskElt(i) != (int)(i+ElementBase))
|
2006-04-04 17:25:31 +00:00
|
|
|
return false;
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2006-04-04 17:25:31 +00:00
|
|
|
for (unsigned i = EltSize, e = 16; i != e; i += EltSize) {
|
2009-04-27 18:41:29 +00:00
|
|
|
if (N->getMaskElt(i) < 0) continue;
|
2006-04-04 17:25:31 +00:00
|
|
|
for (unsigned j = 0; j != EltSize; ++j)
|
2009-04-27 18:41:29 +00:00
|
|
|
if (N->getMaskElt(i+j) != N->getMaskElt(j))
|
2006-04-04 17:25:31 +00:00
|
|
|
return false;
|
2006-03-20 06:37:44 +00:00
|
|
|
}
|
2006-04-04 17:25:31 +00:00
|
|
|
return true;
|
2006-03-20 06:33:01 +00:00
|
|
|
}
|
|
|
|
|
2007-07-30 07:51:22 +00:00
|
|
|
/// isAllNegativeZeroVector - Returns true if all elements of build_vector
|
|
|
|
/// are -0.0.
|
|
|
|
bool PPC::isAllNegativeZeroVector(SDNode *N) {
|
2009-04-27 18:41:29 +00:00
|
|
|
BuildVectorSDNode *BV = cast<BuildVectorSDNode>(N);
|
|
|
|
|
|
|
|
APInt APVal, APUndef;
|
|
|
|
unsigned BitSize;
|
|
|
|
bool HasAnyUndefs;
|
2010-11-23 03:31:01 +00:00
|
|
|
|
2009-11-13 01:45:18 +00:00
|
|
|
if (BV->isConstantSplat(APVal, APUndef, BitSize, HasAnyUndefs, 32, true))
|
2009-04-27 18:41:29 +00:00
|
|
|
if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
|
2007-08-31 04:03:46 +00:00
|
|
|
return CFP->getValueAPF().isNegZero();
|
2009-04-27 18:41:29 +00:00
|
|
|
|
2007-07-30 07:51:22 +00:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2006-03-20 06:33:01 +00:00
|
|
|
/// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the
|
|
|
|
/// specified isSplatShuffleMask VECTOR_SHUFFLE mask.
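///
/// Illustrative sketch (not from the original source): for the EltSize == 4
/// splat mask <4,5,6,7, ...> this returns 1 (vspltw 1) on big-endian
/// targets, and (16/4 - 1) - 1 == 2 on little-endian targets, where element
/// numbering is reversed.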
|
2014-06-10 14:35:01 +00:00
|
|
|
unsigned PPC::getVSPLTImmediate(SDNode *N, unsigned EltSize,
|
|
|
|
SelectionDAG &DAG) {
|
2009-04-27 18:41:29 +00:00
|
|
|
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
|
|
|
|
assert(isSplatShuffleMask(SVOp, EltSize));
|
2014-06-10 14:35:01 +00:00
|
|
|
if (DAG.getTarget().getDataLayout()->isLittleEndian())
|
|
|
|
return (16 / EltSize) - 1 - (SVOp->getMaskElt(0) / EltSize);
|
|
|
|
else
|
|
|
|
return SVOp->getMaskElt(0) / EltSize;
|
2006-03-20 06:33:01 +00:00
|
|
|
}
|
|
|
|
|
2006-04-12 17:37:20 +00:00
|
|
|
/// get_VSPLTI_elt - If this is a build_vector of constants which can be formed
|
2006-04-08 06:46:53 +00:00
|
|
|
/// by using a vspltis[bhw] instruction of the specified element size, return
|
|
|
|
/// the constant being splatted. The ByteSize field indicates the number of
|
|
|
|
/// bytes of each element [124] -> [bhw].
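///
/// Illustrative sketch (not from the original source): a v4i32 build_vector
/// of four 0x01010101 constants is folded down to the single byte 0x01 and,
/// for ByteSize == 1, this returns the constant 1 (vspltisb 1).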
|
2008-07-27 21:46:04 +00:00
|
|
|
SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) {
|
2014-04-25 05:30:21 +00:00
|
|
|
SDValue OpVal(nullptr, 0);
|
2006-04-08 07:14:26 +00:00
|
|
|
|
|
|
|
// If ByteSize of the splat is bigger than the element size of the
|
|
|
|
// build_vector, then we have a case where we are checking for a splat where
|
|
|
|
// multiple elements of the buildvector are folded together into a single
|
|
|
|
// logical element of the splat (e.g. "vspltish 1" to splat {0,1}*8).
|
|
|
|
unsigned EltSize = 16/N->getNumOperands();
|
|
|
|
if (EltSize < ByteSize) {
|
|
|
|
unsigned Multiple = ByteSize/EltSize; // Number of BV entries per spltval.
|
2008-07-27 21:46:04 +00:00
|
|
|
SDValue UniquedVals[4];
|
2006-04-08 07:14:26 +00:00
|
|
|
assert(Multiple > 1 && Multiple <= 4 && "How can this happen?");
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2006-04-08 07:14:26 +00:00
|
|
|
// See if all of the elements in the buildvector agree across chunks.
|
|
|
|
for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
|
|
|
|
if (N->getOperand(i).getOpcode() == ISD::UNDEF) continue;
|
|
|
|
// If the element isn't a constant, bail out completely.
|
2008-07-27 21:46:04 +00:00
|
|
|
if (!isa<ConstantSDNode>(N->getOperand(i))) return SDValue();
|
2006-04-08 07:14:26 +00:00
|
|
|
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2014-04-25 05:30:21 +00:00
|
|
|
if (!UniquedVals[i&(Multiple-1)].getNode())
|
2006-04-08 07:14:26 +00:00
|
|
|
UniquedVals[i&(Multiple-1)] = N->getOperand(i);
|
|
|
|
else if (UniquedVals[i&(Multiple-1)] != N->getOperand(i))
|
2008-07-27 21:46:04 +00:00
|
|
|
return SDValue(); // no match.
|
2006-04-08 07:14:26 +00:00
|
|
|
}
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2006-04-08 07:14:26 +00:00
|
|
|
// Okay, if we reached this point, UniquedVals[0..Multiple-1] contains
|
|
|
|
// either constant or undef values that are identical for each chunk. See
|
|
|
|
// if these chunks can be combined into a larger vspltis*.
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2006-04-08 07:14:26 +00:00
|
|
|
// Check to see if all of the leading entries are either 0 or -1. If
|
|
|
|
// neither, then this won't fit into the immediate field.
|
|
|
|
bool LeadingZero = true;
|
|
|
|
bool LeadingOnes = true;
|
|
|
|
for (unsigned i = 0; i != Multiple-1; ++i) {
|
2014-04-25 05:30:21 +00:00
|
|
|
if (!UniquedVals[i].getNode()) continue; // Must have been undefs.
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2006-04-08 07:14:26 +00:00
|
|
|
LeadingZero &= cast<ConstantSDNode>(UniquedVals[i])->isNullValue();
|
|
|
|
LeadingOnes &= cast<ConstantSDNode>(UniquedVals[i])->isAllOnesValue();
|
|
|
|
}
|
|
|
|
// Finally, check the least significant entry.
|
|
|
|
if (LeadingZero) {
|
2014-04-25 05:30:21 +00:00
|
|
|
if (!UniquedVals[Multiple-1].getNode())
|
2009-08-11 20:47:22 +00:00
|
|
|
return DAG.getTargetConstant(0, MVT::i32); // 0,0,0,undef
|
2008-09-12 16:56:44 +00:00
|
|
|
int Val = cast<ConstantSDNode>(UniquedVals[Multiple-1])->getZExtValue();
|
2006-04-08 07:14:26 +00:00
|
|
|
if (Val < 16)
|
2009-08-11 20:47:22 +00:00
|
|
|
return DAG.getTargetConstant(Val, MVT::i32); // 0,0,0,4 -> vspltisw(4)
|
2006-04-08 07:14:26 +00:00
|
|
|
}
|
|
|
|
if (LeadingOnes) {
|
2014-04-25 05:30:21 +00:00
|
|
|
if (!UniquedVals[Multiple-1].getNode())
|
2009-08-11 20:47:22 +00:00
|
|
|
return DAG.getTargetConstant(~0U, MVT::i32); // -1,-1,-1,undef
|
2008-09-26 21:54:37 +00:00
|
|
|
int Val =cast<ConstantSDNode>(UniquedVals[Multiple-1])->getSExtValue();
|
2006-04-08 07:14:26 +00:00
|
|
|
if (Val >= -16) // -1,-1,-1,-2 -> vspltisw(-2)
|
2009-08-11 20:47:22 +00:00
|
|
|
return DAG.getTargetConstant(Val, MVT::i32);
|
2006-04-08 07:14:26 +00:00
|
|
|
}
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2008-07-27 21:46:04 +00:00
|
|
|
return SDValue();
|
2006-04-08 07:14:26 +00:00
|
|
|
}
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2006-03-25 06:12:06 +00:00
|
|
|
// Check to see if this buildvec has a single non-undef value in its elements.
|
|
|
|
for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
|
|
|
|
if (N->getOperand(i).getOpcode() == ISD::UNDEF) continue;
|
2014-04-25 05:30:21 +00:00
|
|
|
if (!OpVal.getNode())
|
2006-03-25 06:12:06 +00:00
|
|
|
OpVal = N->getOperand(i);
|
|
|
|
else if (OpVal != N->getOperand(i))
|
2008-07-27 21:46:04 +00:00
|
|
|
return SDValue();
|
2006-03-25 06:12:06 +00:00
|
|
|
}
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2014-04-25 05:30:21 +00:00
|
|
|
if (!OpVal.getNode()) return SDValue(); // All UNDEF: use implicit def.
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2009-05-24 02:03:36 +00:00
|
|
|
unsigned ValSizeInBytes = EltSize;
|
2006-03-28 04:15:58 +00:00
|
|
|
uint64_t Value = 0;
|
2006-03-25 06:12:06 +00:00
|
|
|
if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(OpVal)) {
|
2008-09-12 16:56:44 +00:00
|
|
|
Value = CN->getZExtValue();
|
2006-03-25 06:12:06 +00:00
|
|
|
} else if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(OpVal)) {
|
2009-08-11 20:47:22 +00:00
|
|
|
assert(CN->getValueType(0) == MVT::f32 && "Only one legal FP vector type!");
|
2007-08-31 04:03:46 +00:00
|
|
|
Value = FloatToBits(CN->getValueAPF().convertToFloat());
|
2006-03-25 06:12:06 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// If the splat value is larger than the element value, then we can never do
|
|
|
|
// this splat. The only case where the replicated bits would fit into our
|
|
|
|
// immediate field is zero, and we prefer to use vxor for that.
|
2008-07-27 21:46:04 +00:00
|
|
|
if (ValSizeInBytes < ByteSize) return SDValue();
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2006-03-25 06:12:06 +00:00
|
|
|
// If the element value is larger than the splat value, cut it in half and
|
|
|
|
// check to see if the two halves are equal. Continue doing this until we
|
|
|
|
// get to ByteSize. This allows us to handle 0x01010101 as 0x01.
|
|
|
|
while (ValSizeInBytes > ByteSize) {
|
|
|
|
ValSizeInBytes >>= 1;
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2006-03-25 06:12:06 +00:00
|
|
|
// If the top half equals the bottom half, we're still ok.
|
2006-04-05 17:39:25 +00:00
|
|
|
// Use a 64-bit 1 so the shift is well defined when ValSizeInBytes == 4.
if (((Value >> (ValSizeInBytes*8)) & ((1ULL << (8*ValSizeInBytes))-1)) !=
|
|
|
|
(Value & ((1ULL << (8*ValSizeInBytes))-1)))
|
2008-07-27 21:46:04 +00:00
|
|
|
return SDValue();
|
2006-03-25 06:12:06 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Properly sign extend the value.
|
2012-08-24 23:29:28 +00:00
|
|
|
int MaskVal = SignExtend32(Value, ByteSize * 8);
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2006-03-26 09:52:32 +00:00
|
|
|
// If this is zero, don't match; zero matches ISD::isBuildVectorAllZeros.
|
2008-07-27 21:46:04 +00:00
|
|
|
if (MaskVal == 0) return SDValue();
|
2006-03-25 06:12:06 +00:00
|
|
|
|
2006-04-08 06:46:53 +00:00
|
|
|
// Finally, if this value fits in a 5-bit sext field, return it.
|
2012-08-24 23:29:28 +00:00
|
|
|
if (SignExtend32<5>(MaskVal) == MaskVal)
|
2009-08-11 20:47:22 +00:00
|
|
|
return DAG.getTargetConstant(MaskVal, MVT::i32);
|
2008-07-27 21:46:04 +00:00
|
|
|
return SDValue();
|
2006-03-25 06:12:06 +00:00
|
|
|
}
|
|
|
|
|
2006-11-08 02:15:41 +00:00
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
// Addressing Mode Selection
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
/// isIntS16Immediate - This method tests to see if the node is either a 32-bit
|
|
|
|
/// or 64-bit immediate, and if the value can be accurately represented as a
|
|
|
|
/// sign extension from a 16-bit value. If so, this returns true and stores
|
|
|
|
/// the immediate in Imm.
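///
/// Illustrative sketch (not from the original source): an i32 constant of
/// -32768 satisfies this test (Imm becomes -32768), while 32768 does not,
/// since the sign-extended 16-bit range is [-32768, 32767].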
|
|
|
|
static bool isIntS16Immediate(SDNode *N, short &Imm) {
|
2014-05-20 17:20:34 +00:00
|
|
|
if (!isa<ConstantSDNode>(N))
|
2006-11-08 02:15:41 +00:00
|
|
|
return false;
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2008-09-12 16:56:44 +00:00
|
|
|
Imm = (short)cast<ConstantSDNode>(N)->getZExtValue();
|
2009-08-11 20:47:22 +00:00
|
|
|
if (N->getValueType(0) == MVT::i32)
|
2008-09-12 16:56:44 +00:00
|
|
|
return Imm == (int32_t)cast<ConstantSDNode>(N)->getZExtValue();
|
2006-11-08 02:15:41 +00:00
|
|
|
else
|
2008-09-12 16:56:44 +00:00
|
|
|
return Imm == (int64_t)cast<ConstantSDNode>(N)->getZExtValue();
|
2006-11-08 02:15:41 +00:00
|
|
|
}
|
2008-07-27 21:46:04 +00:00
|
|
|
static bool isIntS16Immediate(SDValue Op, short &Imm) {
|
2008-08-28 21:40:38 +00:00
|
|
|
return isIntS16Immediate(Op.getNode(), Imm);
|
2006-11-08 02:15:41 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/// SelectAddressRegReg - Given the specified address, check to see if it
|
|
|
|
/// can be represented as an indexed [r+r] operation. Returns false if it
|
|
|
|
/// can be more efficiently represented with [r+imm].
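///
/// Illustrative sketch (not from the original source): an address of the
/// form (add %r3, %r4) yields Base = %r3, Index = %r4 and returns true,
/// while (add %r3, 100) returns false because the offset fits in a signed
/// 16-bit displacement and [r+imm] is preferred.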
|
2008-07-27 21:46:04 +00:00
|
|
|
bool PPCTargetLowering::SelectAddressRegReg(SDValue N, SDValue &Base,
|
|
|
|
SDValue &Index,
|
2009-01-15 16:29:45 +00:00
|
|
|
SelectionDAG &DAG) const {
|
2006-11-08 02:15:41 +00:00
|
|
|
short imm = 0;
|
|
|
|
if (N.getOpcode() == ISD::ADD) {
|
|
|
|
if (isIntS16Immediate(N.getOperand(1), imm))
|
|
|
|
return false; // r+i
|
|
|
|
if (N.getOperand(1).getOpcode() == PPCISD::Lo)
|
|
|
|
return false; // r+i
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2006-11-08 02:15:41 +00:00
|
|
|
Base = N.getOperand(0);
|
|
|
|
Index = N.getOperand(1);
|
|
|
|
return true;
|
|
|
|
} else if (N.getOpcode() == ISD::OR) {
|
|
|
|
if (isIntS16Immediate(N.getOperand(1), imm))
|
|
|
|
return false; // Prefer r+i; the immediate can be folded there.
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2006-11-08 02:15:41 +00:00
|
|
|
// If this is an or of disjoint bitfields, we can codegen this as an add
|
|
|
|
// (for better address arithmetic) if the LHS and RHS of the OR are provably
|
|
|
|
// disjoint.
|
2008-02-27 01:23:58 +00:00
|
|
|
APInt LHSKnownZero, LHSKnownOne;
|
|
|
|
APInt RHSKnownZero, RHSKnownOne;
|
2014-05-14 21:14:37 +00:00
|
|
|
DAG.computeKnownBits(N.getOperand(0),
|
|
|
|
LHSKnownZero, LHSKnownOne);
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2008-02-27 01:23:58 +00:00
|
|
|
if (LHSKnownZero.getBoolValue()) {
|
2014-05-14 21:14:37 +00:00
|
|
|
DAG.computeKnownBits(N.getOperand(1),
|
|
|
|
RHSKnownZero, RHSKnownOne);
|
2006-11-08 02:15:41 +00:00
|
|
|
// If all of the bits are known zero on the LHS or RHS, the add won't
|
|
|
|
// carry.
|
2008-02-27 21:12:32 +00:00
|
|
|
if (~(LHSKnownZero | RHSKnownZero) == 0) {
|
2006-11-08 02:15:41 +00:00
|
|
|
Base = N.getOperand(0);
|
|
|
|
Index = N.getOperand(1);
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2006-11-08 02:15:41 +00:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2013-07-09 06:34:51 +00:00
|
|
|
// If we happen to be doing an i64 load or store into a stack slot that has
|
|
|
|
// less than a 4-byte alignment, then the frame-index elimination may need to
|
|
|
|
// use an indexed load or store instruction (because the offset may not be a
|
|
|
|
// multiple of 4). The extra register needed to hold the offset comes from the
|
|
|
|
// register scavenger, and it is possible that the scavenger will need to use
|
|
|
|
// an emergency spill slot. As a result, we need to make sure that a spill slot
|
|
|
|
// is allocated when doing an i64 load/store into a less-than-4-byte-aligned
|
|
|
|
// stack slot.
|
|
|
|
static void fixupFuncForFI(SelectionDAG &DAG, int FrameIdx, EVT VT) {
|
|
|
|
// FIXME: This does not handle the LWA case.
|
|
|
|
if (VT != MVT::i64)
|
|
|
|
return;
|
|
|
|
|
2013-07-10 15:29:01 +00:00
|
|
|
// NOTE: We'll exclude negative FIs here, which come from argument
|
|
|
|
// lowering, because there are no known test cases triggering this problem
|
|
|
|
// using packed structures (or similar). We can remove this exclusion if
|
|
|
|
// we find such a test case. This is so test-case driven because this
|
|
|
|
// entire 'fixup' exists only to prevent crashes (from the
|
|
|
|
// register scavenger) on not-really-valid inputs. For example, if we have:
|
|
|
|
// %a = alloca i1
|
|
|
|
// %b = bitcast i1* %a to i64*
|
|
|
|
// store i64 0, i64* %b
|
|
|
|
// then the store should really be marked as 'align 1', but is not. If it
|
|
|
|
// were marked as 'align 1' then the indexed form would have been
|
|
|
|
// instruction-selected initially, and the problem this 'fixup' is preventing
|
|
|
|
// won't happen regardless.
|
2013-07-09 06:34:51 +00:00
|
|
|
if (FrameIdx < 0)
|
|
|
|
return;
|
|
|
|
|
|
|
|
MachineFunction &MF = DAG.getMachineFunction();
|
|
|
|
MachineFrameInfo *MFI = MF.getFrameInfo();
|
|
|
|
|
|
|
|
unsigned Align = MFI->getObjectAlignment(FrameIdx);
|
|
|
|
if (Align >= 4)
|
|
|
|
return;
|
|
|
|
|
|
|
|
PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
|
|
|
|
FuncInfo->setHasNonRISpills();
|
|
|
|
}
|
|
|
|
|
2006-11-08 02:15:41 +00:00
|
|
|
/// Returns true if the address N can be represented by a base register plus
|
|
|
|
/// a signed 16-bit displacement [r+imm], and if it is not better
|
2013-05-16 17:58:02 +00:00
|
|
|
/// represented as reg+reg. If Aligned is true, only accept displacements
|
|
|
|
/// suitable for STD and friends, i.e. multiples of 4.
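///
/// Illustrative sketch (not from the original source): (add %r3, 32000)
/// yields Disp = 32000 and Base = %r3. With Aligned set, an offset such as
/// 32002 is rejected because it is not a multiple of 4, as the DS-form
/// instructions (STD and friends) require.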
|
2008-07-27 21:46:04 +00:00
|
|
|
bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
|
2009-01-15 16:29:45 +00:00
|
|
|
SDValue &Base,
|
2013-05-16 17:58:02 +00:00
|
|
|
SelectionDAG &DAG,
|
|
|
|
bool Aligned) const {
|
2009-02-06 19:16:40 +00:00
|
|
|
// FIXME dl should come from parent load or store, not from address
|
2013-05-25 02:42:55 +00:00
|
|
|
SDLoc dl(N);
|
2006-11-08 02:15:41 +00:00
|
|
|
// If this can be more profitably realized as r+r, fail.
|
|
|
|
if (SelectAddressRegReg(N, Disp, Base, DAG))
|
|
|
|
return false;
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2006-11-08 02:15:41 +00:00
|
|
|
if (N.getOpcode() == ISD::ADD) {
|
|
|
|
short imm = 0;
|
2013-05-16 17:58:02 +00:00
|
|
|
if (isIntS16Immediate(N.getOperand(1), imm) &&
|
|
|
|
(!Aligned || (imm & 3) == 0)) {
|
2013-05-16 14:53:05 +00:00
|
|
|
Disp = DAG.getTargetConstant(imm, N.getValueType());
|
2006-11-08 02:15:41 +00:00
|
|
|
if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) {
|
|
|
|
Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
|
2013-07-09 06:34:51 +00:00
|
|
|
fixupFuncForFI(DAG, FI->getIndex(), N.getValueType());
|
2006-11-08 02:15:41 +00:00
|
|
|
} else {
|
|
|
|
Base = N.getOperand(0);
|
|
|
|
}
|
|
|
|
return true; // [r+i]
|
|
|
|
} else if (N.getOperand(1).getOpcode() == PPCISD::Lo) {
|
|
|
|
// Match LOAD (ADD (X, Lo(G))).
|
2012-04-20 11:41:38 +00:00
|
|
|
assert(!cast<ConstantSDNode>(N.getOperand(1).getOperand(1))->getZExtValue()
|
2006-11-08 02:15:41 +00:00
|
|
|
&& "Cannot handle constant offsets yet!");
|
|
|
|
Disp = N.getOperand(1).getOperand(0); // The global address.
|
|
|
|
assert(Disp.getOpcode() == ISD::TargetGlobalAddress ||
|
2012-06-04 17:36:38 +00:00
|
|
|
Disp.getOpcode() == ISD::TargetGlobalTLSAddress ||
|
2006-11-08 02:15:41 +00:00
|
|
|
Disp.getOpcode() == ISD::TargetConstantPool ||
|
|
|
|
Disp.getOpcode() == ISD::TargetJumpTable);
|
|
|
|
Base = N.getOperand(0);
|
|
|
|
return true; // [&g+r]
|
|
|
|
}
|
|
|
|
} else if (N.getOpcode() == ISD::OR) {
|
|
|
|
short imm = 0;
|
2013-05-16 17:58:02 +00:00
|
|
|
if (isIntS16Immediate(N.getOperand(1), imm) &&
|
|
|
|
(!Aligned || (imm & 3) == 0)) {
|
2006-11-08 02:15:41 +00:00
|
|
|
// If this is an or of disjoint bitfields, we can codegen this as an add
|
|
|
|
// (for better address arithmetic) if the LHS and RHS of the OR are
|
|
|
|
// provably disjoint.
|
2008-02-27 01:23:58 +00:00
|
|
|
APInt LHSKnownZero, LHSKnownOne;
|
2014-05-14 21:14:37 +00:00
|
|
|
DAG.computeKnownBits(N.getOperand(0), LHSKnownZero, LHSKnownOne);
|
2008-03-24 23:16:37 +00:00
|
|
|
|
2008-02-27 01:23:58 +00:00
|
|
|
if ((LHSKnownZero.getZExtValue()|~(uint64_t)imm) == ~0ULL) {
|
2006-11-08 02:15:41 +00:00
|
|
|
// If all of the bits are known zero on the LHS or RHS, the add won't
|
|
|
|
// carry.
|
|
|
|
Base = N.getOperand(0);
|
2013-05-16 14:53:05 +00:00
|
|
|
Disp = DAG.getTargetConstant(imm, N.getValueType());
|
2006-11-08 02:15:41 +00:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
|
|
|
|
// Loading from a constant address.
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2006-11-08 02:15:41 +00:00
|
|
|
// If this address fits entirely in a 16-bit sext immediate field, codegen
|
|
|
|
// this as "d, 0"
|
|
|
|
short Imm;
|
2013-05-16 17:58:02 +00:00
|
|
|
if (isIntS16Immediate(CN, Imm) && (!Aligned || (Imm & 3) == 0)) {
|
2006-11-08 02:15:41 +00:00
|
|
|
Disp = DAG.getTargetConstant(Imm, CN->getValueType(0));
|
2014-06-12 22:38:18 +00:00
|
|
|
Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
|
2013-03-21 23:45:03 +00:00
|
|
|
CN->getValueType(0));
|
2006-11-08 02:15:41 +00:00
|
|
|
return true;
|
|
|
|
}
|
2007-02-17 06:44:03 +00:00
|
|
|
|
|
|
|
// Handle 32-bit sext immediates with LIS + addr mode.
|
2013-05-16 17:58:02 +00:00
|
|
|
if ((CN->getValueType(0) == MVT::i32 ||
|
|
|
|
(int64_t)CN->getZExtValue() == (int)CN->getZExtValue()) &&
|
|
|
|
(!Aligned || (CN->getZExtValue() & 3) == 0)) {
|
2008-09-12 16:56:44 +00:00
|
|
|
int Addr = (int)CN->getZExtValue();
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2006-11-08 02:15:41 +00:00
|
|
|
// Otherwise, break this down into an LIS + disp.
|
2009-08-11 20:47:22 +00:00
|
|
|
Disp = DAG.getTargetConstant((short)Addr, MVT::i32);
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2009-08-11 20:47:22 +00:00
|
|
|
Base = DAG.getTargetConstant((Addr - (signed short)Addr) >> 16, MVT::i32);
|
|
|
|
unsigned Opc = CN->getValueType(0) == MVT::i32 ? PPC::LIS : PPC::LIS8;
|
2009-09-25 18:54:59 +00:00
|
|
|
Base = SDValue(DAG.getMachineNode(Opc, dl, CN->getValueType(0), Base), 0);
|
2006-11-08 02:15:41 +00:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2006-11-08 02:15:41 +00:00
|
|
|
Disp = DAG.getTargetConstant(0, getPointerTy());
|
2013-07-09 06:34:51 +00:00
|
|
|
if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N)) {
|
2006-11-08 02:15:41 +00:00
|
|
|
Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
|
2013-07-09 06:34:51 +00:00
|
|
|
fixupFuncForFI(DAG, FI->getIndex(), N.getValueType());
|
|
|
|
} else
|
2006-11-08 02:15:41 +00:00
|
|
|
Base = N;
|
|
|
|
return true; // [r+0]
|
|
|
|
}
|
|
|
|
|
|
|
|
/// SelectAddressRegRegOnly - Given the specified address, force it to be
|
|
|
|
/// represented as an indexed [r+r] operation.
|
2008-07-27 21:46:04 +00:00
|
|
|
bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base,
|
|
|
|
SDValue &Index,
|
2009-01-15 16:29:45 +00:00
|
|
|
SelectionDAG &DAG) const {
|
2006-11-08 02:15:41 +00:00
|
|
|
// Check to see if we can easily represent this as an [r+r] address. This
|
|
|
|
// will fail if it thinks that the address is more profitably represented as
|
|
|
|
// reg+imm, e.g. where imm = 0.
|
|
|
|
if (SelectAddressRegReg(N, Base, Index, DAG))
|
|
|
|
return true;
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2006-11-08 02:15:41 +00:00
|
|
|
// If the operand is an addition, always emit this as [r+r], since this is
|
|
|
|
// better (for code size, and execution, as the memop does the add for free)
|
|
|
|
// than emitting an explicit add.
|
|
|
|
if (N.getOpcode() == ISD::ADD) {
|
|
|
|
Base = N.getOperand(0);
|
|
|
|
Index = N.getOperand(1);
|
|
|
|
return true;
|
|
|
|
}
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2006-11-08 02:15:41 +00:00
|
|
|
// Otherwise, do it the hard way, using R0 as the base register.
|
2014-06-12 22:38:18 +00:00
|
|
|
Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
|
2013-03-21 23:45:03 +00:00
|
|
|
N.getValueType());
|
2006-11-08 02:15:41 +00:00
|
|
|
Index = N;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// getPreIndexedAddressParts - Returns true by value, and the base pointer,
|
|
|
|
/// offset pointer, and addressing mode by reference, if the node's address
|
|
|
|
/// can be legally represented as a pre-indexed load / store address.
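///
/// Illustrative sketch (not from the original source): an i32 load from
/// (add %r3, 16) can come back with Base = %r3, Offset = 16 and
/// AM = ISD::PRE_INC, which typically selects to an update-form load such
/// as lwzu.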
|
2008-07-27 21:46:04 +00:00
|
|
|
bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
|
|
|
|
SDValue &Offset,
|
2006-11-09 17:55:04 +00:00
|
|
|
ISD::MemIndexedMode &AM,
|
2009-01-15 16:29:45 +00:00
|
|
|
SelectionDAG &DAG) const {
|
2012-06-04 02:21:00 +00:00
|
|
|
if (DisablePPCPreinc) return false;
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2013-03-22 14:58:48 +00:00
|
|
|
bool isLoad = true;
|
2008-07-27 21:46:04 +00:00
|
|
|
SDValue Ptr;
|
2009-08-10 22:56:29 +00:00
|
|
|
EVT VT;
|
2013-03-18 23:00:58 +00:00
|
|
|
unsigned Alignment;
|
2006-11-08 02:15:41 +00:00
|
|
|
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
|
|
|
|
Ptr = LD->getBasePtr();
|
2008-01-30 00:15:11 +00:00
|
|
|
VT = LD->getMemoryVT();
|
2013-03-18 23:00:58 +00:00
|
|
|
Alignment = LD->getAlignment();
|
2006-11-08 02:15:41 +00:00
|
|
|
} else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
|
2006-11-14 01:38:31 +00:00
|
|
|
Ptr = ST->getBasePtr();
|
2008-01-30 00:15:11 +00:00
|
|
|
VT = ST->getMemoryVT();
|
2013-03-18 23:00:58 +00:00
|
|
|
Alignment = ST->getAlignment();
|
2013-03-22 14:58:48 +00:00
|
|
|
isLoad = false;
|
2006-11-08 02:15:41 +00:00
|
|
|
} else
|
|
|
|
return false;
|
|
|
|
|
2006-11-14 01:38:31 +00:00
|
|
|
// PowerPC doesn't have preinc load/store instructions for vectors.
|
2008-06-06 12:08:01 +00:00
|
|
|
if (VT.isVector())
|
2006-11-14 01:38:31 +00:00
|
|
|
return false;
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2013-03-22 14:58:48 +00:00
|
|
|
if (SelectAddressRegReg(Ptr, Base, Offset, DAG)) {
|
|
|
|
|
|
|
|
// Common code will reject creating a pre-inc form if the base pointer
|
|
|
|
// is a frame index, or if N is a store and the base pointer is either
|
|
|
|
// the same as or a predecessor of the value being stored. Check for
|
|
|
|
// those situations here, and try with swapped Base/Offset instead.
|
|
|
|
bool Swap = false;
|
|
|
|
|
|
|
|
if (isa<FrameIndexSDNode>(Base) || isa<RegisterSDNode>(Base))
|
|
|
|
Swap = true;
|
|
|
|
else if (!isLoad) {
|
|
|
|
SDValue Val = cast<StoreSDNode>(N)->getValue();
|
|
|
|
if (Val == Base || Base.getNode()->isPredecessorOf(Val.getNode()))
|
|
|
|
Swap = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (Swap)
|
|
|
|
std::swap(Base, Offset);
|
|
|
|
|
2012-06-20 15:43:03 +00:00
|
|
|
AM = ISD::PRE_INC;
|
|
|
|
return true;
|
2012-06-19 02:34:32 +00:00
|
|
|
}
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2013-05-16 17:58:02 +00:00
|
|
|
// LDU/STU can only handle immediates that are a multiple of 4.
|
2009-08-11 20:47:22 +00:00
|
|
|
if (VT != MVT::i64) {
|
2013-05-16 17:58:02 +00:00
|
|
|
if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, false))
|
2006-11-15 19:55:13 +00:00
|
|
|
return false;
|
|
|
|
} else {
|
2013-03-18 23:00:58 +00:00
|
|
|
// LDU/STU need an address with at least 4-byte alignment.
|
|
|
|
if (Alignment < 4)
|
|
|
|
return false;
|
|
|
|
|
2013-05-16 17:58:02 +00:00
|
|
|
if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, true))
|
2006-11-15 19:55:13 +00:00
|
|
|
return false;
|
|
|
|
}
|
2006-11-11 00:08:42 +00:00
|
|
|
|
|
|
|
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
|
2006-11-15 19:55:13 +00:00
|
|
|
// PPC64 doesn't have lwau, but it does have lwaux. Reject preinc load of
|
|
|
|
// sext i32 to i64 when addr mode is r+i.
|
2009-08-11 20:47:22 +00:00
|
|
|
if (LD->getValueType(0) == MVT::i64 && LD->getMemoryVT() == MVT::i32 &&
|
2006-11-11 00:08:42 +00:00
|
|
|
LD->getExtensionType() == ISD::SEXTLOAD &&
|
|
|
|
isa<ConstantSDNode>(Offset))
|
|
|
|
return false;
|
2009-02-17 22:15:04 +00:00
|
|
|
}
|
|
|
|
|
2006-11-10 02:08:47 +00:00
|
|
|
AM = ISD::PRE_INC;
|
|
|
|
return true;
|
2006-11-08 02:15:41 +00:00
|
|
|
}
|
|
|
|
|
2006-04-14 06:01:58 +00:00
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
// LowerOperation implementation
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
2010-11-15 02:46:57 +00:00
|
|
|
/// GetLabelAccessInfo - Return true if we should reference labels using a
|
|
|
|
/// PICBase, set the HiOpFlags and LoOpFlags to the target MO flags.
|
|
|
|
static bool GetLabelAccessInfo(const TargetMachine &TM, unsigned &HiOpFlags,
|
2014-04-25 05:30:21 +00:00
|
|
|
unsigned &LoOpFlags,
|
|
|
|
const GlobalValue *GV = nullptr) {
|
2013-06-21 14:42:20 +00:00
|
|
|
HiOpFlags = PPCII::MO_HA;
|
|
|
|
LoOpFlags = PPCII::MO_LO;
|
2010-11-23 03:31:01 +00:00
|
|
|
|
2010-11-15 02:46:57 +00:00
|
|
|
// Don't use the PIC base if we are not in the PIC relocation model, or if
|
|
|
|
// we are on a non-Darwin platform; we don't support PIC elsewhere yet.
|
2010-11-23 03:31:01 +00:00
|
|
|
bool isPIC = TM.getRelocationModel() == Reloc::PIC_ &&
|
2010-11-15 02:46:57 +00:00
|
|
|
TM.getSubtarget<PPCSubtarget>().isDarwin();
|
2010-11-15 03:13:19 +00:00
|
|
|
if (isPIC) {
|
|
|
|
HiOpFlags |= PPCII::MO_PIC_FLAG;
|
|
|
|
LoOpFlags |= PPCII::MO_PIC_FLAG;
|
|
|
|
}
|
|
|
|
|
|
|
|
// If this is a reference to a global value that requires a non-lazy-ptr, make
|
|
|
|
// sure that instruction lowering adds it.
|
|
|
|
if (GV && TM.getSubtarget<PPCSubtarget>().hasLazyResolverStub(GV, TM)) {
|
|
|
|
HiOpFlags |= PPCII::MO_NLP_FLAG;
|
|
|
|
LoOpFlags |= PPCII::MO_NLP_FLAG;
|
2010-11-23 03:31:01 +00:00
|
|
|
|
2010-11-15 03:13:19 +00:00
|
|
|
if (GV->hasHiddenVisibility()) {
|
|
|
|
HiOpFlags |= PPCII::MO_NLP_HIDDEN_FLAG;
|
|
|
|
LoOpFlags |= PPCII::MO_NLP_HIDDEN_FLAG;
|
|
|
|
}
|
|
|
|
}
|
2010-11-23 03:31:01 +00:00
|
|
|
|
2010-11-15 02:46:57 +00:00
|
|
|
return isPIC;
|
|
|
|
}
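// A minimal sketch of the addressing sequence the helper below builds (not
// from the original source): for a non-PIC ELF global ga, the Hi/Lo pair
// typically assembles to
//   lis  r3, ga@ha
//   addi r3, r3, ga@l
// with an additional GlobalBaseReg add folded into the high part under PIC.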
|
|
|
|
|
|
|
|
static SDValue LowerLabelRef(SDValue HiPart, SDValue LoPart, bool isPIC,
|
|
|
|
SelectionDAG &DAG) {
|
|
|
|
EVT PtrVT = HiPart.getValueType();
|
|
|
|
SDValue Zero = DAG.getConstant(0, PtrVT);
|
2013-05-25 02:42:55 +00:00
|
|
|
SDLoc DL(HiPart);
|
2010-11-15 02:46:57 +00:00
|
|
|
|
|
|
|
SDValue Hi = DAG.getNode(PPCISD::Hi, DL, PtrVT, HiPart, Zero);
|
|
|
|
SDValue Lo = DAG.getNode(PPCISD::Lo, DL, PtrVT, LoPart, Zero);
|
2010-11-23 03:31:01 +00:00
|
|
|
|
2010-11-15 02:46:57 +00:00
|
|
|
// With PIC, the first instruction is actually "GR+hi(&G)".
|
|
|
|
if (isPIC)
|
|
|
|
Hi = DAG.getNode(ISD::ADD, DL, PtrVT,
|
|
|
|
DAG.getNode(PPCISD::GlobalBaseReg, DL, PtrVT), Hi);
|
2010-11-23 03:31:01 +00:00
|
|
|
|
2010-11-15 02:46:57 +00:00
|
|
|
// Generate non-pic code that has direct accesses to the constant pool.
|
|
|
|
// The address of the global is just (hi(&g)+lo(&g)).
|
|
|
|
return DAG.getNode(ISD::ADD, DL, PtrVT, Hi, Lo);
|
|
|
|
}
|
|
|
|
|
2009-02-17 22:15:04 +00:00
|
|
|
SDValue PPCTargetLowering::LowerConstantPool(SDValue Op,
|
2010-04-17 15:26:15 +00:00
|
|
|
SelectionDAG &DAG) const {
|
2009-08-10 22:56:29 +00:00
|
|
|
EVT PtrVT = Op.getValueType();
|
2006-04-14 06:01:58 +00:00
|
|
|
ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
|
2010-04-15 01:51:59 +00:00
|
|
|
const Constant *C = CP->getConstVal();
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2012-08-24 16:26:02 +00:00
|
|
|
// 64-bit SVR4 ABI code is always position-independent.
|
|
|
|
// The actual address of the GlobalValue is stored in the TOC.
|
2014-06-12 22:38:18 +00:00
|
|
|
if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) {
|
2012-08-24 16:26:02 +00:00
|
|
|
SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0);
|
2013-05-25 02:42:55 +00:00
|
|
|
return DAG.getNode(PPCISD::TOC_ENTRY, SDLoc(CP), MVT::i64, GA,
|
2012-08-24 16:26:02 +00:00
|
|
|
DAG.getRegister(PPC::X2, MVT::i64));
|
|
|
|
}
|
|
|
|
|
2010-11-15 02:46:57 +00:00
|
|
|
unsigned MOHiFlag, MOLoFlag;
|
|
|
|
bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag);
|
|
|
|
SDValue CPIHi =
|
|
|
|
DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0, MOHiFlag);
|
|
|
|
SDValue CPILo =
|
|
|
|
DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0, MOLoFlag);
|
|
|
|
return LowerLabelRef(CPIHi, CPILo, isPIC, DAG);
|
2006-04-14 06:01:58 +00:00
|
|
|
}
|
|
|
|
|
2010-04-17 15:26:15 +00:00
|
|
|
SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
|
2009-08-10 22:56:29 +00:00
|
|
|
EVT PtrVT = Op.getValueType();
|
2006-04-22 18:53:45 +00:00
|
|
|
JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
|
2010-11-23 03:31:01 +00:00
|
|
|
|
2012-08-24 16:26:02 +00:00
|
|
|
// 64-bit SVR4 ABI code is always position-independent.
|
|
|
|
// The actual address of the GlobalValue is stored in the TOC.
|
2014-06-12 22:38:18 +00:00
|
|
|
if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) {
|
2012-08-24 16:26:02 +00:00
|
|
|
SDValue GA = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
|
2013-05-25 02:42:55 +00:00
|
|
|
return DAG.getNode(PPCISD::TOC_ENTRY, SDLoc(JT), MVT::i64, GA,
|
2012-08-24 16:26:02 +00:00
|
|
|
DAG.getRegister(PPC::X2, MVT::i64));
|
|
|
|
}
|
|
|
|
|
2010-11-15 02:46:57 +00:00
|
|
|
unsigned MOHiFlag, MOLoFlag;
|
|
|
|
bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag);
|
|
|
|
SDValue JTIHi = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOHiFlag);
|
|
|
|
SDValue JTILo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, MOLoFlag);
|
|
|
|
return LowerLabelRef(JTIHi, JTILo, isPIC, DAG);
|
2007-07-11 17:19:51 +00:00
|
|
|
}
|
|
|
|
|
2010-04-17 15:26:15 +00:00
|
|
|
SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op,
|
|
|
|
SelectionDAG &DAG) const {
|
2009-11-04 21:31:18 +00:00
|
|
|
EVT PtrVT = Op.getValueType();
|
|
|
|
|
2010-04-15 01:51:59 +00:00
|
|
|
const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
|
2010-11-23 03:31:01 +00:00
|
|
|
|
2010-11-15 02:46:57 +00:00
|
|
|
unsigned MOHiFlag, MOLoFlag;
|
|
|
|
bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag);
|
2012-09-12 21:43:09 +00:00
|
|
|
SDValue TgtBAHi = DAG.getTargetBlockAddress(BA, PtrVT, 0, MOHiFlag);
|
|
|
|
SDValue TgtBALo = DAG.getTargetBlockAddress(BA, PtrVT, 0, MOLoFlag);
|
2010-11-15 02:46:57 +00:00
|
|
|
return LowerLabelRef(TgtBAHi, TgtBALo, isPIC, DAG);
|
2009-11-04 21:31:18 +00:00
|
|
|
}
|
|
|
|
|
2012-06-04 17:36:38 +00:00
|
|
|
SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
|
|
|
|
SelectionDAG &DAG) const {
|
|
|
|
|
2013-09-17 20:22:05 +00:00
|
|
|
// FIXME: TLS addresses currently use medium model code sequences,
|
|
|
|
// which is the most useful form. Eventually support for small and
|
|
|
|
// large models could be added if users need it, at the cost of
|
|
|
|
// additional complexity.
|
2012-06-04 17:36:38 +00:00
|
|
|
GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
|
2013-05-25 02:42:55 +00:00
|
|
|
SDLoc dl(GA);
|
2012-06-04 17:36:38 +00:00
|
|
|
const GlobalValue *GV = GA->getGlobal();
|
|
|
|
EVT PtrVT = getPointerTy();
|
2014-06-12 22:38:18 +00:00
|
|
|
bool is64bit = Subtarget.isPPC64();
|
2012-06-04 17:36:38 +00:00
|
|
|
|
2012-12-04 16:18:08 +00:00
|
|
|
TLSModel::Model Model = getTargetMachine().getTLSModel(GV);
|
|
|
|
|
|
|
|
if (Model == TLSModel::LocalExec) {
|
|
|
|
SDValue TGAHi = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
|
2013-06-21 14:42:20 +00:00
|
|
|
PPCII::MO_TPREL_HA);
|
2012-12-04 16:18:08 +00:00
|
|
|
SDValue TGALo = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
|
2013-06-21 14:42:20 +00:00
|
|
|
PPCII::MO_TPREL_LO);
|
2012-12-04 16:18:08 +00:00
|
|
|
SDValue TLSReg = DAG.getRegister(is64bit ? PPC::X13 : PPC::R2,
|
|
|
|
is64bit ? MVT::i64 : MVT::i32);
|
|
|
|
SDValue Hi = DAG.getNode(PPCISD::Hi, dl, PtrVT, TGAHi, TLSReg);
|
|
|
|
return DAG.getNode(PPCISD::Lo, dl, PtrVT, TGALo, Hi);
|
|
|
|
}
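// For reference (a sketch, not from the original source), the local-exec
// sequence built above typically assembles to:
//   addis rX, r13, x@tprel@ha   // r13 is the 64-bit thread pointer
//   addi  rX, rX, x@tprel@l     // (r2 is used on 32-bit targets)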
|
|
|
|
|
This patch implements the general dynamic TLS model for 64-bit PowerPC.
Given a thread-local symbol x with global-dynamic access, the generated
code to obtain x's address is:
Instruction Relocation Symbol
addis ra,r2,x@got@tlsgd@ha R_PPC64_GOT_TLSGD16_HA x
addi r3,ra,x@got@tlsgd@l R_PPC64_GOT_TLSGD16_L x
bl __tls_get_addr(x@tlsgd) R_PPC64_TLSGD x
R_PPC64_REL24 __tls_get_addr
nop
<use address in r3>
The implementation borrows from the medium code model work for introducing
special forms of ADDIS and ADDI into the DAG representation. This is made
slightly more complicated by having to introduce a call to the external
function __tls_get_addr. Using the full call machinery is overkill and,
more importantly, makes it difficult to add a special relocation. So I've
introduced another opcode GET_TLS_ADDR to represent the function call, and
surrounded it with register copies to set up the parameter and return value.
Most of the code is pretty straightforward. I ran into one peculiarity
when I introduced a new PPC opcode BL8_NOP_ELF_TLSGD, which is just like
BL8_NOP_ELF except that it takes another parameter to represent the symbol
("x" above) that requires a relocation on the call. Something in the
TblGen machinery causes BL8_NOP_ELF and BL8_NOP_ELF_TLSGD to be treated
identically during the emit phase, so this second operand was never
visited to generate relocations. This is the reason for the slightly
messy workaround in PPCMCCodeEmitter.cpp:getDirectBrEncoding().
Two new tests are included to demonstrate correct external assembly and
correct generation of relocations using the integrated assembler.
Comments welcome!
Thanks,
Bill
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@169910 91177308-0d34-0410-b5e6-96231b3b80d8
2012-12-11 20:30:11 +00:00
|
|
|
if (Model == TLSModel::InitialExec) {
|
2012-12-13 18:45:54 +00:00
|
|
|
SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0);
|
2013-07-05 12:22:36 +00:00
|
|
|
SDValue TGATLS = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
|
|
|
|
PPCII::MO_TLS);
|
2013-12-20 18:08:54 +00:00
|
|
|
SDValue GOTPtr;
|
|
|
|
if (is64bit) {
|
|
|
|
SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
|
|
|
|
GOTPtr = DAG.getNode(PPCISD::ADDIS_GOT_TPREL_HA, dl,
|
|
|
|
PtrVT, GOTReg, TGA);
|
|
|
|
} else
|
|
|
|
GOTPtr = DAG.getNode(PPCISD::PPC32_GOT, dl, PtrVT);
|
This patch improves the 64-bit PowerPC InitialExec TLS support by providing
for a wider range of GOT entries that can hold thread-relative offsets.
This matches the behavior of GCC; the sequence was not documented in the PPC64
TLS ABI, which will be updated with the new code sequence.
Former sequence:
ld 9,x@got@tprel(2)
add 9,9,x@tls
New sequence:
addis 9,2,x@got@tprel@ha
ld 9,x@got@tprel@l(9)
add 9,9,x@tls
Note that a linker optimization exists to transform the new sequence into
the shorter sequence when appropriate, by replacing the addis with a nop
and modifying the base register and relocation type of the ld.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@170209 91177308-0d34-0410-b5e6-96231b3b80d8
2012-12-14 17:02:38 +00:00
|
|
|
SDValue TPOffset = DAG.getNode(PPCISD::LD_GOT_TPREL_L, dl,
|
2013-12-20 18:08:54 +00:00
|
|
|
PtrVT, TGA, GOTPtr);
|
2013-07-05 12:22:36 +00:00
|
|
|
return DAG.getNode(PPCISD::ADD_TLS, dl, PtrVT, TPOffset, TGATLS);
|
2012-12-11 20:30:11 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
if (Model == TLSModel::GeneralDynamic) {
|
|
|
|
SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0);
|
|
|
|
SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
|
|
|
|
SDValue GOTEntryHi = DAG.getNode(PPCISD::ADDIS_TLSGD_HA, dl, PtrVT,
|
|
|
|
GOTReg, TGA);
|
|
|
|
SDValue GOTEntry = DAG.getNode(PPCISD::ADDI_TLSGD_L, dl, PtrVT,
|
|
|
|
GOTEntryHi, TGA);
|
|
|
|
|
|
|
|
// We need a chain node, and don't have one handy. The underlying
|
|
|
|
// call has no side effects, so using the function entry node
|
|
|
|
// suffices.
|
|
|
|
SDValue Chain = DAG.getEntryNode();
|
|
|
|
Chain = DAG.getCopyToReg(Chain, dl, PPC::X3, GOTEntry);
|
|
|
|
SDValue ParmReg = DAG.getRegister(PPC::X3, MVT::i64);
|
|
|
|
SDValue TLSAddr = DAG.getNode(PPCISD::GET_TLS_ADDR, dl,
|
|
|
|
PtrVT, ParmReg, TGA);
|
2012-12-12 19:29:35 +00:00
|
|
|
// The return value from GET_TLS_ADDR really is in X3 already, but
|
|
|
|
// some hacks are needed here to tie everything together. The extra
|
|
|
|
// copies dissolve during subsequent transforms.
|
2012-12-11 20:30:11 +00:00
|
|
|
Chain = DAG.getCopyToReg(Chain, dl, PPC::X3, TLSAddr);
|
|
|
|
return DAG.getCopyFromReg(Chain, dl, PPC::X3, PtrVT);
|
|
|
|
}
|
|
|
|
|
2012-12-12 19:29:35 +00:00
|
|
|
if (Model == TLSModel::LocalDynamic) {
|
|
|
|
SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0);
|
|
|
|
SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64);
|
|
|
|
SDValue GOTEntryHi = DAG.getNode(PPCISD::ADDIS_TLSLD_HA, dl, PtrVT,
|
|
|
|
GOTReg, TGA);
|
|
|
|
SDValue GOTEntry = DAG.getNode(PPCISD::ADDI_TLSLD_L, dl, PtrVT,
|
|
|
|
GOTEntryHi, TGA);
|
|
|
|
|
|
|
|
// We need a chain node, and don't have one handy. The underlying
|
|
|
|
// call has no side effects, so using the function entry node
|
|
|
|
// suffices.
|
|
|
|
SDValue Chain = DAG.getEntryNode();
|
|
|
|
Chain = DAG.getCopyToReg(Chain, dl, PPC::X3, GOTEntry);
|
|
|
|
SDValue ParmReg = DAG.getRegister(PPC::X3, MVT::i64);
|
|
|
|
SDValue TLSAddr = DAG.getNode(PPCISD::GET_TLSLD_ADDR, dl,
|
|
|
|
PtrVT, ParmReg, TGA);
|
|
|
|
// The return value from GET_TLSLD_ADDR really is in X3 already, but
|
|
|
|
// some hacks are needed here to tie everything together. The extra
|
|
|
|
// copies dissolve during subsequent transforms.
|
|
|
|
Chain = DAG.getCopyToReg(Chain, dl, PPC::X3, TLSAddr);
|
|
|
|
SDValue DtvOffsetHi = DAG.getNode(PPCISD::ADDIS_DTPREL_HA, dl, PtrVT,
|
2012-12-13 20:57:10 +00:00
|
|
|
Chain, ParmReg, TGA);
|
2012-12-12 19:29:35 +00:00
|
|
|
return DAG.getNode(PPCISD::ADDI_DTPREL_L, dl, PtrVT, DtvOffsetHi, TGA);
|
|
|
|
}
|
|
|
|
|
|
|
|
llvm_unreachable("Unknown TLS model!");
|
2012-06-04 17:36:38 +00:00
|
|
|
}
|
|
|
|
|
2009-02-17 22:15:04 +00:00
|
|
|
SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op,
|
2010-04-17 15:26:15 +00:00
|
|
|
SelectionDAG &DAG) const {
|
2009-08-10 22:56:29 +00:00
|
|
|
EVT PtrVT = Op.getValueType();
|
2006-04-14 06:01:58 +00:00
|
|
|
GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op);
|
2013-05-25 02:42:55 +00:00
|
|
|
SDLoc DL(GSDN);
|
2010-07-06 22:08:15 +00:00
|
|
|
const GlobalValue *GV = GSDN->getGlobal();
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2009-08-15 11:54:46 +00:00
|
|
|
// 64-bit SVR4 ABI code is always position-independent.
|
|
|
|
// The actual address of the GlobalValue is stored in the TOC.
|
2014-06-12 22:38:18 +00:00
|
|
|
if (Subtarget.isSVR4ABI() && Subtarget.isPPC64()) {
|
2010-11-15 02:46:57 +00:00
|
|
|
SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset());
|
|
|
|
return DAG.getNode(PPCISD::TOC_ENTRY, DL, MVT::i64, GA,
|
2009-08-15 11:54:46 +00:00
|
|
|
DAG.getRegister(PPC::X2, MVT::i64));
|
|
|
|
}
|
|
|
|
|
2010-11-15 03:13:19 +00:00
|
|
|
unsigned MOHiFlag, MOLoFlag;
|
|
|
|
bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag, GV);
|
2010-11-15 02:46:57 +00:00
|
|
|
|
2010-11-15 03:13:19 +00:00
|
|
|
SDValue GAHi =
|
|
|
|
DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), MOHiFlag);
|
|
|
|
SDValue GALo =
|
|
|
|
DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), MOLoFlag);
|
2010-11-23 03:31:01 +00:00
|
|
|
|
2010-11-15 03:13:19 +00:00
|
|
|
SDValue Ptr = LowerLabelRef(GAHi, GALo, isPIC, DAG);
|
|
|
|
|
|
|
|
// If the global reference is actually to a non-lazy-pointer, we have to do an
|
|
|
|
// extra load to get the address of the global.
|
|
|
|
if (MOHiFlag & PPCII::MO_NLP_FLAG)
|
|
|
|
Ptr = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Ptr, MachinePointerInfo(),
|
2011-11-08 18:42:53 +00:00
|
|
|
false, false, false, 0);
|
2010-11-15 03:13:19 +00:00
|
|
|
return Ptr;
|
2006-04-14 06:01:58 +00:00
|
|
|
}
|
|
|
|
|
2010-04-17 15:26:15 +00:00
|
|
|
SDValue PPCTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
|
2006-04-14 06:01:58 +00:00
|
|
|
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
|
2013-05-25 02:42:55 +00:00
|
|
|
SDLoc dl(Op);
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2014-03-29 16:04:40 +00:00
|
|
|
if (Op.getValueType() == MVT::v2i64) {
|
|
|
|
// When the operands themselves are v2i64 values, we need to do something
|
|
|
|
// special because VSX has no underlying comparison operations for these.
|
|
|
|
if (Op.getOperand(0).getValueType() == MVT::v2i64) {
|
|
|
|
// Equality can be handled by casting to the legal type for Altivec
|
|
|
|
// comparisons; everything else needs to be expanded.
|
|
|
|
if (CC == ISD::SETEQ || CC == ISD::SETNE) {
|
|
|
|
return DAG.getNode(ISD::BITCAST, dl, MVT::v2i64,
|
|
|
|
DAG.getSetCC(dl, MVT::v4i32,
|
|
|
|
DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(0)),
|
|
|
|
DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(1)),
|
|
|
|
CC));
|
|
|
|
}
|
|
|
|
|
|
|
|
return SDValue();
|
|
|
|
}
|
|
|
|
|
|
|
|
// We handle most of these in the usual way.
|
|
|
|
return Op;
|
|
|
|
}
|
|
|
|
|
2006-04-14 06:01:58 +00:00
|
|
|
// If we're comparing for equality to zero, expose the fact that this is
|
|
|
|
// implemented as a ctlz/srl pair on PPC, so that the DAG combiner can
|
|
|
|
// fold the new nodes.
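// For example (a sketch, not from the original source), for i32 this turns
// (seteq X, 0) into (srl (ctlz X), 5): ctlz produces 32 only when X == 0,
// and 32 >> 5 == 1.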
|
|
|
|
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
|
|
|
|
if (C->isNullValue() && CC == ISD::SETEQ) {
|
2009-08-10 22:56:29 +00:00
|
|
|
EVT VT = Op.getOperand(0).getValueType();
|
2008-07-27 21:46:04 +00:00
|
|
|
SDValue Zext = Op.getOperand(0);
|
2009-08-11 20:47:22 +00:00
|
|
|
if (VT.bitsLT(MVT::i32)) {
|
|
|
|
VT = MVT::i32;
|
2009-02-04 01:48:28 +00:00
|
|
|
Zext = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Op.getOperand(0));
|
2009-02-17 22:15:04 +00:00
|
|
|
}
|
2008-06-06 12:08:01 +00:00
|
|
|
unsigned Log2b = Log2_32(VT.getSizeInBits());
|
2009-02-04 01:48:28 +00:00
|
|
|
SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Zext);
|
|
|
|
SDValue Scc = DAG.getNode(ISD::SRL, dl, VT, Clz,
|
2009-08-11 20:47:22 +00:00
|
|
|
DAG.getConstant(Log2b, MVT::i32));
|
|
|
|
return DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Scc);
|
2006-04-14 06:01:58 +00:00
|
|
|
}
|
2009-02-17 22:15:04 +00:00
|
|
|
// Leave comparisons against 0 and -1 alone for now, since they're usually
|
2006-04-14 06:01:58 +00:00
|
|
|
// optimized. FIXME: revisit this when we can custom lower all setcc
|
|
|
|
// optimizations.
|
|
|
|
if (C->isAllOnesValue() || C->isNullValue())
|
2008-07-27 21:46:04 +00:00
|
|
|
return SDValue();
|
2006-04-14 06:01:58 +00:00
|
|
|
}

  // If we have an integer seteq/setne, turn it into a compare against zero
  // by xor'ing the rhs with the lhs, which is faster than setting a
  // condition register, reading it back out, and masking the correct bit.
  // The normal approach here uses sub to do this instead of xor.  Using xor
  // exposes the result to other bit-twiddling opportunities.
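  // (Historical example from when this switched from sub to xor: comparing
  //  two 4-bit bitfields compiles to lwz/lwz/xor/rlwinm/cntlzw/srwi, one
  //  instruction shorter than the old subf-based sequence, because the field
  //  extraction folds into the xor'd value; the commit notes this saved a
  //  cycle.)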
  EVT LHSVT = Op.getOperand(0).getValueType();
  if (LHSVT.isInteger() && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
    EVT VT = Op.getValueType();
    SDValue Sub = DAG.getNode(ISD::XOR, dl, LHSVT, Op.getOperand(0),
                              Op.getOperand(1));
    return DAG.getSetCC(dl, VT, Sub, DAG.getConstant(0, LHSVT), CC);
  }
  return SDValue();
}

SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG,
                                      const PPCSubtarget &Subtarget) const {
  SDNode *Node = Op.getNode();
  EVT VT = Node->getValueType(0);
  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
  SDValue InChain = Node->getOperand(0);
  SDValue VAListPtr = Node->getOperand(1);
  const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
  SDLoc dl(Node);

  assert(!Subtarget.isPPC64() && "LowerVAARG is PPC32 only");
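
  // (Recap of the 32-bit SVR4 va_list this code walks, as laid out in
  //  LowerVASTART below: byte 0 holds the gpr count, byte 1 the fpr count,
  //  bytes 4..7 the overflow_arg_area pointer, and bytes 8..11 the
  //  reg_save_area pointer; FPRs begin 32 bytes into the register save area,
  //  after the eight GPR words.)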

  // gpr_index
  SDValue GprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain,
                                    VAListPtr, MachinePointerInfo(SV), MVT::i8,
                                    false, false, 0);
  InChain = GprIndex.getValue(1);

  if (VT == MVT::i64) {
    // Check if GprIndex is even
    SDValue GprAnd = DAG.getNode(ISD::AND, dl, MVT::i32, GprIndex,
                                 DAG.getConstant(1, MVT::i32));
    SDValue CC64 = DAG.getSetCC(dl, MVT::i32, GprAnd,
                                DAG.getConstant(0, MVT::i32), ISD::SETNE);
    SDValue GprIndexPlusOne = DAG.getNode(ISD::ADD, dl, MVT::i32, GprIndex,
                                          DAG.getConstant(1, MVT::i32));
    // Align GprIndex to be even if it isn't
    GprIndex = DAG.getNode(ISD::SELECT, dl, MVT::i32, CC64, GprIndexPlusOne,
                           GprIndex);
  }

  // fpr index is 1 byte after gpr
  SDValue FprPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr,
                               DAG.getConstant(1, MVT::i32));

  // fpr
  SDValue FprIndex = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, InChain,
                                    FprPtr, MachinePointerInfo(SV), MVT::i8,
                                    false, false, 0);
  InChain = FprIndex.getValue(1);

  SDValue RegSaveAreaPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr,
                                       DAG.getConstant(8, MVT::i32));

  SDValue OverflowAreaPtr = DAG.getNode(ISD::ADD, dl, PtrVT, VAListPtr,
                                        DAG.getConstant(4, MVT::i32));

  // areas
  SDValue OverflowArea = DAG.getLoad(MVT::i32, dl, InChain, OverflowAreaPtr,
                                     MachinePointerInfo(), false, false,
                                     false, 0);
  InChain = OverflowArea.getValue(1);

  SDValue RegSaveArea = DAG.getLoad(MVT::i32, dl, InChain, RegSaveAreaPtr,
                                    MachinePointerInfo(), false, false,
                                    false, 0);
  InChain = RegSaveArea.getValue(1);

  // select overflow_area if index >= 8
  SDValue CC = DAG.getSetCC(dl, MVT::i32, VT.isInteger() ? GprIndex : FprIndex,
                            DAG.getConstant(8, MVT::i32), ISD::SETLT);

  // adjustment constant gpr_index * 4/8
  SDValue RegConstant = DAG.getNode(ISD::MUL, dl, MVT::i32,
                                    VT.isInteger() ? GprIndex : FprIndex,
                                    DAG.getConstant(VT.isInteger() ? 4 : 8,
                                                    MVT::i32));

  // OurReg = RegSaveArea + RegConstant
  SDValue OurReg = DAG.getNode(ISD::ADD, dl, PtrVT, RegSaveArea,
                               RegConstant);

  // Floating types are 32 bytes into RegSaveArea
  if (VT.isFloatingPoint())
    OurReg = DAG.getNode(ISD::ADD, dl, PtrVT, OurReg,
                         DAG.getConstant(32, MVT::i32));

  // increase {f,g}pr_index by 1 (or 2 if VT is i64)
  SDValue IndexPlus1 = DAG.getNode(ISD::ADD, dl, MVT::i32,
                                   VT.isInteger() ? GprIndex : FprIndex,
                                   DAG.getConstant(VT == MVT::i64 ? 2 : 1,
                                                   MVT::i32));

  InChain = DAG.getTruncStore(InChain, dl, IndexPlus1,
                              VT.isInteger() ? VAListPtr : FprPtr,
                              MachinePointerInfo(SV),
                              MVT::i8, false, false, 0);

  // determine if we should load from reg_save_area or overflow_area
  SDValue Result = DAG.getNode(ISD::SELECT, dl, PtrVT, CC, OurReg,
                               OverflowArea);

  // increase overflow_area by 4/8 if gpr/fpr index >= 8
  SDValue OverflowAreaPlusN = DAG.getNode(ISD::ADD, dl, PtrVT, OverflowArea,
                                          DAG.getConstant(VT.isInteger() ? 4 : 8,
                                                          MVT::i32));

  OverflowArea = DAG.getNode(ISD::SELECT, dl, MVT::i32, CC, OverflowArea,
                             OverflowAreaPlusN);

  InChain = DAG.getTruncStore(InChain, dl, OverflowArea,
                              OverflowAreaPtr,
                              MachinePointerInfo(),
                              MVT::i32, false, false, 0);

  return DAG.getLoad(VT, dl, InChain, Result, MachinePointerInfo(),
                     false, false, false, 0);
}

SDValue PPCTargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG,
                                       const PPCSubtarget &Subtarget) const {
  assert(!Subtarget.isPPC64() && "LowerVACOPY is PPC32 only");

  // We have to copy the entire va_list struct:
  // 2*sizeof(char) + 2 bytes of alignment padding + 2*sizeof(char*) = 12 bytes
  return DAG.getMemcpy(Op.getOperand(0), Op,
                       Op.getOperand(1), Op.getOperand(2),
                       DAG.getConstant(12, MVT::i32), 8, false, true,
                       MachinePointerInfo(), MachinePointerInfo());
}

SDValue PPCTargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
                                                  SelectionDAG &DAG) const {
  return Op.getOperand(0);
}

SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  SDValue Trmp = Op.getOperand(1); // trampoline
  SDValue FPtr = Op.getOperand(2); // nested function
  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
  SDLoc dl(Op);

  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
  bool isPPC64 = (PtrVT == MVT::i64);
  Type *IntPtrTy =
    DAG.getTargetLoweringInfo().getDataLayout()->getIntPtrType(
                                                           *DAG.getContext());

  TargetLowering::ArgListTy Args;
  TargetLowering::ArgListEntry Entry;

  Entry.Ty = IntPtrTy;
  Entry.Node = Trmp; Args.push_back(Entry);

  // TrampSize == (isPPC64 ? 48 : 40);
  Entry.Node = DAG.getConstant(isPPC64 ? 48 : 40,
                               isPPC64 ? MVT::i64 : MVT::i32);
  Args.push_back(Entry);

  Entry.Node = FPtr; Args.push_back(Entry);
  Entry.Node = Nest; Args.push_back(Entry);

  // Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg)
  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl).setChain(Chain)
    .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
               DAG.getExternalSymbol("__trampoline_setup", PtrVT),
               std::move(Args), 0);

  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
  return CallResult.second;
}

SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG,
                                        const PPCSubtarget &Subtarget) const {
  MachineFunction &MF = DAG.getMachineFunction();
  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();

  SDLoc dl(Op);

  if (Subtarget.isDarwinABI() || Subtarget.isPPC64()) {
    // vastart just stores the address of the VarArgsFrameIndex slot into the
    // memory location argument.
    EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
    SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
    const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
    return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
                        MachinePointerInfo(SV),
                        false, false, 0);
  }

  // For the 32-bit SVR4 ABI we follow the layout of the va_list struct.
  // We suppose the given va_list is already allocated.
  //
  //   typedef struct {
  //     char gpr;     /* index into the array of 8 GPRs
  //                    * stored in the register save area
  //                    * gpr=0 corresponds to r3,
  //                    * gpr=1 to r4, etc.
  //                    */
  //     char fpr;     /* index into the array of 8 FPRs
  //                    * stored in the register save area
  //                    * fpr=0 corresponds to f1,
  //                    * fpr=1 to f2, etc.
  //                    */
  //     char *overflow_arg_area;
  //                   /* location on stack that holds
  //                    * the next overflow argument
  //                    */
  //     char *reg_save_area;
  //                   /* where r3:r10 and f1:f8 (if saved)
  //                    * are stored
  //                    */
  //   } va_list[1];

  SDValue ArgGPR = DAG.getConstant(FuncInfo->getVarArgsNumGPR(), MVT::i32);
  SDValue ArgFPR = DAG.getConstant(FuncInfo->getVarArgsNumFPR(), MVT::i32);

  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();

  SDValue StackOffsetFI = DAG.getFrameIndex(FuncInfo->getVarArgsStackOffset(),
                                            PtrVT);
  SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(),
                                 PtrVT);

  uint64_t FrameOffset = PtrVT.getSizeInBits()/8;
  SDValue ConstFrameOffset = DAG.getConstant(FrameOffset, PtrVT);

  uint64_t StackOffset = PtrVT.getSizeInBits()/8 - 1;
  SDValue ConstStackOffset = DAG.getConstant(StackOffset, PtrVT);

  uint64_t FPROffset = 1;
  SDValue ConstFPROffset = DAG.getConstant(FPROffset, PtrVT);
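
  // (Offset bookkeeping, for reference: nextOffset below starts at 1, the
  //  fpr byte; it then advances by StackOffset == 3 to reach
  //  overflow_arg_area at byte 4, and by FrameOffset == 4 to reach
  //  reg_save_area at byte 8, matching the struct layout above.)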

  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();

  // Store first byte : number of int regs
  SDValue firstStore = DAG.getTruncStore(Op.getOperand(0), dl, ArgGPR,
                                         Op.getOperand(1),
                                         MachinePointerInfo(SV),
                                         MVT::i8, false, false, 0);
  uint64_t nextOffset = FPROffset;
  SDValue nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, Op.getOperand(1),
                                ConstFPROffset);

  // Store second byte : number of float regs
  SDValue secondStore =
    DAG.getTruncStore(firstStore, dl, ArgFPR, nextPtr,
                      MachinePointerInfo(SV, nextOffset), MVT::i8,
                      false, false, 0);
  nextOffset += StackOffset;
  nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, nextPtr, ConstStackOffset);

  // Store second word : arguments given on stack
  SDValue thirdStore =
    DAG.getStore(secondStore, dl, StackOffsetFI, nextPtr,
                 MachinePointerInfo(SV, nextOffset),
                 false, false, 0);
  nextOffset += FrameOffset;
  nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, nextPtr, ConstFrameOffset);

  // Store third word : arguments given in registers
  return DAG.getStore(thirdStore, dl, FR, nextPtr,
                      MachinePointerInfo(SV, nextOffset),
                      false, false, 0);
}

#include "PPCGenCallingConv.inc"

// Function whose sole purpose is to kill compiler warnings
// stemming from unused functions included from PPCGenCallingConv.inc.
CCAssignFn *PPCTargetLowering::useFastISelCCs(unsigned Flag) const {
  return Flag ? CC_PPC64_ELF_FIS : RetCC_PPC64_ELF_FIS;
}

bool llvm::CC_PPC32_SVR4_Custom_Dummy(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
                                      CCValAssign::LocInfo &LocInfo,
                                      ISD::ArgFlagsTy &ArgFlags,
                                      CCState &State) {
  return true;
}

bool llvm::CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT,
                                             MVT &LocVT,
                                             CCValAssign::LocInfo &LocInfo,
                                             ISD::ArgFlagsTy &ArgFlags,
                                             CCState &State) {
  static const MCPhysReg ArgRegs[] = {
    PPC::R3, PPC::R4, PPC::R5, PPC::R6,
    PPC::R7, PPC::R8, PPC::R9, PPC::R10,
  };
  const unsigned NumArgRegs = array_lengthof(ArgRegs);

  unsigned RegNum = State.getFirstUnallocated(ArgRegs, NumArgRegs);

  // Skip one register if the first unallocated register has an even register
  // number and there are still argument registers available which have not
  // been allocated yet. RegNum is actually an index into ArgRegs, which means
  // we need to skip a register if RegNum is odd.
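  // (Rationale, as this reads: the 32-bit SVR4 ABI passes i64 values in
  //  adjacent GPR pairs starting at an odd-numbered register (r3, r5, ...),
  //  and those registers sit at even indices of ArgRegs, so an odd RegNum
  //  would leave the next pair misaligned.)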
  if (RegNum != NumArgRegs && RegNum % 2 == 1) {
    State.AllocateReg(ArgRegs[RegNum]);
  }

  // Always return false here, as this function only makes sure that the first
  // unallocated register has an odd register number and does not actually
  // allocate a register for the current argument.
  return false;
}

bool llvm::CC_PPC32_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT,
                                               MVT &LocVT,
                                               CCValAssign::LocInfo &LocInfo,
                                               ISD::ArgFlagsTy &ArgFlags,
                                               CCState &State) {
  static const MCPhysReg ArgRegs[] = {
    PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7,
    PPC::F8
  };

  const unsigned NumArgRegs = array_lengthof(ArgRegs);

  unsigned RegNum = State.getFirstUnallocated(ArgRegs, NumArgRegs);

  // If there is only one floating-point register left we need to put both f64
  // values of a split ppc_fp128 value on the stack.
  if (RegNum != NumArgRegs && ArgRegs[RegNum] == PPC::F8) {
    State.AllocateReg(ArgRegs[RegNum]);
  }

  // Always return false here, as this function only makes sure that the two
  // f64 values a ppc_fp128 value is split into are both passed in registers
  // or both passed on the stack and does not actually allocate a register
  // for the current argument.
  return false;
}

/// GetFPR - Get the set of FP registers that should be allocated for
/// arguments on Darwin.
static const MCPhysReg *GetFPR() {
  static const MCPhysReg FPR[] = {
    PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7,
    PPC::F8, PPC::F9, PPC::F10, PPC::F11, PPC::F12, PPC::F13
  };

  return FPR;
}

/// CalculateStackSlotSize - Calculates the size reserved for this argument on
/// the stack.
static unsigned CalculateStackSlotSize(EVT ArgVT, ISD::ArgFlagsTy Flags,
                                       unsigned PtrByteSize) {
  unsigned ArgSize = ArgVT.getStoreSize();
  if (Flags.isByVal())
    ArgSize = Flags.getByValSize();
  ArgSize = ((ArgSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
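  // (This is the usual round-up-to-multiple idiom: with PtrByteSize == 8, a
  //  13-byte byval argument reserves ((13 + 7) / 8) * 8 == 16 bytes.)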

  return ArgSize;
}

/// CalculateStackSlotAlignment - Calculates the alignment of this argument
/// on the stack.
static unsigned CalculateStackSlotAlignment(EVT ArgVT, ISD::ArgFlagsTy Flags,
                                            unsigned PtrByteSize) {
  unsigned Align = PtrByteSize;

  // Altivec parameters are padded to a 16 byte boundary.
  if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
      ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
      ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64)
    Align = 16;

  // ByVal parameters are aligned as requested.
  if (Flags.isByVal()) {
    unsigned BVAlign = Flags.getByValAlign();
    if (BVAlign > PtrByteSize) {
      if (BVAlign % PtrByteSize != 0)
        llvm_unreachable(
          "ByVal alignment is not a multiple of the pointer size");

      Align = BVAlign;
    }
  }

  return Align;
}

/// EnsureStackAlignment - Round stack frame size up from NumBytes to
/// ensure minimum alignment required for target.
static unsigned EnsureStackAlignment(const TargetMachine &Target,
                                     unsigned NumBytes) {
  unsigned TargetAlign = Target.getFrameLowering()->getStackAlignment();
  unsigned AlignMask = TargetAlign - 1;
  NumBytes = (NumBytes + AlignMask) & ~AlignMask;
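  // (Bit-mask round-up: TargetAlign is a power of two, so adding AlignMask
  //  and clearing the low bits rounds up; for a 16-byte aligned frame,
  //  (40 + 15) & ~15 == 48.)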
  return NumBytes;
}

SDValue
PPCTargetLowering::LowerFormalArguments(SDValue Chain,
                                        CallingConv::ID CallConv, bool isVarArg,
                                        const SmallVectorImpl<ISD::InputArg>
                                          &Ins,
                                        SDLoc dl, SelectionDAG &DAG,
                                        SmallVectorImpl<SDValue> &InVals)
                                          const {
  if (Subtarget.isSVR4ABI()) {
    if (Subtarget.isPPC64())
      return LowerFormalArguments_64SVR4(Chain, CallConv, isVarArg, Ins,
                                         dl, DAG, InVals);
    else
      return LowerFormalArguments_32SVR4(Chain, CallConv, isVarArg, Ins,
                                         dl, DAG, InVals);
  } else {
    return LowerFormalArguments_Darwin(Chain, CallConv, isVarArg, Ins,
                                       dl, DAG, InVals);
  }
}

SDValue
PPCTargetLowering::LowerFormalArguments_32SVR4(
                                      SDValue Chain,
                                      CallingConv::ID CallConv, bool isVarArg,
                                      const SmallVectorImpl<ISD::InputArg>
                                        &Ins,
                                      SDLoc dl, SelectionDAG &DAG,
                                      SmallVectorImpl<SDValue> &InVals) const {

  // 32-bit SVR4 ABI Stack Frame Layout:
  //              +-----------------------------------+
  //        +-->  |            Back chain             |
  //        |     +-----------------------------------+
  //        |     | Floating-point register save area |
  //        |     +-----------------------------------+
  //        |     |    General register save area     |
  //        |     +-----------------------------------+
  //        |     |          CR save word             |
  //        |     +-----------------------------------+
  //        |     |         VRSAVE save word          |
  //        |     +-----------------------------------+
  //        |     |         Alignment padding         |
  //        |     +-----------------------------------+
  //        |     |     Vector register save area     |
  //        |     +-----------------------------------+
  //        |     |       Local variable space        |
  //        |     +-----------------------------------+
  //        |     |        Parameter list area        |
  //        |     +-----------------------------------+
  //        |     |           LR save word            |
  //        |     +-----------------------------------+
  // SP-->  +---  |            Back chain             |
  //              +-----------------------------------+
  //
  // Specifications:
  //   System V Application Binary Interface PowerPC Processor Supplement
  //   AltiVec Technology Programming Interface Manual
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo *MFI = MF.getFrameInfo();
  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();

  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
  // Potential tail calls could cause overwriting of argument stack slots.
  bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
                       (CallConv == CallingConv::Fast));
  unsigned PtrByteSize = 4;

  // Assign locations to all of the incoming arguments.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
                 getTargetMachine(), ArgLocs, *DAG.getContext());

  // Reserve space for the linkage area on the stack.
  unsigned LinkageSize = PPCFrameLowering::getLinkageSize(false, false);
  CCInfo.AllocateStack(LinkageSize, PtrByteSize);
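
  // (On this 32-bit SVR4 path the linkage area is the 8 bytes at the bottom
  //  of the frame, the back-chain word plus the LR save word shown in the
  //  diagram above, so incoming parameters start at offset 8.)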

  CCInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4);

  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    CCValAssign &VA = ArgLocs[i];

    // Arguments stored in registers.
    if (VA.isRegLoc()) {
      const TargetRegisterClass *RC;
      EVT ValVT = VA.getValVT();

      switch (ValVT.getSimpleVT().SimpleTy) {
        default:
          llvm_unreachable("ValVT not supported by formal arguments Lowering");
        case MVT::i1:
        case MVT::i32:
          RC = &PPC::GPRCRegClass;
          break;
        case MVT::f32:
          RC = &PPC::F4RCRegClass;
          break;
        case MVT::f64:
          if (Subtarget.hasVSX())
            RC = &PPC::VSFRCRegClass;
          else
            RC = &PPC::F8RCRegClass;
          break;
        case MVT::v16i8:
        case MVT::v8i16:
        case MVT::v4i32:
        case MVT::v4f32:
          RC = &PPC::VRRCRegClass;
          break;
        case MVT::v2f64:
        case MVT::v2i64:
          RC = &PPC::VSHRCRegClass;
          break;
      }

      // Transform the arguments stored in physical registers into virtual
      // ones.
      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
      SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg,
                                            ValVT == MVT::i1 ? MVT::i32
                                                             : ValVT);

      if (ValVT == MVT::i1)
        ArgValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, ArgValue);
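      // (Note: i1 arguments arrive promoted and occupy a full GPR per the
      //  calling convention; the truncate recovers the i1 value so it can be
      //  tracked as a condition-register bit from here on.)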

      InVals.push_back(ArgValue);
    } else {
      // Argument stored in memory.
      assert(VA.isMemLoc());

      unsigned ArgSize = VA.getLocVT().getStoreSize();
      int FI = MFI->CreateFixedObject(ArgSize, VA.getLocMemOffset(),
                                      isImmutable);

      // Create load nodes to retrieve arguments from the stack.
      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
      InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
                                   MachinePointerInfo(),
                                   false, false, false, 0));
    }
  }

  // Assign locations to all of the incoming aggregate by value arguments.
  // Aggregates passed by value are stored in the local variable space of the
  // caller's stack frame, right above the parameter list area.
  SmallVector<CCValAssign, 16> ByValArgLocs;
  CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(),
                      getTargetMachine(), ByValArgLocs, *DAG.getContext());

  // Reserve stack space for the allocations in CCInfo.
  CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize);

  CCByValInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4_ByVal);

  // Area that is at least reserved in the caller of this function.
  unsigned MinReservedArea = CCByValInfo.getNextStackOffset();
  MinReservedArea = std::max(MinReservedArea, LinkageSize);

  // Set the size that is at least reserved in the caller of this function.
  // A tail-call-optimized function's reserved stack space needs to be
  // aligned so that taking the difference between two stack areas will
  // result in an aligned stack.
  MinReservedArea = EnsureStackAlignment(MF.getTarget(), MinReservedArea);
  FuncInfo->setMinReservedArea(MinReservedArea);

  SmallVector<SDValue, 8> MemOps;

  // If the function takes a variable number of arguments, make a frame index
  // for the start of the first vararg value... for expansion of
  // llvm.va_start.
  if (isVarArg) {
    static const MCPhysReg GPArgRegs[] = {
      PPC::R3, PPC::R4, PPC::R5, PPC::R6,
      PPC::R7, PPC::R8, PPC::R9, PPC::R10,
    };
    const unsigned NumGPArgRegs = array_lengthof(GPArgRegs);

    static const MCPhysReg FPArgRegs[] = {
      PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7,
      PPC::F8
    };
    const unsigned NumFPArgRegs = array_lengthof(FPArgRegs);

    FuncInfo->setVarArgsNumGPR(CCInfo.getFirstUnallocated(GPArgRegs,
                                                          NumGPArgRegs));
    FuncInfo->setVarArgsNumFPR(CCInfo.getFirstUnallocated(FPArgRegs,
                                                          NumFPArgRegs));

    // Make room for NumGPArgRegs and NumFPArgRegs.
    int Depth = NumGPArgRegs * PtrVT.getSizeInBits()/8 +
                NumFPArgRegs * EVT(MVT::f64).getSizeInBits()/8;
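    // (Concretely, on this 32-bit path: 8 GPR words at 4 bytes plus 8 f64
    //  slots at 8 bytes, i.e. a 96-byte register save area.)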

    FuncInfo->setVarArgsStackOffset(
      MFI->CreateFixedObject(PtrVT.getSizeInBits()/8,
                             CCInfo.getNextStackOffset(), true));

    FuncInfo->setVarArgsFrameIndex(MFI->CreateStackObject(Depth, 8, false));
    SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);

    // The fixed integer arguments of a variadic function are stored to the
    // VarArgsFrameIndex on the stack so that they may be loaded by
    // dereferencing the result of va_next.
    for (unsigned GPRIndex = 0; GPRIndex != NumGPArgRegs; ++GPRIndex) {
      // Get an existing live-in vreg, or add a new one.
      unsigned VReg = MF.getRegInfo().getLiveInVirtReg(GPArgRegs[GPRIndex]);
      if (!VReg)
        VReg = MF.addLiveIn(GPArgRegs[GPRIndex], &PPC::GPRCRegClass);

      SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
      SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
                                   MachinePointerInfo(), false, false, 0);
      MemOps.push_back(Store);
      // Increment the address by four for the next argument to store
      SDValue PtrOff = DAG.getConstant(PtrVT.getSizeInBits()/8, PtrVT);
      FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
    }

    // FIXME 32-bit SVR4: We only need to save FP argument registers if CR
    // bit 6 is set.
    // The double arguments are stored to the VarArgsFrameIndex
    // on the stack.
    for (unsigned FPRIndex = 0; FPRIndex != NumFPArgRegs; ++FPRIndex) {
      // Get an existing live-in vreg, or add a new one.
      unsigned VReg = MF.getRegInfo().getLiveInVirtReg(FPArgRegs[FPRIndex]);
      if (!VReg)
        VReg = MF.addLiveIn(FPArgRegs[FPRIndex], &PPC::F8RCRegClass);

      SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::f64);
      SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
                                   MachinePointerInfo(), false, false, 0);
      MemOps.push_back(Store);
      // Increment the address by eight for the next argument to store
      SDValue PtrOff = DAG.getConstant(EVT(MVT::f64).getSizeInBits()/8,
                                       PtrVT);
      FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
    }
  }

  if (!MemOps.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);

  return Chain;
}

// PPC64 passes i8, i16, and i32 values in i64 registers. Promote the value
// to MVT::i64 and then truncate to the correct register size.
SDValue
PPCTargetLowering::extendArgForPPC64(ISD::ArgFlagsTy Flags, EVT ObjectVT,
                                     SelectionDAG &DAG, SDValue ArgVal,
                                     SDLoc dl) const {
  if (Flags.isSExt())
    ArgVal = DAG.getNode(ISD::AssertSext, dl, MVT::i64, ArgVal,
                         DAG.getValueType(ObjectVT));
  else if (Flags.isZExt())
    ArgVal = DAG.getNode(ISD::AssertZext, dl, MVT::i64, ArgVal,
                         DAG.getValueType(ObjectVT));
|
2013-05-18 00:21:46 +00:00
|
|
|
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@202451 91177308-0d34-0410-b5e6-96231b3b80d8
2014-02-28 00:27:01 +00:00
|
|
|
return DAG.getNode(ISD::TRUNCATE, dl, ObjectVT, ArgVal);
|
2012-10-23 15:51:16 +00:00
|
|
|
}
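What the AssertSext/AssertZext plus TRUNCATE sequence expresses can be mimicked in plain C++; the following is an analogy only, not DAG code: the 64-bit value is asserted to already carry a correct extension of the narrow type, so the truncation is free.

#include <cassert>
#include <cstdint>

int32_t truncAssertSExt(int64_t ArgVal) {
  // AssertSext MVT::i32: the high bits must replicate the sign bit.
  assert(ArgVal == (int64_t)(int32_t)ArgVal);
  return (int32_t)ArgVal; // ISD::TRUNCATE
}

uint32_t truncAssertZExt(uint64_t ArgVal) {
  // AssertZext MVT::i32: the high bits must be zero.
  assert(ArgVal == (uint64_t)(uint32_t)ArgVal);
  return (uint32_t)ArgVal; // ISD::TRUNCATE
}

int main() { return truncAssertSExt(-1) + (int)truncAssertZExt(1); }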
|
|
|
|
|
2008-07-27 21:46:04 +00:00
|
|
|
SDValue
|
2012-10-05 21:27:08 +00:00
|
|
|
PPCTargetLowering::LowerFormalArguments_64SVR4(
|
|
|
|
SDValue Chain,
|
|
|
|
CallingConv::ID CallConv, bool isVarArg,
|
|
|
|
const SmallVectorImpl<ISD::InputArg>
|
|
|
|
&Ins,
|
2013-05-25 02:42:55 +00:00
|
|
|
SDLoc dl, SelectionDAG &DAG,
|
2012-10-05 21:27:08 +00:00
|
|
|
SmallVectorImpl<SDValue> &InVals) const {
|
|
|
|
// TODO: add description of PPC stack frame format, or at least some docs.
|
|
|
|
//
|
2014-06-20 16:34:05 +00:00
|
|
|
bool isLittleEndian = Subtarget.isLittleEndian();
|
2012-10-05 21:27:08 +00:00
|
|
|
MachineFunction &MF = DAG.getMachineFunction();
|
|
|
|
MachineFrameInfo *MFI = MF.getFrameInfo();
|
|
|
|
PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
|
|
|
|
|
|
|
|
EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
|
|
|
|
// Potential tail calls could cause overwriting of argument stack slots.
|
|
|
|
bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
|
|
|
|
(CallConv == CallingConv::Fast));
|
|
|
|
unsigned PtrByteSize = 8;
|
|
|
|
|
2014-06-23 14:15:53 +00:00
|
|
|
unsigned LinkageSize = PPCFrameLowering::getLinkageSize(true, false);
|
|
|
|
unsigned ArgOffset = LinkageSize;
|
2012-10-05 21:27:08 +00:00
|
|
|
|
2014-04-04 05:16:06 +00:00
|
|
|
static const MCPhysReg GPR[] = {
|
2012-10-05 21:27:08 +00:00
|
|
|
PPC::X3, PPC::X4, PPC::X5, PPC::X6,
|
|
|
|
PPC::X7, PPC::X8, PPC::X9, PPC::X10,
|
|
|
|
};
|
|
|
|
|
2014-04-04 05:16:06 +00:00
|
|
|
static const MCPhysReg *FPR = GetFPR();
|
2012-10-05 21:27:08 +00:00
|
|
|
|
2014-04-04 05:16:06 +00:00
|
|
|
static const MCPhysReg VR[] = {
|
2012-10-05 21:27:08 +00:00
|
|
|
PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
|
|
|
|
PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
|
|
|
|
};
|
2014-04-04 05:16:06 +00:00
|
|
|
static const MCPhysReg VSRH[] = {
|
2014-03-28 19:58:11 +00:00
|
|
|
PPC::VSH2, PPC::VSH3, PPC::VSH4, PPC::VSH5, PPC::VSH6, PPC::VSH7, PPC::VSH8,
|
|
|
|
PPC::VSH9, PPC::VSH10, PPC::VSH11, PPC::VSH12, PPC::VSH13
|
|
|
|
};
|
2012-10-05 21:27:08 +00:00
|
|
|
|
|
|
|
const unsigned Num_GPR_Regs = array_lengthof(GPR);
|
|
|
|
const unsigned Num_FPR_Regs = 13;
|
|
|
|
const unsigned Num_VR_Regs = array_lengthof(VR);
|
|
|
|
|
2014-07-07 19:26:41 +00:00
|
|
|
unsigned GPR_idx, FPR_idx = 0, VR_idx = 0;
|
2012-10-05 21:27:08 +00:00
|
|
|
|
|
|
|
// Add DAG nodes to load the arguments or copy them out of registers. On
|
|
|
|
// entry to a function on PPC, the arguments start after the linkage area,
|
|
|
|
// although the first ones are often in registers.
|
|
|
|
|
|
|
|
SmallVector<SDValue, 8> MemOps;
|
|
|
|
Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin();
|
2013-02-20 17:31:41 +00:00
|
|
|
unsigned CurArgIdx = 0;
|
|
|
|
for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {
|
2012-10-05 21:27:08 +00:00
|
|
|
SDValue ArgVal;
|
|
|
|
bool needsLoad = false;
|
|
|
|
EVT ObjectVT = Ins[ArgNo].VT;
|
Add CR-bit tracking to the PowerPC backend for i1 values
2014-02-28 00:27:01 +00:00
|
|
|
unsigned ObjSize = ObjectVT.getStoreSize();
|
2012-10-05 21:27:08 +00:00
|
|
|
unsigned ArgSize = ObjSize;
|
|
|
|
ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
|
2013-02-20 17:31:41 +00:00
|
|
|
std::advance(FuncArg, Ins[ArgNo].OrigArgIndex - CurArgIdx);
|
|
|
|
CurArgIdx = Ins[ArgNo].OrigArgIndex;
|
2012-10-05 21:27:08 +00:00
|
|
|
|
2014-07-07 19:26:41 +00:00
|
|
|
/* Respect alignment of argument on the stack. */
|
|
|
|
unsigned Align =
|
|
|
|
CalculateStackSlotAlignment(ObjectVT, Flags, PtrByteSize);
|
|
|
|
ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;
|
2012-10-05 21:27:08 +00:00
|
|
|
unsigned CurArgOffset = ArgOffset;
|
|
|
|
|
2014-07-07 19:26:41 +00:00
|
|
|
/* Compute GPR index associated with argument offset. */
|
|
|
|
GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
|
|
|
|
GPR_idx = std::min(GPR_idx, Num_GPR_Regs);
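Both computations above are simple integer arithmetic; here is a standalone sketch with illustrative values (48 bytes is the 64-bit SVR4 linkage area):

#include <algorithm>
#include <cassert>

int main() {
  const unsigned PtrByteSize = 8, LinkageSize = 48, Num_GPR_Regs = 8;
  unsigned ArgOffset = LinkageSize + 4; // a 4-byte argument was laid out
  unsigned Align = 8;                   // CalculateStackSlotAlignment(...)
  ArgOffset = ((ArgOffset + Align - 1) / Align) * Align; // round up
  assert(ArgOffset == 56);
  unsigned GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
  GPR_idx = std::min(GPR_idx, Num_GPR_Regs); // clamp once GPRs run out
  assert(GPR_idx == 1); // this slot corresponds to the second GPR, X4
  return 0;
}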
|
2012-10-05 21:27:08 +00:00
|
|
|
|
|
|
|
// FIXME the codegen can be much improved in some cases.
|
|
|
|
// We do not have to keep everything in memory.
|
|
|
|
if (Flags.isByVal()) {
|
|
|
|
// ObjSize is the true size, ArgSize rounded up to multiple of registers.
|
|
|
|
ObjSize = Flags.getByValSize();
|
|
|
|
ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
|
2012-10-31 01:15:05 +00:00
|
|
|
// Empty aggregate parameters do not take up registers. Examples:
|
|
|
|
// struct { } a;
|
|
|
|
// union { } b;
|
|
|
|
// int c[0];
|
|
|
|
// etc. However, we have to provide a place-holder in InVals, so
|
|
|
|
// pretend we have an 8-byte item at the current address for that
|
|
|
|
// purpose.
|
|
|
|
if (!ObjSize) {
|
|
|
|
int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset, true);
|
|
|
|
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
|
|
|
|
InVals.push_back(FIN);
|
|
|
|
continue;
|
|
|
|
}
|
2013-09-12 23:20:06 +00:00
|
|
|
|
2012-10-05 21:27:08 +00:00
|
|
|
// All aggregates smaller than 8 bytes must be passed right-justified.
|
2014-06-20 16:34:05 +00:00
|
|
|
if (ObjSize < PtrByteSize && !isLittleEndian)
|
This patch addresses PR13949.
For the PowerPC 64-bit ELF Linux ABI, aggregates of size less than 8
bytes are to be passed in the low-order bits ("right-adjusted") of the
doubleword register or memory slot assigned to them. A previous patch
addressed this for aggregates passed in registers. However, small
aggregates passed in the overflow portion of the parameter save area are
still being passed left-adjusted.
The fix is made in PPCTargetLowering::LowerCall_Darwin_Or_64SVR4 on the
caller side, and in PPCTargetLowering::LowerFormalArguments_64SVR4 on
the callee side. The main fix on the callee side simply extends
existing logic for 1- and 2-byte objects to 1- through 7-byte objects,
and corrects a constant left over from 32-bit code. There is also a
fix to a bogus calculation of the offset to the following argument in
the parameter save area.
On the caller side, again a constant left over from 32-bit code is
fixed. Additionally, some code for 1, 2, and 4-byte objects is
duplicated to handle the 3, 5, 6, and 7-byte objects for SVR4 only. The
LowerCall_Darwin_Or_64SVR4 logic is getting fairly convoluted trying to
handle both ABIs, and I propose to separate this into two functions in a
future patch, at which time the duplication can be removed.
The patch adds a new test (structsinmem.ll) to demonstrate correct
passing of structures of all seven sizes. Eight dummy parameters are
used to force these structures to be in the overflow portion of the
parameter save area.
As a side effect, this corrects the case when aggregates passed in
registers are saved into the first eight doublewords of the parameter
save area: Previously they were stored left-justified, and now are
properly stored right-justified. This requires changing the expected
output of existing test case structsinregs.ll.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166022 91177308-0d34-0410-b5e6-96231b3b80d8
2012-10-16 13:30:53 +00:00
|
|
|
CurArgOffset = CurArgOffset + (PtrByteSize - ObjSize);
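The right-justification itself is a one-line offset bump; a minimal standalone sketch (illustrative helper, big-endian layout as in the code above):

#include <cassert>

unsigned rightJustify(unsigned SlotOffset, unsigned ObjSize,
                      unsigned PtrByteSize = 8) {
  // Small aggregates occupy the low-order bytes of their doubleword.
  return ObjSize < PtrByteSize ? SlotOffset + (PtrByteSize - ObjSize)
                               : SlotOffset;
}

int main() {
  assert(rightJustify(48, 3) == 53); // 3-byte struct: skip 5 pad bytes
  assert(rightJustify(48, 8) == 48); // full doubleword: unchanged
  return 0;
}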
|
2012-10-05 21:27:08 +00:00
|
|
|
// The value of the object is its address.
|
|
|
|
int FI = MFI->CreateFixedObject(ObjSize, CurArgOffset, true);
|
|
|
|
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
|
|
|
|
InVals.push_back(FIN);
|
This patch addresses a PPC64 ELF issue with passing parameters consisting of
structs having size 3, 5, 6, or 7. Such a struct must be passed and received
as right-justified within its register or memory slot. The problem is only
present for structs that are passed in registers.
Previously, as part of a patch handling all structs of size less than 8, I
added logic to rotate the incoming register so that the struct was left-
justified prior to storing the whole register. This was incorrect because
the address of the parameter had already been adjusted earlier to point to
the right-adjusted value in the storage slot. Essentially I had accidentally
accounted for the right-adjustment twice.
In this patch, I removed the incorrect logic and reorganized the code to make
the flow clearer.
The removal of the rotates changes the expected code generation, so test case
structsinregs.ll has been modified to reflect this. I also added a new test
case, jaggedstructs.ll, to demonstrate that structs of these sizes can now
be properly received and passed.
I've built and tested the code on powerpc64-unknown-linux-gnu with no new
regressions. I also ran the GCC compatibility test suite and verified that
earlier problems with these structs are now resolved, with no new regressions.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166680 91177308-0d34-0410-b5e6-96231b3b80d8
2012-10-25 13:38:09 +00:00
|
|
|
|
|
|
|
if (ObjSize < 8) {
|
2012-10-05 21:27:08 +00:00
|
|
|
if (GPR_idx != Num_GPR_Regs) {
|
This patch addresses a PPC64 ELF issue with passing parameters consisting of structs having size 3, 5, 6, or 7.
2012-10-25 13:38:09 +00:00
|
|
|
unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
|
2012-10-05 21:27:08 +00:00
|
|
|
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
|
This patch addresses a PPC64 ELF issue with passing parameters consisting of structs having size 3, 5, 6, or 7.
2012-10-25 13:38:09 +00:00
|
|
|
SDValue Store;
|
|
|
|
|
|
|
|
if (ObjSize==1 || ObjSize==2 || ObjSize==4) {
|
|
|
|
EVT ObjType = (ObjSize == 1 ? MVT::i8 :
|
|
|
|
(ObjSize == 2 ? MVT::i16 : MVT::i32));
|
|
|
|
Store = DAG.getTruncStore(Val.getValue(1), dl, Val, FIN,
|
2014-01-21 20:15:58 +00:00
|
|
|
MachinePointerInfo(FuncArg),
|
This patch addresses a PPC64 ELF issue with passing parameters consisting of structs having size 3, 5, 6, or 7.
2012-10-25 13:38:09 +00:00
|
|
|
ObjType, false, false, 0);
|
|
|
|
} else {
|
|
|
|
// For sizes that don't fit a truncating store (3, 5, 6, 7),
|
|
|
|
// store the whole register as-is to the parameter save area
|
|
|
|
// slot. The address of the parameter was already calculated
|
|
|
|
// above (InVals.push_back(FIN)) to be the right-justified
|
|
|
|
// offset within the slot. For this store, we need a new
|
|
|
|
// frame index that points at the beginning of the slot.
|
|
|
|
int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset, true);
|
|
|
|
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
|
|
|
|
Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
|
2014-01-21 20:15:58 +00:00
|
|
|
MachinePointerInfo(FuncArg),
|
This patch addresses a PPC64 ELF issue with passing parameters consisting of structs having size 3, 5, 6, or 7.
2012-10-25 13:38:09 +00:00
|
|
|
false, false, 0);
|
|
|
|
}
|
|
|
|
|
2012-10-05 21:27:08 +00:00
|
|
|
MemOps.push_back(Store);
|
|
|
|
}
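The store-width selection above reduces to: sizes with a matching integer type get a truncating store at the right-justified address, and the jagged sizes store the whole register at the slot base. A sketch of just that mapping (illustrative helper, not LLVM code):

#include <cassert>

unsigned storeWidthBytes(unsigned ObjSize) {
  switch (ObjSize) {
  case 1: return 1;  // truncating store as MVT::i8
  case 2: return 2;  // MVT::i16
  case 4: return 4;  // MVT::i32
  default: return 8; // 3, 5, 6, 7: store the full register
  }
}

int main() {
  assert(storeWidthBytes(2) == 2);
  assert(storeWidthBytes(6) == 8);
  return 0;
}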
|
This patch addresses a PPC64 ELF issue with passing parameters consisting of structs having size 3, 5, 6, or 7.
2012-10-25 13:38:09 +00:00
|
|
|
// Whether we copied from a register or not, advance the offset
|
|
|
|
// into the parameter save area by a full doubleword.
|
2012-10-05 21:27:08 +00:00
|
|
|
ArgOffset += PtrByteSize;
|
|
|
|
continue;
|
|
|
|
}
|
This patch addresses a PPC64 ELF issue with passing parameters consisting of structs having size 3, 5, 6, or 7.
2012-10-25 13:38:09 +00:00
|
|
|
|
2012-10-05 21:27:08 +00:00
|
|
|
for (unsigned j = 0; j < ArgSize; j += PtrByteSize) {
|
|
|
|
// Store whatever pieces of the object are in registers
|
|
|
|
// to memory. ArgOffset will be the address of the beginning
|
|
|
|
// of the object.
|
|
|
|
if (GPR_idx != Num_GPR_Regs) {
|
|
|
|
unsigned VReg;
|
|
|
|
VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
|
|
|
|
int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset, true);
|
|
|
|
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
|
|
|
|
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
|
This patch addresses a PPC64 ELF issue with passing parameters consisting of structs having size 3, 5, 6, or 7.
2012-10-25 13:38:09 +00:00
|
|
|
SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
|
2014-01-21 20:15:58 +00:00
|
|
|
MachinePointerInfo(FuncArg, j),
|
2012-10-05 21:27:08 +00:00
|
|
|
false, false, 0);
|
|
|
|
MemOps.push_back(Store);
|
|
|
|
++GPR_idx;
|
|
|
|
ArgOffset += PtrByteSize;
|
|
|
|
} else {
|
This patch addresses PR13949.
2012-10-16 13:30:53 +00:00
|
|
|
ArgOffset += ArgSize - j;
|
2012-10-05 21:27:08 +00:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
continue;
|
|
|
|
}
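The number of doubleword pieces the copy loop walks follows directly from the round-up of ObjSize; a standalone sketch with an example value:

#include <cassert>

int main() {
  const unsigned PtrByteSize = 8;
  unsigned ObjSize = 13; // e.g. a 13-byte struct passed byval
  unsigned ArgSize =
      ((ObjSize + PtrByteSize - 1) / PtrByteSize) * PtrByteSize;
  assert(ArgSize == 16);
  assert(ArgSize / PtrByteSize == 2); // two GPRs/doublewords to copy
  return 0;
}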
|
|
|
|
|
|
|
|
switch (ObjectVT.getSimpleVT().SimpleTy) {
|
|
|
|
default: llvm_unreachable("Unhandled argument type!");
|
Add CR-bit tracking to the PowerPC backend for i1 values
2014-02-28 00:27:01 +00:00
|
|
|
case MVT::i1:
|
2012-10-05 21:27:08 +00:00
|
|
|
case MVT::i32:
|
|
|
|
case MVT::i64:
|
|
|
|
if (GPR_idx != Num_GPR_Regs) {
|
|
|
|
unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
|
|
|
|
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
|
|
|
|
|
Add CR-bit tracking to the PowerPC backend for i1 values
2014-02-28 00:27:01 +00:00
|
|
|
if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1)
|
2012-10-05 21:27:08 +00:00
|
|
|
// PPC64 passes i8, i16, and i32 values in i64 registers. Promote
|
|
|
|
// value to MVT::i64 and then truncate to the correct register size.
|
2012-10-23 15:51:16 +00:00
|
|
|
ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);
|
2012-10-05 21:27:08 +00:00
|
|
|
} else {
|
|
|
|
needsLoad = true;
|
|
|
|
ArgSize = PtrByteSize;
|
|
|
|
}
|
|
|
|
ArgOffset += 8;
|
|
|
|
break;
|
|
|
|
|
|
|
|
case MVT::f32:
|
|
|
|
case MVT::f64:
|
|
|
|
if (FPR_idx != Num_FPR_Regs) {
|
|
|
|
unsigned VReg;
|
|
|
|
|
|
|
|
if (ObjectVT == MVT::f32)
|
|
|
|
VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F4RCRegClass);
|
|
|
|
else
|
2014-06-12 22:38:18 +00:00
|
|
|
VReg = MF.addLiveIn(FPR[FPR_idx], Subtarget.hasVSX() ?
|
2014-03-29 05:29:01 +00:00
|
|
|
&PPC::VSFRCRegClass :
|
|
|
|
&PPC::F8RCRegClass);
|
2012-10-05 21:27:08 +00:00
|
|
|
|
|
|
|
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
|
|
|
|
++FPR_idx;
|
|
|
|
} else {
|
|
|
|
needsLoad = true;
|
2012-10-11 15:38:20 +00:00
|
|
|
ArgSize = PtrByteSize;
|
2012-10-05 21:27:08 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
ArgOffset += 8;
|
|
|
|
break;
|
|
|
|
case MVT::v4f32:
|
|
|
|
case MVT::v4i32:
|
|
|
|
case MVT::v8i16:
|
|
|
|
case MVT::v16i8:
|
[PowerPC] Initial support for the VSX instruction set
VSX is an ISA extension supported on the POWER7 and later cores that enhances
floating-point vector and scalar capabilities. Among other things, this adds
<2 x double> support and generally helps to reduce register pressure.
The interesting part of this ISA feature is the register configuration: there
are 64 new 128-bit vector registers, the first 32 of which are super-registers of the
existing 32 scalar floating-point registers, and the second 32 of which overlap
with the 32 Altivec vector registers. This makes things like vector insertion
and extraction tricky: this can be free but only if we force a restriction to
the right register subclass when needed. A new "minipass" PPCVSXCopy takes care
of this (although it could do a more-optimal job of it; see the comment about
unnecessary copies below).
Please note that, currently, VSX is not enabled by default when targeting
anything because it is not yet ready for that. The assembler and disassembler
are fully implemented and tested. However:
- CodeGen support causes miscompiles; test-suite runtime failures:
MultiSource/Benchmarks/FreeBench/distray/distray
MultiSource/Benchmarks/McCat/08-main/main
MultiSource/Benchmarks/Olden/voronoi/voronoi
MultiSource/Benchmarks/mafft/pairlocalalign
MultiSource/Benchmarks/tramp3d-v4/tramp3d-v4
SingleSource/Benchmarks/CoyoteBench/almabench
SingleSource/Benchmarks/Misc/matmul_f64_4x4
- The lowering currently falls back to using Altivec instructions far more
than it should. Worse, there are some things that are scalarized through the
stack that shouldn't be.
- A lot of unnecessary copies make it past the optimizers, and this needs to
be fixed.
- Many more regression tests are needed.
Normally, I'd fix these things prior to committing, but there are some
students and other contributors who would like to work on this, and so it makes
sense to move this development process upstream where it can be subject to the
regular code-review procedures.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@203768 91177308-0d34-0410-b5e6-96231b3b80d8
2014-03-13 07:58:58 +00:00
|
|
|
case MVT::v2f64:
|
2014-03-26 16:12:58 +00:00
|
|
|
case MVT::v2i64:
|
2012-10-05 21:27:08 +00:00
|
|
|
if (VR_idx != Num_VR_Regs) {
|
2014-03-28 19:58:11 +00:00
|
|
|
unsigned VReg = (ObjectVT == MVT::v2f64 || ObjectVT == MVT::v2i64) ?
|
|
|
|
MF.addLiveIn(VSRH[VR_idx], &PPC::VSHRCRegClass) :
|
|
|
|
MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass);
|
2012-10-05 21:27:08 +00:00
|
|
|
ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
|
|
|
|
++VR_idx;
|
|
|
|
} else {
|
|
|
|
needsLoad = true;
|
|
|
|
}
|
2014-06-23 12:36:34 +00:00
|
|
|
ArgOffset += 16;
|
2012-10-05 21:27:08 +00:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
// We need to load the argument to a virtual register if we determined
|
|
|
|
// above that we ran out of physical registers of the appropriate type.
|
|
|
|
if (needsLoad) {
|
2014-06-20 16:34:05 +00:00
|
|
|
if (ObjSize < ArgSize && !isLittleEndian)
|
|
|
|
CurArgOffset += ArgSize - ObjSize;
|
|
|
|
int FI = MFI->CreateFixedObject(ObjSize, CurArgOffset, isImmutable);
|
2012-10-05 21:27:08 +00:00
|
|
|
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
|
|
|
|
ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo(),
|
|
|
|
false, false, false, 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
InVals.push_back(ArgVal);
|
|
|
|
}
|
|
|
|
|
2014-06-23 13:08:27 +00:00
|
|
|
// Area that is at least reserved in the caller of this function.
|
2014-07-07 19:26:41 +00:00
|
|
|
unsigned MinReservedArea;
|
|
|
|
MinReservedArea = std::max(ArgOffset, LinkageSize + 8 * PtrByteSize);
|
2014-06-23 13:08:27 +00:00
|
|
|
|
2012-10-05 21:27:08 +00:00
|
|
|
// Set the size that is at least reserved in caller of this function. Tail
|
2012-10-23 15:51:16 +00:00
|
|
|
// call optimized functions' reserved stack space needs to be aligned so that
|
2012-10-05 21:27:08 +00:00
|
|
|
// taking the difference between two stack areas will result in an aligned
|
|
|
|
// stack.
|
2014-06-23 13:08:27 +00:00
|
|
|
MinReservedArea = EnsureStackAlignment(MF.getTarget(), MinReservedArea);
|
|
|
|
FuncInfo->setMinReservedArea(MinReservedArea);
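The reserved-area computation combines a floor (linkage area plus eight argument doublewords) with a final round-up; standalone sketch below, assuming a 16-byte stack alignment for illustration:

#include <algorithm>
#include <cassert>

int main() {
  const unsigned PtrByteSize = 8, LinkageSize = 48, StackAlign = 16;
  unsigned ArgOffset = 56; // where argument lowering stopped
  unsigned MinReservedArea =
      std::max(ArgOffset, LinkageSize + 8 * PtrByteSize); // the floor
  assert(MinReservedArea == 112);
  // EnsureStackAlignment: keep frame-size differences aligned.
  MinReservedArea =
      (MinReservedArea + StackAlign - 1) / StackAlign * StackAlign;
  assert(MinReservedArea == 112); // already aligned in this example
  return 0;
}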
|
2012-10-05 21:27:08 +00:00
|
|
|
|
|
|
|
// If the function takes variable number of arguments, make a frame index for
|
|
|
|
// the start of the first vararg value... for expansion of llvm.va_start.
|
|
|
|
if (isVarArg) {
|
|
|
|
int Depth = ArgOffset;
|
|
|
|
|
|
|
|
FuncInfo->setVarArgsFrameIndex(
|
2012-10-23 15:51:16 +00:00
|
|
|
MFI->CreateFixedObject(PtrByteSize, Depth, true));
|
2012-10-05 21:27:08 +00:00
|
|
|
SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
|
|
|
|
|
|
|
|
// If this function is vararg, store any remaining integer argument regs
|
|
|
|
// to their spots on the stack so that they may be loaded by dereferencing the
|
|
|
|
// result of va_next.
|
2014-07-07 19:26:41 +00:00
|
|
|
for (GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
|
|
|
|
GPR_idx < Num_GPR_Regs; ++GPR_idx) {
|
2012-10-05 21:27:08 +00:00
|
|
|
unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
|
|
|
|
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
|
|
|
|
SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
|
|
|
|
MachinePointerInfo(), false, false, 0);
|
|
|
|
MemOps.push_back(Store);
|
|
|
|
// Increment the address by eight for the next argument to store
|
2012-10-23 15:51:16 +00:00
|
|
|
SDValue PtrOff = DAG.getConstant(PtrByteSize, PtrVT);
|
2012-10-05 21:27:08 +00:00
|
|
|
FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
|
|
|
|
}
|
|
|
|
}
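Recovering the starting register index from the running offset means only the registers not consumed by named arguments get spilled; sketch with three named integer arguments (illustrative values):

#include <cassert>

int main() {
  const unsigned PtrByteSize = 8, LinkageSize = 48, Num_GPR_Regs = 8;
  unsigned ArgOffset = LinkageSize + 3 * PtrByteSize; // three named args
  unsigned GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
  assert(GPR_idx == 3);                // X3..X5 held named arguments
  assert(Num_GPR_Regs - GPR_idx == 5); // X6..X10 spilled for va_arg
  return 0;
}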
|
|
|
|
|
|
|
|
if (!MemOps.empty())
|
2014-04-26 18:35:24 +00:00
|
|
|
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
|
2012-10-05 21:27:08 +00:00
|
|
|
|
|
|
|
return Chain;
|
|
|
|
}
|
|
|
|
|
|
|
|
SDValue
|
|
|
|
PPCTargetLowering::LowerFormalArguments_Darwin(
|
Major calling convention code refactoring.
2009-08-05 01:29:28 +00:00
|
|
|
SDValue Chain,
|
2009-09-02 08:44:58 +00:00
|
|
|
CallingConv::ID CallConv, bool isVarArg,
|
Major calling convention code refactoring.
2009-08-05 01:29:28 +00:00
|
|
|
const SmallVectorImpl<ISD::InputArg>
|
|
|
|
&Ins,
|
2013-05-25 02:42:55 +00:00
|
|
|
SDLoc dl, SelectionDAG &DAG,
|
2010-04-17 15:26:15 +00:00
|
|
|
SmallVectorImpl<SDValue> &InVals) const {
|
2006-05-16 18:18:50 +00:00
|
|
|
// TODO: add description of PPC stack frame format, or at least some docs.
|
|
|
|
//
|
|
|
|
MachineFunction &MF = DAG.getMachineFunction();
|
|
|
|
MachineFrameInfo *MFI = MF.getFrameInfo();
|
2010-04-17 14:41:14 +00:00
|
|
|
PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2009-08-10 22:56:29 +00:00
|
|
|
EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
|
2009-08-11 20:47:22 +00:00
|
|
|
bool isPPC64 = PtrVT == MVT::i64;
|
2008-04-30 09:16:33 +00:00
|
|
|
// Potential tail calls could cause overwriting of argument stack slots.
|
2011-12-02 22:16:29 +00:00
|
|
|
bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
|
|
|
|
(CallConv == CallingConv::Fast));
|
2006-11-28 14:53:52 +00:00
|
|
|
unsigned PtrByteSize = isPPC64 ? 8 : 4;
|
2006-11-16 22:43:37 +00:00
|
|
|
|
2014-06-23 14:15:53 +00:00
|
|
|
unsigned LinkageSize = PPCFrameLowering::getLinkageSize(isPPC64, true);
|
|
|
|
unsigned ArgOffset = LinkageSize;
|
2008-04-30 09:16:33 +00:00
|
|
|
// Area that is at least reserved in caller of this function.
|
|
|
|
unsigned MinReservedArea = ArgOffset;
|
|
|
|
|
2014-04-04 05:16:06 +00:00
|
|
|
static const MCPhysReg GPR_32[] = { // 32-bit registers.
|
2006-05-16 18:18:50 +00:00
|
|
|
PPC::R3, PPC::R4, PPC::R5, PPC::R6,
|
|
|
|
PPC::R7, PPC::R8, PPC::R9, PPC::R10,
|
|
|
|
};
|
2014-04-04 05:16:06 +00:00
|
|
|
static const MCPhysReg GPR_64[] = { // 64-bit registers.
|
2006-06-26 22:48:35 +00:00
|
|
|
PPC::X3, PPC::X4, PPC::X5, PPC::X6,
|
|
|
|
PPC::X7, PPC::X8, PPC::X9, PPC::X10,
|
|
|
|
};
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2014-04-04 05:16:06 +00:00
|
|
|
static const MCPhysReg *FPR = GetFPR();
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2014-04-04 05:16:06 +00:00
|
|
|
static const MCPhysReg VR[] = {
|
2006-05-16 18:18:50 +00:00
|
|
|
PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
|
|
|
|
PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
|
|
|
|
};
|
2006-06-26 22:48:35 +00:00
|
|
|
|
2007-09-07 04:06:50 +00:00
|
|
|
const unsigned Num_GPR_Regs = array_lengthof(GPR_32);
|
2009-07-03 06:47:08 +00:00
|
|
|
const unsigned Num_FPR_Regs = 13;
|
2007-09-07 04:06:50 +00:00
|
|
|
const unsigned Num_VR_Regs = array_lengthof(VR);
|
2006-11-16 22:43:37 +00:00
|
|
|
|
|
|
|
unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2014-04-04 05:16:06 +00:00
|
|
|
const MCPhysReg *GPR = isPPC64 ? GPR_64 : GPR_32;
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2008-03-14 17:41:26 +00:00
|
|
|
// In 32-bit non-varargs functions, the stack space for vectors is after the
|
|
|
|
// stack space for non-vectors. We do not use this space unless we have
|
|
|
|
// too many vectors to fit in registers, something that only occurs in
|
2009-02-17 22:15:04 +00:00
|
|
|
// constructed examples:), but we have to walk the arglist to figure
|
2008-03-14 17:41:26 +00:00
|
|
|
// that out...for the pathological case, compute VecArgOffset as the
|
|
|
|
// start of the vector parameter area. Computing VecArgOffset is the
|
|
|
|
// entire point of the following loop.
|
|
|
|
unsigned VecArgOffset = ArgOffset;
|
|
|
|
if (!isVarArg && !isPPC64) {
|
Major calling convention code refactoring.
2009-08-05 01:29:28 +00:00
|
|
|
for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e;
|
2008-03-14 17:41:26 +00:00
|
|
|
++ArgNo) {
|
2009-08-10 22:56:29 +00:00
|
|
|
EVT ObjectVT = Ins[ArgNo].VT;
|
Major calling convention code refactoring.
2009-08-05 01:29:28 +00:00
|
|
|
ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
|
2008-03-14 17:41:26 +00:00
|
|
|
|
2008-03-21 09:14:45 +00:00
|
|
|
if (Flags.isByVal()) {
|
2008-03-14 17:41:26 +00:00
|
|
|
// ObjSize is the true size, ArgSize rounded up to multiple of regs.
|
2012-01-20 14:42:32 +00:00
|
|
|
unsigned ObjSize = Flags.getByValSize();
|
2009-02-17 22:15:04 +00:00
|
|
|
unsigned ArgSize =
|
2008-03-14 17:41:26 +00:00
|
|
|
((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
|
|
|
|
VecArgOffset += ArgSize;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2009-08-11 20:47:22 +00:00
|
|
|
switch(ObjectVT.getSimpleVT().SimpleTy) {
|
2009-07-14 16:55:14 +00:00
|
|
|
default: llvm_unreachable("Unhandled argument type!");
|
2014-02-28 01:17:25 +00:00
|
|
|
case MVT::i1:
|
2009-08-11 20:47:22 +00:00
|
|
|
case MVT::i32:
|
|
|
|
case MVT::f32:
|
2012-09-19 15:42:13 +00:00
|
|
|
VecArgOffset += 4;
|
2008-03-14 17:41:26 +00:00
|
|
|
break;
|
2009-08-11 20:47:22 +00:00
|
|
|
case MVT::i64: // PPC64
|
|
|
|
case MVT::f64:
|
2012-09-19 15:42:13 +00:00
|
|
|
// FIXME: We are guaranteed to be !isPPC64 at this point.
|
|
|
|
// Does MVT::i64 apply?
|
2008-03-14 17:41:26 +00:00
|
|
|
VecArgOffset += 8;
|
|
|
|
break;
|
2009-08-11 20:47:22 +00:00
|
|
|
case MVT::v4f32:
|
|
|
|
case MVT::v4i32:
|
|
|
|
case MVT::v8i16:
|
|
|
|
case MVT::v16i8:
|
2008-03-14 17:41:26 +00:00
|
|
|
// Nothing to do, we're only looking at Nonvector args here.
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// We've found where the vector parameter area in memory is. Skip the
|
|
|
|
// first 12 parameters; these don't use that memory.
|
|
|
|
VecArgOffset = ((VecArgOffset+15)/16)*16;
|
|
|
|
VecArgOffset += 12*16;
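The Darwin vector-area placement is the accumulated non-vector size, aligned to 16, plus the twelve 16-byte slots that never use this memory; a sketch with illustrative sizes (24 bytes is assumed here as the 32-bit Darwin linkage area):

#include <cassert>

int main() {
  unsigned VecArgOffset = 24 + 4 + 8; // linkage + an i32 + an f64
  VecArgOffset = ((VecArgOffset + 15) / 16) * 16; // align to 16
  assert(VecArgOffset == 48);
  VecArgOffset += 12 * 16; // skip the first 12 parameter slots
  assert(VecArgOffset == 240);
  return 0;
}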
|
|
|
|
|
2006-05-16 18:18:50 +00:00
|
|
|
// Add DAG nodes to load the arguments or copy them out of registers. On
|
2006-11-16 22:43:37 +00:00
|
|
|
// entry to a function on PPC, the arguments start after the linkage area,
|
|
|
|
// although the first ones are often in registers.
|
2007-03-13 15:02:46 +00:00
|
|
|
|
2008-07-27 21:46:04 +00:00
|
|
|
SmallVector<SDValue, 8> MemOps;
|
2008-04-30 09:16:33 +00:00
|
|
|
unsigned nAltivecParamsAtEnd = 0;
|
2012-09-24 20:47:19 +00:00
|
|
|
Function::const_arg_iterator FuncArg = MF.getFunction()->arg_begin();
|
2013-05-08 17:22:33 +00:00
|
|
|
unsigned CurArgIdx = 0;
|
|
|
|
for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {
|
2008-07-27 21:46:04 +00:00
|
|
|
SDValue ArgVal;
|
2006-05-16 18:18:50 +00:00
|
|
|
bool needsLoad = false;
|
2009-08-10 22:56:29 +00:00
|
|
|
EVT ObjectVT = Ins[ArgNo].VT;
|
2008-06-06 12:08:01 +00:00
|
|
|
unsigned ObjSize = ObjectVT.getSizeInBits()/8;
|
2006-11-29 13:37:09 +00:00
|
|
|
unsigned ArgSize = ObjSize;
|
Major calling convention code refactoring.
2009-08-05 01:29:28 +00:00
|
|
|
ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;
|
2013-05-08 17:22:33 +00:00
|
|
|
std::advance(FuncArg, Ins[ArgNo].OrigArgIndex - CurArgIdx);
|
|
|
|
CurArgIdx = Ins[ArgNo].OrigArgIndex;
|
2006-05-16 18:18:50 +00:00
|
|
|
|
2006-05-16 18:51:52 +00:00
|
|
|
unsigned CurArgOffset = ArgOffset;
|
2008-03-07 20:27:40 +00:00
|
|
|
|
2008-04-30 09:16:33 +00:00
|
|
|
// Varargs or 64 bit Altivec parameters are padded to a 16 byte boundary.
|
2009-08-11 20:47:22 +00:00
|
|
|
if (ObjectVT==MVT::v4f32 || ObjectVT==MVT::v4i32 ||
|
|
|
|
ObjectVT==MVT::v8i16 || ObjectVT==MVT::v16i8) {
|
2008-04-30 09:16:33 +00:00
|
|
|
if (isVarArg || isPPC64) {
|
|
|
|
MinReservedArea = ((MinReservedArea+15)/16)*16;
|
Major calling convention code refactoring.
2009-08-05 01:29:28 +00:00
|
|
|
MinReservedArea += CalculateStackSlotSize(ObjectVT,
|
2008-09-13 01:54:27 +00:00
|
|
|
Flags,
|
2008-04-30 09:16:33 +00:00
|
|
|
PtrByteSize);
|
|
|
|
} else nAltivecParamsAtEnd++;
|
|
|
|
} else
|
|
|
|
// Calculate min reserved area.
|
Major calling convention code refactoring.
2009-08-05 01:29:28 +00:00
|
|
|
MinReservedArea += CalculateStackSlotSize(Ins[ArgNo].VT,
|
2008-09-13 01:54:27 +00:00
|
|
|
Flags,
|
2008-04-30 09:16:33 +00:00
|
|
|
PtrByteSize);
|
|
|
|
|
2008-03-07 20:27:40 +00:00
|
|
|
// FIXME the codegen can be much improved in some cases.
|
|
|
|
// We do not have to keep everything in memory.
|
2008-03-21 09:14:45 +00:00
|
|
|
if (Flags.isByVal()) {
|
2008-03-07 20:27:40 +00:00
|
|
|
// ObjSize is the true size, ArgSize rounded up to multiple of registers.
|
2008-03-21 09:14:45 +00:00
|
|
|
ObjSize = Flags.getByValSize();
|
2008-03-07 20:27:40 +00:00
|
|
|
ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
|
2012-10-05 21:27:08 +00:00
|
|
|
// Objects of size 1 and 2 are right justified, everything else is
|
|
|
|
// left justified. This means the memory address is adjusted forwards.
|
2008-03-08 01:41:42 +00:00
|
|
|
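      // For example, a 2-byte byval object has its address bumped forward by
      // (4 - ObjSize) = 2 bytes, so it ends at its 4-byte slot boundary
      // (illustrative arithmetic for the adjustment below).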
      if (ObjSize==1 || ObjSize==2) {
        CurArgOffset = CurArgOffset + (4 - ObjSize);
      }
      // The value of the object is its address.
      int FI = MFI->CreateFixedObject(ObjSize, CurArgOffset, true);
      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
      InVals.push_back(FIN);
      if (ObjSize==1 || ObjSize==2) {
        if (GPR_idx != Num_GPR_Regs) {
          unsigned VReg;
          if (isPPC64)
            VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
          else
            VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
          SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
          EVT ObjType = ObjSize == 1 ? MVT::i8 : MVT::i16;
          SDValue Store = DAG.getTruncStore(Val.getValue(1), dl, Val, FIN,
                                            MachinePointerInfo(FuncArg),
                                            ObjType, false, false, 0);
          MemOps.push_back(Store);
          ++GPR_idx;
        }

        ArgOffset += PtrByteSize;

        continue;
      }
      for (unsigned j = 0; j < ArgSize; j += PtrByteSize) {
        // Store whatever pieces of the object are in registers to memory.
        // ArgOffset will be the address of the beginning of the object.
        if (GPR_idx != Num_GPR_Regs) {
          unsigned VReg;
          if (isPPC64)
            VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
          else
            VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
          int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset, true);
          SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
          SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
          SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
                                       MachinePointerInfo(FuncArg, j),
                                       false, false, 0);
          MemOps.push_back(Store);
          ++GPR_idx;
          ArgOffset += PtrByteSize;
        } else {
          ArgOffset += ArgSize - (ArgOffset-CurArgOffset);
          break;
        }
      }
      continue;
    }

    switch (ObjectVT.getSimpleVT().SimpleTy) {
    default: llvm_unreachable("Unhandled argument type!");
    case MVT::i1:
    case MVT::i32:
      if (!isPPC64) {
        if (GPR_idx != Num_GPR_Regs) {
          unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
          ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
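
          // i1 arguments arrive promoted in a full GPR; truncate the
          // incoming value back down to i1 here.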
          if (ObjectVT == MVT::i1)
            ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, ArgVal);

          ++GPR_idx;
        } else {
          needsLoad = true;
          ArgSize = PtrByteSize;
        }
        // All int arguments reserve stack space in the Darwin ABI.
        ArgOffset += PtrByteSize;
        break;
      }
      // FALLTHROUGH
    case MVT::i64:  // PPC64
      if (GPR_idx != Num_GPR_Regs) {
        unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);

        if (ObjectVT == MVT::i32 || ObjectVT == MVT::i1)
          // PPC64 passes i8, i16, and i32 values in i64 registers. Promote
          // the value to MVT::i64 and then truncate to the correct register
          // size.
          ArgVal = extendArgForPPC64(Flags, ObjectVT, DAG, ArgVal, dl);

        ++GPR_idx;
      } else {
        needsLoad = true;
        ArgSize = PtrByteSize;
      }
      // All int arguments reserve stack space in the Darwin ABI.
      ArgOffset += 8;
      break;

    case MVT::f32:
    case MVT::f64:
      // Every 4 bytes of argument space consumes one of the GPRs available
      // for argument passing.
      if (GPR_idx != Num_GPR_Regs) {
        ++GPR_idx;
        if (ObjSize == 8 && GPR_idx != Num_GPR_Regs && !isPPC64)
          ++GPR_idx;
      }
      if (FPR_idx != Num_FPR_Regs) {
        unsigned VReg;

        if (ObjectVT == MVT::f32)
          VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F4RCRegClass);
        else
          VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F8RCRegClass);

        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
        ++FPR_idx;
      } else {
        needsLoad = true;
      }

      // All FP arguments reserve stack space in the Darwin ABI.
      ArgOffset += isPPC64 ? 8 : ObjSize;
      break;
    case MVT::v4f32:
    case MVT::v4i32:
    case MVT::v8i16:
    case MVT::v16i8:
      // Note that vector arguments in registers don't reserve stack space,
      // except in varargs functions.
      if (VR_idx != Num_VR_Regs) {
        unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass);
        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
        if (isVarArg) {
          while ((ArgOffset % 16) != 0) {
            ArgOffset += PtrByteSize;
            if (GPR_idx != Num_GPR_Regs)
              GPR_idx++;
          }
          ArgOffset += 16;
          GPR_idx = std::min(GPR_idx+4, Num_GPR_Regs); // FIXME: correct for ppc64?
        }
        ++VR_idx;
      } else {
        if (!isVarArg && !isPPC64) {
          // Vectors go after all the nonvectors.
          CurArgOffset = VecArgOffset;
          VecArgOffset += 16;
        } else {
          // Vectors are aligned.
          ArgOffset = ((ArgOffset+15)/16)*16;
          CurArgOffset = ArgOffset;
          ArgOffset += 16;
        }
        needsLoad = true;
      }
      break;
    }

    // We need to load the argument to a virtual register if we determined
    // above that we ran out of physical registers of the appropriate type.
    if (needsLoad) {
      int FI = MFI->CreateFixedObject(ObjSize,
                                      CurArgOffset + (ArgSize - ObjSize),
                                      isImmutable);
      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
      ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, MachinePointerInfo(),
                           false, false, false, 0);
    }

    InVals.push_back(ArgVal);
  }

  // Allow for Altivec parameters at the end, if needed.
  if (nAltivecParamsAtEnd) {
    MinReservedArea = ((MinReservedArea+15)/16)*16;
    MinReservedArea += 16*nAltivecParamsAtEnd;
  }
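  // (((X+15)/16)*16 rounds X up to the next multiple of 16: e.g. a 52-byte
  // area becomes 64 bytes before the 16-byte Altivec slots are appended.)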

  // Area that is at least reserved in the caller of this function.
  MinReservedArea = std::max(MinReservedArea, LinkageSize + 8 * PtrByteSize);

  // Set the size that is at least reserved in the caller of this function.
  // Tail call optimized functions' reserved stack space needs to be aligned
  // so that taking the difference between two stack areas will result in an
  // aligned stack.
  MinReservedArea = EnsureStackAlignment(MF.getTarget(), MinReservedArea);
  FuncInfo->setMinReservedArea(MinReservedArea);

  // If the function takes a variable number of arguments, make a frame index
  // for the start of the first vararg value... for expansion of
  // llvm.va_start.
  if (isVarArg) {
    int Depth = ArgOffset;

    FuncInfo->setVarArgsFrameIndex(
      MFI->CreateFixedObject(PtrVT.getSizeInBits()/8,
                             Depth, true));
    SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);

    // If this function is vararg, store any remaining integer argument regs
    // to their spots on the stack so that they may be loaded by dereferencing
    // the result of va_next.
    for (; GPR_idx != Num_GPR_Regs; ++GPR_idx) {
      unsigned VReg;

      if (isPPC64)
        VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
      else
        VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);

      SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
      SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
                                   MachinePointerInfo(), false, false, 0);
      MemOps.push_back(Store);
      // Increment the address by the pointer size for the next argument to
      // store.
      SDValue PtrOff = DAG.getConstant(PtrVT.getSizeInBits()/8, PtrVT);
      FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
    }
  }

  if (!MemOps.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);

  return Chain;
}

/// CalculateTailCallSPDiff - Get the amount the stack pointer has to be
/// adjusted to accommodate the arguments for the tail call.
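/// For example, if the caller reserved a 112-byte parameter area and the
/// tail call needs 176 bytes of arguments, SPDiff is 112 - 176 = -64, so the
/// stack must be grown by 64 bytes before the call (illustrative sizes only).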
static int CalculateTailCallSPDiff(SelectionDAG& DAG, bool isTailCall,
                                   unsigned ParamSize) {

  if (!isTailCall) return 0;

  PPCFunctionInfo *FI = DAG.getMachineFunction().getInfo<PPCFunctionInfo>();
  unsigned CallerMinReservedArea = FI->getMinReservedArea();
  int SPDiff = (int)CallerMinReservedArea - (int)ParamSize;
  // Remember only if the new adjustment is bigger.
  if (SPDiff < FI->getTailCallSPDelta())
    FI->setTailCallSPDelta(SPDiff);

  return SPDiff;
}

/// IsEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization. Targets which want to do tail call
/// optimization should implement this function.
bool
PPCTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
                                                     CallingConv::ID CalleeCC,
                                                     bool isVarArg,
                                      const SmallVectorImpl<ISD::InputArg> &Ins,
                                                     SelectionDAG& DAG) const {
  if (!getTargetMachine().Options.GuaranteedTailCallOpt)
    return false;

  // Variable argument functions are not supported.
  if (isVarArg)
    return false;

  MachineFunction &MF = DAG.getMachineFunction();
  CallingConv::ID CallerCC = MF.getFunction()->getCallingConv();
  if (CalleeCC == CallingConv::Fast && CallerCC == CalleeCC) {
    // Functions containing byval parameters are not supported.
    for (unsigned i = 0; i != Ins.size(); i++) {
      ISD::ArgFlagsTy Flags = Ins[i].Flags;
      if (Flags.isByVal()) return false;
    }

    // Non-PIC/GOT tail calls are supported.
    if (getTargetMachine().getRelocationModel() != Reloc::PIC_)
      return true;

    // At the moment we can only do local tail calls (in the same module,
    // hidden or protected) if we are generating PIC.
    if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
      return G->getGlobal()->hasHiddenVisibility()
          || G->getGlobal()->hasProtectedVisibility();
  }

  return false;
}

/// isBLACompatibleAddress - Return the immediate to use if the specified
/// 32-bit value is representable in the immediate field of a BxA instruction.
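/// For example, an absolute target of 0x1000 is 4-byte aligned and fits in
/// the sign-extended 26-bit field, so it is encoded as 0x1000 >> 2 == 0x400,
/// while 0x1001 is rejected because its low two bits are not zero.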
static SDNode *isBLACompatibleAddress(SDValue Op, SelectionDAG &DAG) {
  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
  if (!C) return nullptr;

  int Addr = C->getZExtValue();
  if ((Addr & 3) != 0 ||  // Low 2 bits are implicitly zero.
      SignExtend32<26>(Addr) != Addr)
    return nullptr;  // Top 6 bits have to be sext of immediate.

  return DAG.getConstant((int)C->getZExtValue() >> 2,
                         DAG.getTargetLoweringInfo().getPointerTy()).getNode();
}

namespace {

struct TailCallArgumentInfo {
  SDValue Arg;
  SDValue FrameIdxOp;
  int FrameIdx;

  TailCallArgumentInfo() : FrameIdx(0) {}
};

}

/// StoreTailCallArgumentsToStackSlot - Stores arguments to their stack slot.
static void
StoreTailCallArgumentsToStackSlot(SelectionDAG &DAG,
                                  SDValue Chain,
                   const SmallVectorImpl<TailCallArgumentInfo> &TailCallArgs,
                                  SmallVectorImpl<SDValue> &MemOpChains,
                                  SDLoc dl) {
  for (unsigned i = 0, e = TailCallArgs.size(); i != e; ++i) {
    SDValue Arg = TailCallArgs[i].Arg;
    SDValue FIN = TailCallArgs[i].FrameIdxOp;
    int FI = TailCallArgs[i].FrameIdx;
    // Store relative to the frame pointer.
    MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, FIN,
                                       MachinePointerInfo::getFixedStack(FI),
                                       false, false, 0));
  }
}

/// EmitTailCallStoreFPAndRetAddr - Move the frame pointer and return address
/// to the appropriate stack slot for the tail call optimized function call.
static SDValue EmitTailCallStoreFPAndRetAddr(SelectionDAG &DAG,
                                             MachineFunction &MF,
                                             SDValue Chain,
                                             SDValue OldRetAddr,
                                             SDValue OldFP,
                                             int SPDiff,
                                             bool isPPC64,
                                             bool isDarwinABI,
                                             SDLoc dl) {
  if (SPDiff) {
    // Calculate the new stack slot for the return address.
    int SlotSize = isPPC64 ? 8 : 4;
    int NewRetAddrLoc = SPDiff + PPCFrameLowering::getReturnSaveOffset(isPPC64,
                                                                  isDarwinABI);
    int NewRetAddr = MF.getFrameInfo()->CreateFixedObject(SlotSize,
                                                          NewRetAddrLoc, true);
    EVT VT = isPPC64 ? MVT::i64 : MVT::i32;
    SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewRetAddr, VT);
    Chain = DAG.getStore(Chain, dl, OldRetAddr, NewRetAddrFrIdx,
                         MachinePointerInfo::getFixedStack(NewRetAddr),
                         false, false, 0);

    // When using the 32/64-bit SVR4 ABI there is no need to move the FP stack
    // slot as the FP is never overwritten.
    if (isDarwinABI) {
      int NewFPLoc =
        SPDiff + PPCFrameLowering::getFramePointerSaveOffset(isPPC64,
                                                             isDarwinABI);
      int NewFPIdx = MF.getFrameInfo()->CreateFixedObject(SlotSize, NewFPLoc,
                                                          true);
      SDValue NewFramePtrIdx = DAG.getFrameIndex(NewFPIdx, VT);
      Chain = DAG.getStore(Chain, dl, OldFP, NewFramePtrIdx,
                           MachinePointerInfo::getFixedStack(NewFPIdx),
                           false, false, 0);
    }
  }
  return Chain;
}

/// CalculateTailCallArgDest - Remember Argument for later processing.
/// Calculate the position of the argument.
static void
CalculateTailCallArgDest(SelectionDAG &DAG, MachineFunction &MF, bool isPPC64,
                         SDValue Arg, int SPDiff, unsigned ArgOffset,
                     SmallVectorImpl<TailCallArgumentInfo>& TailCallArguments) {
  int Offset = ArgOffset + SPDiff;
  uint32_t OpSize = (Arg.getValueType().getSizeInBits()+7)/8;
  int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
  EVT VT = isPPC64 ? MVT::i64 : MVT::i32;
  SDValue FIN = DAG.getFrameIndex(FI, VT);
  TailCallArgumentInfo Info;
  Info.Arg = Arg;
  Info.FrameIdxOp = FIN;
  Info.FrameIdx = FI;
  TailCallArguments.push_back(Info);
}

/// EmitTailCallLoadFPAndRetAddr - Emit loads from the frame pointer and
/// return address stack slots. Returns the chain as result and the loaded
/// frame pointers in LROpOut/FPOpOut. Used when tail calling.
SDValue PPCTargetLowering::EmitTailCallLoadFPAndRetAddr(SelectionDAG &DAG,
                                                        int SPDiff,
                                                        SDValue Chain,
                                                        SDValue &LROpOut,
                                                        SDValue &FPOpOut,
                                                        bool isDarwinABI,
                                                        SDLoc dl) const {
  if (SPDiff) {
    // Load the LR and FP stack slot for later adjusting.
    EVT VT = Subtarget.isPPC64() ? MVT::i64 : MVT::i32;
    LROpOut = getReturnAddrFrameIndex(DAG);
    LROpOut = DAG.getLoad(VT, dl, Chain, LROpOut, MachinePointerInfo(),
                          false, false, false, 0);
    Chain = SDValue(LROpOut.getNode(), 1);

    // When using the 32/64-bit SVR4 ABI there is no need to load the FP stack
    // slot as the FP is never overwritten.
    if (isDarwinABI) {
      FPOpOut = getFramePointerFrameIndex(DAG);
      FPOpOut = DAG.getLoad(VT, dl, Chain, FPOpOut, MachinePointerInfo(),
                            false, false, false, 0);
      Chain = SDValue(FPOpOut.getNode(), 1);
    }
  }
  return Chain;
}

/// CreateCopyOfByValArgument - Make a copy of an aggregate at the address
/// specified by "Src" to address "Dst" of size "Size".  Alignment information
/// is specified by the specific parameter attribute. The copy will be passed
/// as a byval function parameter.
/// Sometimes what we are copying is the end of a larger object, the part that
/// does not fit in registers.
static SDValue
CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
                          ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
                          SDLoc dl) {
  SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
                       false, false, MachinePointerInfo(),
                       MachinePointerInfo());
}

/// LowerMemOpCallTo - Store the argument to the stack or remember it in case
/// of tail calls.
static void
LowerMemOpCallTo(SelectionDAG &DAG, MachineFunction &MF, SDValue Chain,
                 SDValue Arg, SDValue PtrOff, int SPDiff,
                 unsigned ArgOffset, bool isPPC64, bool isTailCall,
                 bool isVector, SmallVectorImpl<SDValue> &MemOpChains,
                 SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments,
                 SDLoc dl) {
  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
  if (!isTailCall) {
    if (isVector) {
      SDValue StackPtr;
      if (isPPC64)
        StackPtr = DAG.getRegister(PPC::X1, MVT::i64);
      else
        StackPtr = DAG.getRegister(PPC::R1, MVT::i32);
      PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr,
                           DAG.getConstant(ArgOffset, PtrVT));
    }
    MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff,
                                       MachinePointerInfo(), false, false, 0));
  // Calculate and remember argument location.
  } else CalculateTailCallArgDest(DAG, MF, isPPC64, Arg, SPDiff, ArgOffset,
                                  TailCallArguments);
}
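
/// PrepareTailCall - Store remembered tail-call arguments into their stack
/// slots, save the return address (and, for the Darwin ABI, the frame
/// pointer), and emit the callseq_end that precedes the tail-call node.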
static
void PrepareTailCall(SelectionDAG &DAG, SDValue &InFlag, SDValue &Chain,
                     SDLoc dl, bool isPPC64, int SPDiff, unsigned NumBytes,
                     SDValue LROp, SDValue FPOp, bool isDarwinABI,
                     SmallVectorImpl<TailCallArgumentInfo> &TailCallArguments) {
  MachineFunction &MF = DAG.getMachineFunction();

  // Emit a sequence of copyto/copyfrom virtual registers for arguments that
  // might overwrite each other in case of tail call optimization.
  SmallVector<SDValue, 8> MemOpChains2;
  // Do not flag preceding copytoreg stuff together with the following stuff.
  InFlag = SDValue();
  StoreTailCallArgumentsToStackSlot(DAG, Chain, TailCallArguments,
                                    MemOpChains2, dl);
  if (!MemOpChains2.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);

  // Store the return address to the appropriate stack slot.
  Chain = EmitTailCallStoreFPAndRetAddr(DAG, MF, Chain, LROp, FPOp, SPDiff,
                                        isPPC64, isDarwinABI, dl);

  // Emit callseq_end just before the tailcall node.
  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
                             DAG.getIntPtrConstant(0, true), InFlag, dl);
  InFlag = Chain.getValue(1);
}
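
/// PrepareCall - Resolve the callee to a direct target (absolute address,
/// global, or external symbol) when possible, or set up the MTCTR/BCTRL
/// sequence for an indirect call, and collect the operands for the call node.
/// Returns the call opcode to use.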
static
unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
                     SDValue &Chain, SDLoc dl, int SPDiff, bool isTailCall,
                     SmallVectorImpl<std::pair<unsigned, SDValue> > &RegsToPass,
                     SmallVectorImpl<SDValue> &Ops, std::vector<EVT> &NodeTys,
                     const PPCSubtarget &Subtarget) {

  bool isPPC64 = Subtarget.isPPC64();
  bool isSVR4ABI = Subtarget.isSVR4ABI();

  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
  NodeTys.push_back(MVT::Other);   // Returns a chain
  NodeTys.push_back(MVT::Glue);    // Returns a flag for retval copy to use.

  unsigned CallOpc = PPCISD::CALL;

  bool needIndirectCall = true;
  if (!isSVR4ABI || !isPPC64)
    if (SDNode *Dest = isBLACompatibleAddress(Callee, DAG)) {
      // If this is an absolute destination address, use the munged value.
      Callee = SDValue(Dest, 0);
      needIndirectCall = false;
    }

  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
    // XXX Work around for http://llvm.org/bugs/show_bug.cgi?id=5201
    // Use indirect calls for ALL function calls in JIT mode, since the
    // far-call stubs may be outside relocation limits for a BL instruction.
    if (!DAG.getTarget().getSubtarget<PPCSubtarget>().isJITCodeModel()) {
      unsigned OpFlags = 0;
      if (DAG.getTarget().getRelocationModel() != Reloc::Static &&
          (Subtarget.getTargetTriple().isMacOSX() &&
           Subtarget.getTargetTriple().isMacOSXVersionLT(10, 5)) &&
          (G->getGlobal()->isDeclaration() ||
           G->getGlobal()->isWeakForLinker())) {
        // PC-relative references to external symbols should go through $stub,
        // unless we're building with the leopard linker or later, which
        // automatically synthesizes these stubs.
        OpFlags = PPCII::MO_DARWIN_STUB;
      }

      // If the callee is a GlobalAddress/ExternalSymbol node (quite common,
      // every direct call is) turn it into a TargetGlobalAddress /
      // TargetExternalSymbol node so that legalize doesn't hack it.
      Callee = DAG.getTargetGlobalAddress(G->getGlobal(), dl,
                                          Callee.getValueType(),
                                          0, OpFlags);
      needIndirectCall = false;
    }
  }

  if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
    unsigned char OpFlags = 0;

    if (DAG.getTarget().getRelocationModel() != Reloc::Static &&
        (Subtarget.getTargetTriple().isMacOSX() &&
         Subtarget.getTargetTriple().isMacOSXVersionLT(10, 5))) {
      // PC-relative references to external symbols should go through $stub,
      // unless we're building with the leopard linker or later, which
      // automatically synthesizes these stubs.
      OpFlags = PPCII::MO_DARWIN_STUB;
    }

    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), Callee.getValueType(),
                                         OpFlags);
    needIndirectCall = false;
  }

  if (needIndirectCall) {
    // Otherwise, this is an indirect call.  We have to use a MTCTR/BCTRL pair
    // to do the call; we can't use PPCISD::CALL.
    SDValue MTCTROps[] = {Chain, Callee, InFlag};

    if (isSVR4ABI && isPPC64) {
      // Function pointers in the 64-bit SVR4 ABI do not point to the function
      // entry point, but to the function descriptor (the function entry point
      // address is part of the function descriptor though).
      // The function descriptor is a three doubleword structure with the
      // following fields: function entry point, TOC base address and
      // environment pointer.
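      // In memory the descriptor looks like (these are the offsets used by
      // the loads below):
      //   doubleword 0 (offset 0):  function entry point
      //   doubleword 1 (offset 8):  TOC base address
      //   doubleword 2 (offset 16): environment pointer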
      // Thus for a call through a function pointer, the following actions need
      // to be performed:
      //   1. Save the TOC of the caller in the TOC save area of its stack
      //      frame (this is done in LowerCall_Darwin() or LowerCall_64SVR4()).
      //   2. Load the address of the function entry point from the function
      //      descriptor.
      //   3. Load the TOC of the callee from the function descriptor into r2.
      //   4. Load the environment pointer from the function descriptor into
      //      r11.
      //   5. Branch to the function entry point address.
      //   6. On return of the callee, the TOC of the caller needs to be
      //      restored (this is done in FinishCall()).
      //
      // All those operations are flagged together to ensure that no other
      // operations can be scheduled in between.  E.g. without flagging the
      // operations together, a TOC access in the caller could be scheduled
      // between the load of the callee TOC and the branch to the callee, which
      // results in the TOC access going through the TOC of the callee instead
      // of going through the TOC of the caller, which leads to incorrect code.

      // Load the address of the function entry point from the function
      // descriptor.
      SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Other, MVT::Glue);
      SDValue LoadFuncPtr = DAG.getNode(PPCISD::LOAD, dl, VTs,
                              makeArrayRef(MTCTROps, InFlag.getNode() ? 3 : 2));
      Chain = LoadFuncPtr.getValue(1);
      InFlag = LoadFuncPtr.getValue(2);

      // Load the environment pointer into r11.
      // Offset of the environment pointer within the function descriptor.
      SDValue PtrOff = DAG.getIntPtrConstant(16);

      SDValue AddPtr = DAG.getNode(ISD::ADD, dl, MVT::i64, Callee, PtrOff);
      SDValue LoadEnvPtr = DAG.getNode(PPCISD::LOAD, dl, VTs, Chain, AddPtr,
                                       InFlag);
      Chain = LoadEnvPtr.getValue(1);
      InFlag = LoadEnvPtr.getValue(2);

      SDValue EnvVal = DAG.getCopyToReg(Chain, dl, PPC::X11, LoadEnvPtr,
                                        InFlag);
      Chain = EnvVal.getValue(0);
      InFlag = EnvVal.getValue(1);

      // Load the TOC of the callee into r2.  We are using a target-specific
      // load with r2 hard coded, because the result of a target-independent
      // load would never go directly into r2, since r2 is a reserved register
      // (which prevents the register allocator from allocating it), resulting
      // in an additional register being allocated and an unnecessary move
      // instruction being generated.
      VTs = DAG.getVTList(MVT::Other, MVT::Glue);
      SDValue TOCOff = DAG.getIntPtrConstant(8);
      SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, Callee, TOCOff);
      SDValue LoadTOCPtr = DAG.getNode(PPCISD::LOAD_TOC, dl, VTs, Chain,
                                       AddTOC, InFlag);
      Chain = LoadTOCPtr.getValue(0);
      InFlag = LoadTOCPtr.getValue(1);

      MTCTROps[0] = Chain;
      MTCTROps[1] = LoadFuncPtr;
      MTCTROps[2] = InFlag;
    }

    Chain = DAG.getNode(PPCISD::MTCTR, dl, NodeTys,
                        makeArrayRef(MTCTROps, InFlag.getNode() ? 3 : 2));
    InFlag = Chain.getValue(1);

    NodeTys.clear();
    NodeTys.push_back(MVT::Other);
    NodeTys.push_back(MVT::Glue);
    Ops.push_back(Chain);
    CallOpc = PPCISD::BCTRL;
    Callee.setNode(nullptr);
    // Add a use of X11 (holding the environment pointer).
    if (isSVR4ABI && isPPC64)
      Ops.push_back(DAG.getRegister(PPC::X11, PtrVT));
    // Add the CTR register as callee so a bctr can be emitted later.
    if (isTailCall)
      Ops.push_back(DAG.getRegister(isPPC64 ? PPC::CTR8 : PPC::CTR, PtrVT));
  }

  // If this is a direct call, pass the chain and the callee.
  if (Callee.getNode()) {
    Ops.push_back(Chain);
    Ops.push_back(Callee);
  }
  // If this is a tail call, add the stack pointer delta.
  if (isTailCall)
    Ops.push_back(DAG.getConstant(SPDiff, MVT::i32));

  // Add argument registers to the end of the list so that they are known live
  // into the call.
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
                                  RegsToPass[i].second.getValueType()));

  return CallOpc;
}
|
|
|
|
|
2012-09-18 16:47:58 +00:00
|
|
|
static
|
|
|
|
bool isLocalCall(const SDValue &Callee)
|
|
|
|
{
|
|
|
|
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
|
2012-09-18 18:27:49 +00:00
|
|
|
return !G->getGlobal()->isDeclaration() &&
|
|
|
|
!G->getGlobal()->isWeakForLinker();
|
2012-09-18 16:47:58 +00:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
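isLocalCall is deliberately conservative: only a definition in the current module that cannot be overridden at link time counts as local (see its use in FinishCall below, where non-local direct calls get a TOC-restore NOP). A small illustration with hypothetical functions, not from this file:

  // __attribute__((weak)) int f() { return 1; } // weak: may be overridden at
  //                                              // link time, so not "local"
  // static int g() { return 2; }                 // non-weak definition: local
  // Calls to f() must keep the NOP slot; calls to g() need not.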
Major calling convention code refactoring.
Instead of awkwardly encoding calling-convention information with ISD::CALL,
ISD::FORMAL_ARGUMENTS, ISD::RET, and ISD::ARG_FLAGS nodes, TargetLowering
provides three virtual functions for targets to override:
LowerFormalArguments, LowerCall, and LowerRet, which replace the custom
lowering done on the special nodes. They provide the same information, but
in a more immediately usable format.
This also reworks much of the target-independent tail call logic. The
decision of whether or not to perform a tail call is now cleanly split
between target-independent portions, and the target dependent portion
in IsEligibleForTailCallOptimization.
This also synchronizes all in-tree targets, to help enable future
refactoring and feature work.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@78142 91177308-0d34-0410-b5e6-96231b3b80d8
2009-08-05 01:29:28 +00:00
SDValue
PPCTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
                                   CallingConv::ID CallConv, bool isVarArg,
                                   const SmallVectorImpl<ISD::InputArg> &Ins,
                                   SDLoc dl, SelectionDAG &DAG,
                                   SmallVectorImpl<SDValue> &InVals) const {

  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(),
                    getTargetMachine(), RVLocs, *DAG.getContext());
  CCRetInfo.AnalyzeCallResult(Ins, RetCC_PPC);

  // Copy all of the result registers out of their specified physreg.
  for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");

    SDValue Val = DAG.getCopyFromReg(Chain, dl,
                                     VA.getLocReg(), VA.getLocVT(), InFlag);
    Chain = Val.getValue(1);
    InFlag = Val.getValue(2);

    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::AExt:
      Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
      break;
    case CCValAssign::ZExt:
      Val = DAG.getNode(ISD::AssertZext, dl, VA.getLocVT(), Val,
                        DAG.getValueType(VA.getValVT()));
      Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
      break;
    case CCValAssign::SExt:
      Val = DAG.getNode(ISD::AssertSext, dl, VA.getLocVT(), Val,
                        DAG.getValueType(VA.getValVT()));
      Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
      break;
    }

    InVals.push_back(Val);
  }
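The AssertZext/AssertSext wrappers record what the callee guaranteed about the promoted register before the value is narrowed back to its declared type. A sketch of the node sequence for a hypothetical bool (i1) returned zero-extended in a 32-bit register:

  //   Val = CopyFromReg(Chain, R3, i32, InFlag)
  //   Val = AssertZext i32 Val, ValueType:i1   // upper 31 bits known zero
  //   Val = Truncate  i1  Val                  // narrow to the declared type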

  return Chain;
}

SDValue
PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl,
                              bool isTailCall, bool isVarArg,
                              SelectionDAG &DAG,
                              SmallVector<std::pair<unsigned, SDValue>, 8>
                                &RegsToPass,
                              SDValue InFlag, SDValue Chain,
                              SDValue &Callee,
                              int SPDiff, unsigned NumBytes,
                              const SmallVectorImpl<ISD::InputArg> &Ins,
                              SmallVectorImpl<SDValue> &InVals) const {
  std::vector<EVT> NodeTys;
  SmallVector<SDValue, 8> Ops;
  unsigned CallOpc = PrepareCall(DAG, Callee, InFlag, Chain, dl, SPDiff,
                                 isTailCall, RegsToPass, Ops, NodeTys,
                                 Subtarget);

  // Add implicit use of CR bit 6 for 32-bit SVR4 vararg calls
  if (isVarArg && Subtarget.isSVR4ABI() && !Subtarget.isPPC64())
    Ops.push_back(DAG.getRegister(PPC::CR1EQ, MVT::i32));

  // When performing tail call optimization the callee pops its arguments off
  // the stack. Account for this here so these bytes can be pushed back on in
  // PPCFrameLowering::eliminateCallFramePseudoInstr.
  int BytesCalleePops =
    (CallConv == CallingConv::Fast &&
     getTargetMachine().Options.GuaranteedTailCallOpt) ? NumBytes : 0;

  // Add a register mask operand representing the call-preserved registers.
  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
  const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");
  Ops.push_back(DAG.getRegisterMask(Mask));

  if (InFlag.getNode())
    Ops.push_back(InFlag);

  // Emit tail call.
  if (isTailCall) {
    assert(((Callee.getOpcode() == ISD::Register &&
             cast<RegisterSDNode>(Callee)->getReg() == PPC::CTR) ||
            Callee.getOpcode() == ISD::TargetExternalSymbol ||
            Callee.getOpcode() == ISD::TargetGlobalAddress ||
            isa<ConstantSDNode>(Callee)) &&
           "Expecting a global address, external symbol, absolute value or register");

    return DAG.getNode(PPCISD::TC_RETURN, dl, MVT::Other, Ops);
  }

  // Add a NOP immediately after the branch instruction when using the 64-bit
  // SVR4 ABI. At link time, if caller and callee are in a different module and
  // thus have a different TOC, the call will be replaced with a call to a stub
  // function which saves the current TOC, loads the TOC of the callee and
  // branches to the callee. The NOP will be replaced with a load instruction
  // which restores the TOC of the caller from the TOC save slot of the current
  // stack frame. If caller and callee belong to the same module (and have the
  // same TOC), the NOP will remain unchanged.

  bool needsTOCRestore = false;
  if (!isTailCall && Subtarget.isSVR4ABI() && Subtarget.isPPC64()) {
    if (CallOpc == PPCISD::BCTRL) {
      // This is a call through a function pointer.
      // Restore the caller TOC from the save area into R2.
      // See PrepareCall() for more information about calls through function
      // pointers in the 64-bit SVR4 ABI.
      // We are using a target-specific load with r2 hard coded, because the
      // result of a target-independent load would never go directly into r2,
      // since r2 is a reserved register (which prevents the register allocator
      // from allocating it), resulting in an additional register being
      // allocated and an unnecessary move instruction being generated.
      needsTOCRestore = true;
    } else if ((CallOpc == PPCISD::CALL) &&
               (!isLocalCall(Callee) ||
                DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
      // Otherwise insert NOP for non-local calls.
      CallOpc = PPCISD::CALL_NOP;
    }
  }

  Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);
  InFlag = Chain.getValue(1);

  if (needsTOCRestore) {
    SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
    EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
    SDValue StackPtr = DAG.getRegister(PPC::X1, PtrVT);
    unsigned TOCSaveOffset = PPCFrameLowering::getTOCSaveOffset();
    SDValue TOCOff = DAG.getIntPtrConstant(TOCSaveOffset);
    SDValue AddTOC = DAG.getNode(ISD::ADD, dl, MVT::i64, StackPtr, TOCOff);
    Chain = DAG.getNode(PPCISD::LOAD_TOC, dl, VTs, Chain, AddTOC, InFlag);
    InFlag = Chain.getValue(1);
  }

  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
                             DAG.getIntPtrConstant(BytesCalleePops, true),
                             InFlag, dl);
  if (!Ins.empty())
    InFlag = Chain.getValue(1);

  return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
                         Ins, dl, DAG, InVals);
}

SDValue
PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                             SmallVectorImpl<SDValue> &InVals) const {
  SelectionDAG &DAG = CLI.DAG;
  SDLoc &dl = CLI.DL;
  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
  SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
  SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
  SDValue Chain = CLI.Chain;
  SDValue Callee = CLI.Callee;
  bool &isTailCall = CLI.IsTailCall;
  CallingConv::ID CallConv = CLI.CallConv;
  bool isVarArg = CLI.IsVarArg;

  if (isTailCall)
    isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg,
                                                   Ins, DAG);

  if (!isTailCall && CLI.CS && CLI.CS->isMustTailCall())
    report_fatal_error("failed to perform tail call elimination on a call "
                       "site marked musttail");

  if (Subtarget.isSVR4ABI()) {
    if (Subtarget.isPPC64())
      return LowerCall_64SVR4(Chain, Callee, CallConv, isVarArg,
                              isTailCall, Outs, OutVals, Ins,
                              dl, DAG, InVals);
    else
      return LowerCall_32SVR4(Chain, Callee, CallConv, isVarArg,
                              isTailCall, Outs, OutVals, Ins,
                              dl, DAG, InVals);
  }

  return LowerCall_Darwin(Chain, Callee, CallConv, isVarArg,
                          isTailCall, Outs, OutVals, Ins,
                          dl, DAG, InVals);
}

SDValue
PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee,
                                    CallingConv::ID CallConv, bool isVarArg,
                                    bool isTailCall,
                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
                                    const SmallVectorImpl<SDValue> &OutVals,
                                    const SmallVectorImpl<ISD::InputArg> &Ins,
                                    SDLoc dl, SelectionDAG &DAG,
                                    SmallVectorImpl<SDValue> &InVals) const {
  // See PPCTargetLowering::LowerFormalArguments_32SVR4() for a description
  // of the 32-bit SVR4 ABI stack frame layout.

  assert((CallConv == CallingConv::C ||
          CallConv == CallingConv::Fast) && "Unknown calling convention!");

  unsigned PtrByteSize = 4;

  MachineFunction &MF = DAG.getMachineFunction();

  // Mark this function as potentially containing a function that contains a
  // tail call. As a consequence, the frame pointer will be used for dynamic
  // allocation and for restoring the caller's stack pointer in this function's
  // epilogue. This is done because, by tail calling, the called function might
  // overwrite the value in this function's (MF) stack pointer stack slot
  // 0(SP).
  if (getTargetMachine().Options.GuaranteedTailCallOpt &&
      CallConv == CallingConv::Fast)
    MF.getInfo<PPCFunctionInfo>()->setHasFastCall();

  // Count how many bytes are to be pushed on the stack, including the linkage
  // area, parameter list area and the part of the local variable space which
  // contains copies of aggregates which are passed by value.

  // Assign locations to all of the outgoing arguments.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
                 getTargetMachine(), ArgLocs, *DAG.getContext());

  // Reserve space for the linkage area on the stack.
  CCInfo.AllocateStack(PPCFrameLowering::getLinkageSize(false, false),
                       PtrByteSize);

  if (isVarArg) {
    // Handle fixed and variable vector arguments differently.
    // Fixed vector arguments go into registers as long as registers are
    // available. Variable vector arguments always go into memory.
    unsigned NumArgs = Outs.size();

    for (unsigned i = 0; i != NumArgs; ++i) {
      MVT ArgVT = Outs[i].VT;
      ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
      bool Result;

      if (Outs[i].IsFixed) {
        Result = CC_PPC32_SVR4(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags,
                               CCInfo);
      } else {
        Result = CC_PPC32_SVR4_VarArg(i, ArgVT, ArgVT, CCValAssign::Full,
                                      ArgFlags, CCInfo);
      }

      if (Result) {
#ifndef NDEBUG
        errs() << "Call operand #" << i << " has unhandled type "
               << EVT(ArgVT).getEVTString() << "\n";
#endif
        llvm_unreachable(nullptr);
      }
    }
  } else {
    // All arguments are treated the same.
    CCInfo.AnalyzeCallOperands(Outs, CC_PPC32_SVR4);
  }

  // Assign locations to all of the outgoing aggregate by value arguments.
  SmallVector<CCValAssign, 16> ByValArgLocs;
  CCState CCByValInfo(CallConv, isVarArg, DAG.getMachineFunction(),
                      getTargetMachine(), ByValArgLocs, *DAG.getContext());

  // Reserve stack space for the allocations in CCInfo.
  CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize);

  CCByValInfo.AnalyzeCallOperands(Outs, CC_PPC32_SVR4_ByVal);

  // Size of the linkage area, parameter list area and the part of the local
  // variable space where copies of aggregates which are passed by value are
  // stored.
  unsigned NumBytes = CCByValInfo.getNextStackOffset();

  // Calculate by how many bytes the stack has to be adjusted in case of tail
  // call optimization.
  int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);

  // Adjust the stack pointer for the new arguments...
  // These operations are automatically eliminated by the prolog/epilog pass
  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true),
                               dl);
  SDValue CallSeqStart = Chain;

  // Load the return address and frame pointer so they can be moved somewhere
  // else later.
  SDValue LROp, FPOp;
  Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, false,
                                       dl);

  // Set up a copy of the stack pointer for use loading and storing any
  // arguments that may not fit in the registers available for argument
  // passing.
  SDValue StackPtr = DAG.getRegister(PPC::R1, MVT::i32);

  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
  SmallVector<TailCallArgumentInfo, 8> TailCallArguments;
  SmallVector<SDValue, 8> MemOpChains;

  bool seenFloatArg = false;
  // Walk the register/memloc assignments, inserting copies/loads.
  for (unsigned i = 0, j = 0, e = ArgLocs.size();
       i != e;
       ++i) {
    CCValAssign &VA = ArgLocs[i];
    SDValue Arg = OutVals[i];
    ISD::ArgFlagsTy Flags = Outs[i].Flags;

    if (Flags.isByVal()) {
      // Argument is an aggregate which is passed by value, thus we need to
      // create a copy of it in the local variable space of the current stack
      // frame (which is the stack frame of the caller) and pass the address of
      // this copy to the callee.
      assert((j < ByValArgLocs.size()) && "Index out of bounds!");
      CCValAssign &ByValVA = ByValArgLocs[j++];
      assert((VA.getValNo() == ByValVA.getValNo()) && "ValNo mismatch!");

      // Memory reserved in the local variable space of the caller's stack
      // frame.
      unsigned LocMemOffset = ByValVA.getLocMemOffset();

      SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
      PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);

      // Create a copy of the argument in the local area of the current
      // stack frame.
      SDValue MemcpyCall =
        CreateCopyOfByValArgument(Arg, PtrOff,
                                  CallSeqStart.getNode()->getOperand(0),
                                  Flags, DAG, dl);

      // This must go outside the CALLSEQ_START..END.
      SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall,
                                 CallSeqStart.getNode()->getOperand(1),
                                 SDLoc(MemcpyCall));
      DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
                             NewCallSeqStart.getNode());
      Chain = CallSeqStart = NewCallSeqStart;

      // Pass the address of the aggregate copy on the stack either in a
      // physical register or in the parameter list area of the current stack
      // frame to the callee.
      Arg = PtrOff;
    }

    if (VA.isRegLoc()) {
      if (Arg.getValueType() == MVT::i1)
        Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Arg);

      seenFloatArg |= VA.getLocVT().isFloatingPoint();
      // Put argument in a physical register.
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
    } else {
      // Put argument in the parameter list area of the current stack frame.
      assert(VA.isMemLoc());
      unsigned LocMemOffset = VA.getLocMemOffset();

      if (!isTailCall) {
        SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
        PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);

        MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff,
                                           MachinePointerInfo(),
                                           false, false, 0));
      } else {
        // Calculate and remember argument location.
        CalculateTailCallArgDest(DAG, MF, false, Arg, SPDiff, LocMemOffset,
                                 TailCallArguments);
      }
    }
  }

  if (!MemOpChains.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into the appropriate regs.
  SDValue InFlag;
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
                             RegsToPass[i].second, InFlag);
    InFlag = Chain.getValue(1);
  }

  // Set CR bit 6 to true if this is a vararg call with floating args passed in
  // registers.
  if (isVarArg) {
    SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
    SDValue Ops[] = { Chain, InFlag };

    Chain = DAG.getNode(seenFloatArg ? PPCISD::CR6SET : PPCISD::CR6UNSET,
                        dl, VTs, makeArrayRef(Ops, InFlag.getNode() ? 2 : 1));

    InFlag = Chain.getValue(1);
  }
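In the 32-bit SVR4 ABI, CR bit 6 tells a vararg callee whether any floating-point arguments were passed in registers, so its prologue knows whether the FP argument registers must be spilled for va_arg. A hypothetical caller-side illustration in plain C, not from this file:

  //   printf("%f\n", x);   // double passed in an FPR -> PPCISD::CR6SET
  //   printf("%d\n", n);   // no FP args in registers -> PPCISD::CR6UNSET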

  if (isTailCall)
    PrepareTailCall(DAG, InFlag, Chain, dl, false, SPDiff, NumBytes, LROp, FPOp,
                    false, TailCallArguments);

  return FinishCall(CallConv, dl, isTailCall, isVarArg, DAG,
                    RegsToPass, InFlag, Chain, Callee, SPDiff, NumBytes,
                    Ins, InVals);
}

// Copy an argument into memory, being careful to do this outside the
// call sequence for the call to which the argument belongs.
SDValue
PPCTargetLowering::createMemcpyOutsideCallSeq(SDValue Arg, SDValue PtrOff,
                                              SDValue CallSeqStart,
                                              ISD::ArgFlagsTy Flags,
                                              SelectionDAG &DAG,
                                              SDLoc dl) const {
  SDValue MemcpyCall = CreateCopyOfByValArgument(Arg, PtrOff,
                        CallSeqStart.getNode()->getOperand(0),
                        Flags, DAG, dl);
  // The MEMCPY must go outside the CALLSEQ_START..END.
  SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall,
                             CallSeqStart.getNode()->getOperand(1),
                             SDLoc(MemcpyCall));
  DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
                         NewCallSeqStart.getNode());
  return NewCallSeqStart;
}

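This helper is used wherever a by-value copy would otherwise be emitted between CALLSEQ_START and CALLSEQ_END; a usage sketch mirroring the call site in LowerCall_64SVR4 below:

  //   Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr,
  //                                                     CallSeqStart,
  //                                                     Flags, DAG, dl);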
SDValue
PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
                                    CallingConv::ID CallConv, bool isVarArg,
                                    bool isTailCall,
                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
                                    const SmallVectorImpl<SDValue> &OutVals,
                                    const SmallVectorImpl<ISD::InputArg> &Ins,
                                    SDLoc dl, SelectionDAG &DAG,
                                    SmallVectorImpl<SDValue> &InVals) const {
  bool isLittleEndian = Subtarget.isLittleEndian();
  unsigned NumOps = Outs.size();

  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
  unsigned PtrByteSize = 8;

  MachineFunction &MF = DAG.getMachineFunction();

  // Mark this function as potentially containing a function that contains a
  // tail call. As a consequence, the frame pointer will be used for dynamic
  // allocation and for restoring the caller's stack pointer in this function's
  // epilogue. This is done because, by tail calling, the called function might
  // overwrite the value in this function's (MF) stack pointer stack slot
  // 0(SP).
  if (getTargetMachine().Options.GuaranteedTailCallOpt &&
      CallConv == CallingConv::Fast)
    MF.getInfo<PPCFunctionInfo>()->setHasFastCall();

  // Count how many bytes are to be pushed on the stack, including the linkage
  // area, and parameter passing area. We start with at least 48 bytes, which
  // is reserved space for [SP][CR][LR][3 x unused].
  unsigned LinkageSize = PPCFrameLowering::getLinkageSize(true, false);
  unsigned NumBytes = LinkageSize;

  // Add up all the space actually used.
  for (unsigned i = 0; i != NumOps; ++i) {
    ISD::ArgFlagsTy Flags = Outs[i].Flags;
    EVT ArgVT = Outs[i].VT;

    /* Respect alignment of argument on the stack. */
    unsigned Align = CalculateStackSlotAlignment(ArgVT, Flags, PtrByteSize);
    NumBytes = ((NumBytes + Align - 1) / Align) * Align;

    NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
  }
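The expression ((NumBytes + Align - 1) / Align) * Align rounds NumBytes up to the next multiple of Align using unsigned integer division; a minimal standalone sketch with hypothetical values, not part of the file:

  // Hypothetical helper equivalent to the inline expression above.
  // Example: NumBytes = 52, Align = 16 -> (52 + 15) / 16 = 4; 4 * 16 = 64.
  static unsigned roundUpToAlignment(unsigned NumBytes, unsigned Align) {
    return ((NumBytes + Align - 1) / Align) * Align;
  }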

  unsigned NumBytesActuallyUsed = NumBytes;

  // The prolog code of the callee may store up to 8 GPR argument registers to
  // the stack, allowing va_start to index over them in memory if it is
  // varargs. Because we cannot tell if this is needed on the caller side, we
  // have to conservatively assume that it is needed. As such, make sure we
  // have at least enough stack space for the caller to store the 8 GPRs.
  NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize);

  // Tail call needs the stack to be aligned.
  if (getTargetMachine().Options.GuaranteedTailCallOpt &&
      CallConv == CallingConv::Fast)
    NumBytes = EnsureStackAlignment(MF.getTarget(), NumBytes);

  // Calculate by how many bytes the stack has to be adjusted in case of tail
  // call optimization.
  int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);

  // To protect arguments on the stack from being clobbered in a tail call,
  // force all the loads to happen before doing any other lowering.
  if (isTailCall)
    Chain = DAG.getStackArgumentTokenFactor(Chain);

  // Adjust the stack pointer for the new arguments...
  // These operations are automatically eliminated by the prolog/epilog pass
  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true),
                               dl);
  SDValue CallSeqStart = Chain;

  // Load the return address and frame pointer so they can be moved somewhere
  // else later.
  SDValue LROp, FPOp;
  Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, true,
                                       dl);

  // Set up a copy of the stack pointer for use loading and storing any
  // arguments that may not fit in the registers available for argument
  // passing.
  SDValue StackPtr = DAG.getRegister(PPC::X1, MVT::i64);

  // Figure out which arguments are going to go in registers, and which in
  // memory. Also, if this is a vararg function, floating point operations
  // must be stored to our stack, and loaded into integer regs as well, if
  // any integer regs are available for argument passing.
  unsigned ArgOffset = LinkageSize;
  unsigned GPR_idx, FPR_idx = 0, VR_idx = 0;

  static const MCPhysReg GPR[] = {
    PPC::X3, PPC::X4, PPC::X5, PPC::X6,
    PPC::X7, PPC::X8, PPC::X9, PPC::X10,
  };
  static const MCPhysReg *FPR = GetFPR();

  static const MCPhysReg VR[] = {
    PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
    PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
  };
  static const MCPhysReg VSRH[] = {
    PPC::VSH2, PPC::VSH3, PPC::VSH4, PPC::VSH5, PPC::VSH6, PPC::VSH7, PPC::VSH8,
    PPC::VSH9, PPC::VSH10, PPC::VSH11, PPC::VSH12, PPC::VSH13
  };

  const unsigned NumGPRs = array_lengthof(GPR);
  const unsigned NumFPRs = 13;
  const unsigned NumVRs = array_lengthof(VR);

  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
  SmallVector<TailCallArgumentInfo, 8> TailCallArguments;

  SmallVector<SDValue, 8> MemOpChains;
  for (unsigned i = 0; i != NumOps; ++i) {
    SDValue Arg = OutVals[i];
    ISD::ArgFlagsTy Flags = Outs[i].Flags;

    /* Respect alignment of argument on the stack. */
    unsigned Align =
      CalculateStackSlotAlignment(Outs[i].VT, Flags, PtrByteSize);
    ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;

    /* Compute GPR index associated with argument offset. */
    GPR_idx = (ArgOffset - LinkageSize) / PtrByteSize;
    GPR_idx = std::min(GPR_idx, NumGPRs);
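Each doubleword of the parameter save area maps to one of the eight argument GPRs, so the register index falls directly out of the offset; a worked example with hypothetical numbers:

    //   ArgOffset = 64, LinkageSize = 48, PtrByteSize = 8:
    //   GPR_idx = (64 - 48) / 8 = 2, i.e. the third register, GPR[2] == PPC::X5.
    //   Past the eighth doubleword, std::min clamps GPR_idx to NumGPRs, meaning
    //   "no GPR available; the argument lives in memory".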

    // PtrOff will be used to store the current argument to the stack if a
    // register cannot be found for it.
    SDValue PtrOff;

    PtrOff = DAG.getConstant(ArgOffset, StackPtr.getValueType());
    PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);

    // Promote integers to 64-bit values.
|
|
|
    if (Arg.getValueType() == MVT::i32 || Arg.getValueType() == MVT::i1) {
      // FIXME: Should this use ANY_EXTEND if neither sext nor zext?
      unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
      Arg = DAG.getNode(ExtOp, dl, MVT::i64, Arg);
    }

    // FIXME memcpy is used way more than necessary.  Correctness first.
    // Note: "by value" is code for passing a structure by value, not
    // basic types.
    if (Flags.isByVal()) {
      // Note: Size includes alignment padding, so
      //   struct x { short a; char b; }
      // will have Size = 4.  With #pragma pack(1), it will have Size = 3.
      // These are the proper values we need for right-justifying the
      // aggregate in a parameter register.
      unsigned Size = Flags.getByValSize();

      // An empty aggregate parameter takes up no storage and no
      // registers.
      if (Size == 0)
        continue;

      // All aggregates smaller than 8 bytes must be passed right-justified.
      if (Size==1 || Size==2 || Size==4) {
        EVT VT = (Size==1) ? MVT::i8 : ((Size==2) ? MVT::i16 : MVT::i32);
        if (GPR_idx != NumGPRs) {
          SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg,
                                        MachinePointerInfo(), VT,
                                        false, false, 0);
          MemOpChains.push_back(Load.getValue(1));
          RegsToPass.push_back(std::make_pair(GPR[GPR_idx], Load));

          ArgOffset += PtrByteSize;
          continue;
        }
      }
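      // For illustration: on big-endian, the i16 EXTLOAD above leaves a
      // 2-byte aggregate in the low-order bits of the 64-bit GPR, which is
      // exactly the right-justified form the ABI requires for small
      // aggregates.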

      if (GPR_idx == NumGPRs && Size < 8) {
        SDValue AddPtr = PtrOff;
        if (!isLittleEndian) {
          SDValue Const = DAG.getConstant(PtrByteSize - Size,
                                          PtrOff.getValueType());
          AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
        }
        Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr,
                                                          CallSeqStart,
                                                          Flags, DAG, dl);
        ArgOffset += PtrByteSize;
        continue;
      }
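      // For illustration: with PtrByteSize = 8, a 3-byte aggregate on
      // big-endian is copied to AddPtr = PtrOff + (8 - 3), i.e. byte offset 5
      // within its doubleword slot, so it occupies the low-order (rightmost)
      // bytes, as the 64-bit ELF ABI requires (see PR13949).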

      // Copy entire object into memory.  There are cases where gcc-generated
      // code assumes it is there, even if it could be put entirely into
      // registers.  (This is not what the doc says.)

      // FIXME: The above statement is likely due to a misunderstanding of the
      // documents.  All arguments must be copied into the parameter area BY
      // THE CALLEE in the event that the callee takes the address of any
      // formal argument.  That has not yet been implemented.  However, it is
      // reasonable to use the stack area as a staging area for the register
      // load.

      // Skip this for small aggregates, as we will use the same slot for a
      // right-justified copy, below.
      if (Size >= 8)
        Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff,
                                                          CallSeqStart,
                                                          Flags, DAG, dl);

      // When a register is available, pass a small aggregate right-justified.
      if (Size < 8 && GPR_idx != NumGPRs) {
        // The easiest way to get this right-justified in a register
        // is to copy the structure into the rightmost portion of a
        // local variable slot, then load the whole slot into the
        // register.
        // FIXME: The memcpy seems to produce pretty awful code for
        // small aggregates, particularly for packed ones.
        // FIXME: It would be preferable to use the slot in the
        // parameter save area instead of a new local variable.
        SDValue AddPtr = PtrOff;
        if (!isLittleEndian) {
          SDValue Const = DAG.getConstant(8 - Size, PtrOff.getValueType());
          AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
        }
        Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr,
                                                          CallSeqStart,
                                                          Flags, DAG, dl);

        // Load the slot into the register.
        SDValue Load = DAG.getLoad(PtrVT, dl, Chain, PtrOff,
                                   MachinePointerInfo(),
                                   false, false, false, 0);
        MemOpChains.push_back(Load.getValue(1));
        RegsToPass.push_back(std::make_pair(GPR[GPR_idx], Load));

        // Done with this argument.
        ArgOffset += PtrByteSize;
        continue;
      }

      // For aggregates larger than PtrByteSize, copy the pieces of the
      // object that fit into registers from the parameter save area.
      for (unsigned j = 0; j < Size; j += PtrByteSize) {
        SDValue Const = DAG.getConstant(j, PtrOff.getValueType());
        SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
        if (GPR_idx != NumGPRs) {
          SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg,
                                     MachinePointerInfo(),
                                     false, false, false, 0);
          MemOpChains.push_back(Load.getValue(1));
          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
          ArgOffset += PtrByteSize;
        } else {
          ArgOffset += ((Size - j + PtrByteSize - 1) / PtrByteSize) *
                       PtrByteSize;
          break;
        }
      }
      continue;
    }
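    // For illustration: a 24-byte by-value aggregate with PtrByteSize = 8
    // takes three iterations of the copy loop above; if only one GPR is
    // left, the first doubleword goes in that register and the remaining
    // 16 bytes are accounted for on the stack in a single step.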

    switch (Arg.getSimpleValueType().SimpleTy) {
    default: llvm_unreachable("Unexpected ValueType for argument!");
    case MVT::i1:
    case MVT::i32:
    case MVT::i64:
      if (GPR_idx != NumGPRs) {
        RegsToPass.push_back(std::make_pair(GPR[GPR_idx], Arg));
      } else {
        LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
                         true, isTailCall, false, MemOpChains,
                         TailCallArguments, dl);
      }
      ArgOffset += PtrByteSize;
      break;
    case MVT::f32:
    case MVT::f64:
      if (FPR_idx != NumFPRs) {
        RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg));

        if (isVarArg) {
          // A single float or an aggregate containing only a single float
          // must be passed right-justified in the stack doubleword, and
          // in the GPR, if one is available.
          SDValue StoreOff;
          if (Arg.getSimpleValueType().SimpleTy == MVT::f32 &&
              !isLittleEndian) {
            SDValue ConstFour = DAG.getConstant(4, PtrOff.getValueType());
            StoreOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
          } else
            StoreOff = PtrOff;

          SDValue Store = DAG.getStore(Chain, dl, Arg, StoreOff,
                                       MachinePointerInfo(), false, false, 0);
          MemOpChains.push_back(Store);

          // Float varargs are always shadowed in available integer registers.
          if (GPR_idx != NumGPRs) {
            SDValue Load = DAG.getLoad(PtrVT, dl, Store, PtrOff,
                                       MachinePointerInfo(), false, false,
                                       false, 0);
            MemOpChains.push_back(Load.getValue(1));
            RegsToPass.push_back(std::make_pair(GPR[GPR_idx], Load));
          }
        }
      } else {
        // Single-precision floating-point values are mapped to the
        // second (rightmost) word of the stack doubleword.
        if (Arg.getValueType() == MVT::f32 && !isLittleEndian) {
          SDValue ConstFour = DAG.getConstant(4, PtrOff.getValueType());
          PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
        }

        LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
                         true, isTailCall, false, MemOpChains,
                         TailCallArguments, dl);
      }
      ArgOffset += 8;
      break;
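      // For illustration: on big-endian, an f32 whose slot starts at
      // ArgOffset = 96 is stored at 96 + 4 = 100, the rightmost word of its
      // doubleword; on little-endian the low-order bytes already sit at 96,
      // so no adjustment is made.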
    case MVT::v4f32:
    case MVT::v4i32:
    case MVT::v8i16:
    case MVT::v16i8:
    case MVT::v2f64:
    case MVT::v2i64:
      // For a varargs call, named arguments go into VRs or on the stack as
      // usual; unnamed arguments always go to the stack or the corresponding
      // GPRs when within range.  For now, we always put the value in both
      // locations (or even all three).
      if (isVarArg) {
        // We could elide this store in the case where the object fits
        // entirely in R registers.  Maybe later.
        SDValue Store = DAG.getStore(Chain, dl, Arg, PtrOff,
                                     MachinePointerInfo(), false, false, 0);
        MemOpChains.push_back(Store);
        if (VR_idx != NumVRs) {
          SDValue Load = DAG.getLoad(MVT::v4f32, dl, Store, PtrOff,
                                     MachinePointerInfo(),
                                     false, false, false, 0);
          MemOpChains.push_back(Load.getValue(1));

          unsigned VReg = (Arg.getSimpleValueType() == MVT::v2f64 ||
                           Arg.getSimpleValueType() == MVT::v2i64) ?
                          VSRH[VR_idx] : VR[VR_idx];
          ++VR_idx;

          RegsToPass.push_back(std::make_pair(VReg, Load));
        }
        ArgOffset += 16;
        for (unsigned i = 0; i < 16; i += PtrByteSize) {
          if (GPR_idx == NumGPRs)
            break;
          SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff,
                                   DAG.getConstant(i, PtrVT));
          SDValue Load = DAG.getLoad(PtrVT, dl, Store, Ix,
                                     MachinePointerInfo(),
                                     false, false, false, 0);
          MemOpChains.push_back(Load.getValue(1));
          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
        }
        break;
      }
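      // For illustration: a varargs v4i32 may thus be live in a vector
      // register, in two GPRs (with PtrByteSize = 8), and in its 16-byte
      // stack slot all at once, so the callee's va_arg works whichever
      // location it reads.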

      // Non-varargs Altivec params go into VRs or on the stack.
      if (VR_idx != NumVRs) {
        unsigned VReg = (Arg.getSimpleValueType() == MVT::v2f64 ||
                         Arg.getSimpleValueType() == MVT::v2i64) ?
                        VSRH[VR_idx] : VR[VR_idx];
        ++VR_idx;

        RegsToPass.push_back(std::make_pair(VReg, Arg));
      } else {
        LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
                         true, isTailCall, true, MemOpChains,
                         TailCallArguments, dl);
      }
      ArgOffset += 16;
      break;
    }
  }

  assert(NumBytesActuallyUsed == ArgOffset);
  (void)NumBytesActuallyUsed;

  if (!MemOpChains.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);

  // Check if this is an indirect call (MTCTR/BCTRL).
  // See PrepareCall() for more information about calls through function
  // pointers in the 64-bit SVR4 ABI.
  if (!isTailCall &&
      !dyn_cast<GlobalAddressSDNode>(Callee) &&
      !dyn_cast<ExternalSymbolSDNode>(Callee)) {
    // Load r2 into a virtual register and store it to the TOC save area.
    SDValue Val = DAG.getCopyFromReg(Chain, dl, PPC::X2, MVT::i64);
    // TOC save area offset.
    unsigned TOCSaveOffset = PPCFrameLowering::getTOCSaveOffset();
    SDValue PtrOff = DAG.getIntPtrConstant(TOCSaveOffset);
    SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
    Chain = DAG.getStore(Val.getValue(1), dl, Val, AddPtr,
                         MachinePointerInfo(), false, false, 0);
  }
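  // Note: getTOCSaveOffset() abstracts the ABI-specific slot; under the
  // 64-bit ELFv1 ABI this is understood to be the doubleword at 40(r1) in
  // the linkage area, from which r2 is reloaded after the BCTRL.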

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into the appropriate regs.
  SDValue InFlag;
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
                             RegsToPass[i].second, InFlag);
    InFlag = Chain.getValue(1);
  }

  if (isTailCall)
    PrepareTailCall(DAG, InFlag, Chain, dl, true, SPDiff, NumBytes, LROp,
                    FPOp, true, TailCallArguments);

  return FinishCall(CallConv, dl, isTailCall, isVarArg, DAG,
                    RegsToPass, InFlag, Chain, Callee, SPDiff, NumBytes,
                    Ins, InVals);
}

SDValue
PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
                                    CallingConv::ID CallConv, bool isVarArg,
                                    bool isTailCall,
                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
                                    const SmallVectorImpl<SDValue> &OutVals,
                                    const SmallVectorImpl<ISD::InputArg> &Ins,
                                    SDLoc dl, SelectionDAG &DAG,
                                    SmallVectorImpl<SDValue> &InVals) const {

  unsigned NumOps = Outs.size();

  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
  bool isPPC64 = PtrVT == MVT::i64;
  unsigned PtrByteSize = isPPC64 ? 8 : 4;

  MachineFunction &MF = DAG.getMachineFunction();

  // Mark this function as potentially containing a call that might be
  // tail-called. As a consequence, the frame pointer will be used for
  // dynamic alloc and for restoring the caller's stack pointer in this
  // function's epilog. This is done because the tail-called function might
  // overwrite the value in this function's (MF) stack pointer stack slot
  // 0(SP).
  if (getTargetMachine().Options.GuaranteedTailCallOpt &&
      CallConv == CallingConv::Fast)
    MF.getInfo<PPCFunctionInfo>()->setHasFastCall();

  // Count how many bytes are to be pushed on the stack, including the linkage
  // area and the parameter passing area. We start with 24/48 bytes, which is
  // pre-reserved space for [SP][CR][LR][3 x unused].
  unsigned LinkageSize = PPCFrameLowering::getLinkageSize(isPPC64, true);
  unsigned NumBytes = LinkageSize;

  // Add up all the space actually used.
  // In 32-bit non-varargs calls, Altivec parameters all go at the end;
  // usually they all go in registers, but we must reserve stack space for
  // them for possible use by the caller. In varargs or 64-bit calls,
  // parameters are assigned stack space in order, with padding so Altivec
  // parameters are 16-byte aligned.
  unsigned nAltivecParamsAtEnd = 0;
  for (unsigned i = 0; i != NumOps; ++i) {
    ISD::ArgFlagsTy Flags = Outs[i].Flags;
    EVT ArgVT = Outs[i].VT;
    // Varargs Altivec parameters are padded to a 16-byte boundary.
    if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
        ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
        ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64) {
      if (!isVarArg && !isPPC64) {
        // Non-varargs Altivec parameters go after all the non-Altivec
        // parameters; handle those later so we know how much padding we need.
        nAltivecParamsAtEnd++;
        continue;
      }
      // Varargs and 64-bit Altivec parameters are padded to a 16-byte
      // boundary.
      NumBytes = ((NumBytes + 15) / 16) * 16;
    }
    NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
  }

  // Allow for Altivec parameters at the end, if needed.
  if (nAltivecParamsAtEnd) {
    NumBytes = ((NumBytes + 15) / 16) * 16;
    NumBytes += 16 * nAltivecParamsAtEnd;
  }

  // The prolog code of the callee may store up to 8 GPR argument registers
  // to the stack, allowing va_start to index over them in memory if it is
  // varargs. Because we cannot tell if this is needed on the caller side,
  // we have to conservatively assume that it is needed. As such, make sure
  // we have at least enough stack space for the caller to store the 8 GPRs.
  NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize);
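  // Worked example: on 64-bit Darwin, LinkageSize is 48, so every call
  // reserves at least 48 + 8 * 8 = 112 bytes; on 32-bit, 24 + 8 * 4 = 56
  // bytes, even for a callee that takes no arguments.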

  // Tail calls need the stack to be aligned.
  if (getTargetMachine().Options.GuaranteedTailCallOpt &&
      CallConv == CallingConv::Fast)
    NumBytes = EnsureStackAlignment(MF.getTarget(), NumBytes);

  // Calculate by how many bytes the stack has to be adjusted in case of tail
  // call optimization.
  int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);

  // To protect arguments on the stack from being clobbered in a tail call,
  // force all the loads to happen before doing any other lowering.
  if (isTailCall)
    Chain = DAG.getStackArgumentTokenFactor(Chain);

  // Adjust the stack pointer for the new arguments...
  // These operations are automatically eliminated by the prolog/epilog pass.
  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true),
                               dl);
  SDValue CallSeqStart = Chain;

  // Load the return address and frame pointer so they can be moved somewhere
  // else later.
  SDValue LROp, FPOp;
  Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, true,
                                       dl);

  // Set up a copy of the stack pointer for use loading and storing any
  // arguments that may not fit in the registers available for argument
  // passing.
  SDValue StackPtr;
  if (isPPC64)
    StackPtr = DAG.getRegister(PPC::X1, MVT::i64);
  else
    StackPtr = DAG.getRegister(PPC::R1, MVT::i32);

  // Figure out which arguments are going to go in registers, and which in
  // memory. Also, if this is a vararg function, floating point operations
  // must be stored to our stack, and loaded into integer regs as well, if
  // any integer regs are available for argument passing.
  unsigned ArgOffset = LinkageSize;
  unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;

  static const MCPhysReg GPR_32[] = {           // 32-bit registers.
    PPC::R3, PPC::R4, PPC::R5, PPC::R6,
    PPC::R7, PPC::R8, PPC::R9, PPC::R10,
  };
  static const MCPhysReg GPR_64[] = {           // 64-bit registers.
    PPC::X3, PPC::X4, PPC::X5, PPC::X6,
    PPC::X7, PPC::X8, PPC::X9, PPC::X10,
  };
  static const MCPhysReg *FPR = GetFPR();

  static const MCPhysReg VR[] = {
    PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
    PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
  };
  const unsigned NumGPRs = array_lengthof(GPR_32);
  const unsigned NumFPRs = 13;
  const unsigned NumVRs = array_lengthof(VR);

  const MCPhysReg *GPR = isPPC64 ? GPR_64 : GPR_32;
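  // Note: GPR[0] is R3 (X3 in 64-bit mode), so GPR_idx mirrors the ABI rule
  // that the first eight integer arguments travel in r3..r10; FPR and VR
  // are indexed the same way for f1..f13 and v2..v13.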

  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
  SmallVector<TailCallArgumentInfo, 8> TailCallArguments;

  SmallVector<SDValue, 8> MemOpChains;
  for (unsigned i = 0; i != NumOps; ++i) {
    SDValue Arg = OutVals[i];
    ISD::ArgFlagsTy Flags = Outs[i].Flags;

    // PtrOff will be used to store the current argument to the stack if a
    // register cannot be found for it.
    SDValue PtrOff;

    PtrOff = DAG.getConstant(ArgOffset, StackPtr.getValueType());
    PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);

    // On PPC64, promote integers to 64-bit values.
    if (isPPC64 && Arg.getValueType() == MVT::i32) {
      // FIXME: Should this use ANY_EXTEND if neither sext nor zext?
      unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
      Arg = DAG.getNode(ExtOp, dl, MVT::i64, Arg);
    }

    // FIXME memcpy is used way more than necessary.  Correctness first.
    // Note: "by value" is code for passing a structure by value, not
    // basic types.
    if (Flags.isByVal()) {
      unsigned Size = Flags.getByValSize();
      // Very small objects are passed right-justified.  Everything else is
      // passed left-justified.
      if (Size==1 || Size==2) {
        EVT VT = (Size==1) ? MVT::i8 : MVT::i16;
        if (GPR_idx != NumGPRs) {
          SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg,
                                        MachinePointerInfo(), VT,
                                        false, false, 0);
          MemOpChains.push_back(Load.getValue(1));
          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));

          ArgOffset += PtrByteSize;
        } else {
          SDValue Const = DAG.getConstant(PtrByteSize - Size,
                                          PtrOff.getValueType());
          SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
          Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, AddPtr,
                                                            CallSeqStart,
                                                            Flags, DAG, dl);
          ArgOffset += PtrByteSize;
        }
        continue;
      }
      // Copy entire object into memory.  There are cases where gcc-generated
      // code assumes it is there, even if it could be put entirely into
      // registers.  (This is not what the doc says.)
      Chain = CallSeqStart = createMemcpyOutsideCallSeq(Arg, PtrOff,
                                                        CallSeqStart,
                                                        Flags, DAG, dl);

      // For small aggregates (Darwin only) and aggregates >= PtrByteSize,
      // copy the pieces of the object that fit into registers from the
      // parameter save area.
      for (unsigned j = 0; j < Size; j += PtrByteSize) {
        SDValue Const = DAG.getConstant(j, PtrOff.getValueType());
        SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
        if (GPR_idx != NumGPRs) {
          SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg,
                                     MachinePointerInfo(),
                                     false, false, false, 0);
          MemOpChains.push_back(Load.getValue(1));
          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
          ArgOffset += PtrByteSize;
        } else {
          ArgOffset += ((Size - j + PtrByteSize - 1) / PtrByteSize) *
                       PtrByteSize;
          break;
        }
      }
      continue;
    }

    switch (Arg.getSimpleValueType().SimpleTy) {
    default: llvm_unreachable("Unexpected ValueType for argument!");
    case MVT::i1:
    case MVT::i32:
    case MVT::i64:
      if (GPR_idx != NumGPRs) {
        if (Arg.getValueType() == MVT::i1)
          Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, PtrVT, Arg);

        RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg));
      } else {
        LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
                         isPPC64, isTailCall, false, MemOpChains,
                         TailCallArguments, dl);
      }
      ArgOffset += PtrByteSize;
      break;
    case MVT::f32:
    case MVT::f64:
      if (FPR_idx != NumFPRs) {
        RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg));

        if (isVarArg) {
          SDValue Store = DAG.getStore(Chain, dl, Arg, PtrOff,
                                       MachinePointerInfo(), false, false, 0);
          MemOpChains.push_back(Store);

          // Float varargs are always shadowed in available integer
          // registers.
          if (GPR_idx != NumGPRs) {
            SDValue Load = DAG.getLoad(PtrVT, dl, Store, PtrOff,
                                       MachinePointerInfo(), false, false,
                                       false, 0);
            MemOpChains.push_back(Load.getValue(1));
            RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
          }
          if (GPR_idx != NumGPRs && Arg.getValueType() == MVT::f64 &&
              !isPPC64) {
            SDValue ConstFour = DAG.getConstant(4, PtrOff.getValueType());
            PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
            SDValue Load = DAG.getLoad(PtrVT, dl, Store, PtrOff,
                                       MachinePointerInfo(),
                                       false, false, false, 0);
            MemOpChains.push_back(Load.getValue(1));
            RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
          }
        } else {
          // If we have any FPRs remaining, we may also have GPRs remaining.
          // Args passed in FPRs consume either 1 (f32) or 2 (f64) available
          // GPRs.
          if (GPR_idx != NumGPRs)
            ++GPR_idx;
          if (GPR_idx != NumGPRs && Arg.getValueType() == MVT::f64 &&
              !isPPC64)  // PPC64 has 64-bit GPRs, obviously. :)
            ++GPR_idx;
        }
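        // For illustration: a prototyped f64 passed in an FPR on 32-bit
        // Darwin still consumes two GPR slots of shadow space here, so a
        // following integer argument starts no earlier than r5.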
      } else
        LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
                         isPPC64, isTailCall, false, MemOpChains,
                         TailCallArguments, dl);
      if (isPPC64)
        ArgOffset += 8;
      else
        ArgOffset += Arg.getValueType() == MVT::f32 ? 4 : 8;
      break;
    case MVT::v4f32:
    case MVT::v4i32:
    case MVT::v8i16:
    case MVT::v16i8:
      if (isVarArg) {
        // These go aligned on the stack, or in the corresponding R registers
        // when within range.  The Darwin PPC ABI doc claims they also go in
        // V registers; in fact gcc does this only for arguments that are
        // prototyped, not for those that match the "...".  We do it for all
        // arguments; this seems to work.
        while (ArgOffset % 16 != 0) {
          ArgOffset += PtrByteSize;
          if (GPR_idx != NumGPRs)
            GPR_idx++;
        }
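        // Worked example: with ArgOffset = 56 on 64-bit Darwin, one 8-byte
        // step aligns it to 64 and skips one shadowing GPR; on 32-bit, up
        // to three 4-byte steps may be needed to reach a 16-byte boundary.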
        // We could elide this store in the case where the object fits
        // entirely in R registers.  Maybe later.
        PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr,
                             DAG.getConstant(ArgOffset, PtrVT));
        SDValue Store = DAG.getStore(Chain, dl, Arg, PtrOff,
                                     MachinePointerInfo(), false, false, 0);
        MemOpChains.push_back(Store);
        if (VR_idx != NumVRs) {
          SDValue Load = DAG.getLoad(MVT::v4f32, dl, Store, PtrOff,
                                     MachinePointerInfo(),
                                     false, false, false, 0);
          MemOpChains.push_back(Load.getValue(1));
          RegsToPass.push_back(std::make_pair(VR[VR_idx++], Load));
        }
        ArgOffset += 16;
        for (unsigned i = 0; i < 16; i += PtrByteSize) {
          if (GPR_idx == NumGPRs)
            break;
          SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff,
                                   DAG.getConstant(i, PtrVT));
          SDValue Load = DAG.getLoad(PtrVT, dl, Store, Ix,
                                     MachinePointerInfo(),
                                     false, false, false, 0);
          MemOpChains.push_back(Load.getValue(1));
          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
        }
        break;
      }

      // Non-varargs Altivec params generally go in registers, but have
      // stack space allocated at the end.
      if (VR_idx != NumVRs) {
        // Doesn't have GPR space allocated.
        RegsToPass.push_back(std::make_pair(VR[VR_idx++], Arg));
      } else if (nAltivecParamsAtEnd == 0) {
        // We are emitting Altivec params in order.
        LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
                         isPPC64, isTailCall, true, MemOpChains,
                         TailCallArguments, dl);
        ArgOffset += 16;
      }
      break;
    }
  }
  // If all Altivec parameters fit in registers, as they usually do,
  // they get stack space following the non-Altivec parameters.  We
  // don't track this here because nobody below needs it.
  // If there are more Altivec parameters than fit in registers, emit
  // the stores here.
  if (!isVarArg && nAltivecParamsAtEnd > NumVRs) {
    unsigned j = 0;
    // Offset is aligned; skip the first 12 params, which go in V registers.
    ArgOffset = ((ArgOffset + 15) / 16) * 16;
    ArgOffset += 12 * 16;
    for (unsigned i = 0; i != NumOps; ++i) {
      SDValue Arg = OutVals[i];
      EVT ArgType = Outs[i].VT;
      if (ArgType == MVT::v4f32 || ArgType == MVT::v4i32 ||
          ArgType == MVT::v8i16 || ArgType == MVT::v16i8) {
        if (++j > NumVRs) {
          SDValue PtrOff;
          // We are emitting Altivec params in order.
          LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
                           isPPC64, isTailCall, true, MemOpChains,
                           TailCallArguments, dl);
          ArgOffset += 16;
        }
      }
    }
  }

  if (!MemOpChains.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);

  // On Darwin, R12 must contain the address of an indirect callee.  This
  // does not mean the MTCTR instruction must use R12; it's easier to model
  // this as an extra parameter, so do that.
  if (!isTailCall &&
      !dyn_cast<GlobalAddressSDNode>(Callee) &&
      !dyn_cast<ExternalSymbolSDNode>(Callee) &&
      !isBLACompatibleAddress(Callee, DAG))
    RegsToPass.push_back(std::make_pair((unsigned)(isPPC64 ? PPC::X12 :
                                                   PPC::R12), Callee));

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into the appropriate regs.
  SDValue InFlag;
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
                             RegsToPass[i].second, InFlag);
    InFlag = Chain.getValue(1);
  }

  if (isTailCall)
    PrepareTailCall(DAG, InFlag, Chain, dl, isPPC64, SPDiff, NumBytes, LROp,
                    FPOp, true, TailCallArguments);

  return FinishCall(CallConv, dl, isTailCall, isVarArg, DAG,
                    RegsToPass, InFlag, Chain, Callee, SPDiff, NumBytes,
                    Ins, InVals);
}

bool
PPCTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
                                  MachineFunction &MF, bool isVarArg,
                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
                                  LLVMContext &Context) const {
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(),
                 RVLocs, Context);
  return CCInfo.CheckReturn(Outs, RetCC_PPC);
}

SDValue
PPCTargetLowering::LowerReturn(SDValue Chain,
                               CallingConv::ID CallConv, bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               const SmallVectorImpl<SDValue> &OutVals,
                               SDLoc dl, SelectionDAG &DAG) const {

  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
                 getTargetMachine(), RVLocs, *DAG.getContext());
  CCInfo.AnalyzeReturn(Outs, RetCC_PPC);

  SDValue Flag;
  SmallVector<SDValue, 4> RetOps(1, Chain);

  // Copy the result values into the output registers.
  for (unsigned i = 0; i != RVLocs.size(); ++i) {
    CCValAssign &VA = RVLocs[i];
    assert(VA.isRegLoc() && "Can only return in registers!");

    SDValue Arg = OutVals[i];

    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::AExt:
      Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
      break;
    case CCValAssign::ZExt:
      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
      break;
    case CCValAssign::SExt:
      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
      break;
    }

    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
    Flag = Chain.getValue(1);
    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
  }
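  // For illustration: an i1 or i8 return value is widened here to the i32
  // or i64 location type chosen by AnalyzeReturn, using the sign-, zero-,
  // or any-extend indicated by the return value's attributes.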

  RetOps[0] = Chain;  // Update chain.

  // Add the flag if we have it.
  if (Flag.getNode())
    RetOps.push_back(Flag);

  return DAG.getNode(PPCISD::RET_FLAG, dl, MVT::Other, RetOps);
}

SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG,
                                      const PPCSubtarget &Subtarget) const {
  // When we pop the dynamic allocation we need to restore the SP link.
  SDLoc dl(Op);

  // Get the correct type for pointers.
  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();

  // Construct the stack pointer operand.
  bool isPPC64 = Subtarget.isPPC64();
  unsigned SP = isPPC64 ? PPC::X1 : PPC::R1;
  SDValue StackPtr = DAG.getRegister(SP, PtrVT);

  // Get the operands for the STACKRESTORE.
  SDValue Chain = Op.getOperand(0);
  SDValue SaveSP = Op.getOperand(1);

  // Load the old link SP.
  SDValue LoadLinkSP = DAG.getLoad(PtrVT, dl, Chain, StackPtr,
                                   MachinePointerInfo(),
                                   false, false, false, 0);

  // Restore the stack pointer.
  Chain = DAG.getCopyToReg(LoadLinkSP.getValue(1), dl, SP, SaveSP);

  // Store the old link SP.
  return DAG.getStore(Chain, dl, LoadLinkSP, StackPtr, MachinePointerInfo(),
                      false, false, 0);
}
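// Note: the word at 0(r1) is the ABI back-chain pointer to the caller's
// frame; the sequence above reloads it through the old stack pointer and
// stores it through the restored one, keeping the chain intact across the
// dynamic-allocation pop.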

SDValue
PPCTargetLowering::getReturnAddrFrameIndex(SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  bool isPPC64 = Subtarget.isPPC64();
  bool isDarwinABI = Subtarget.isDarwinABI();
  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();

  // Get the current return address save index.
  PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
  int RASI = FI->getReturnAddrSaveIndex();

  // If the return address save index hasn't been defined yet.
  if (!RASI) {
    // Find out the fixed offset of the return address save area.
    int LROffset = PPCFrameLowering::getReturnSaveOffset(isPPC64, isDarwinABI);
    // Allocate the frame index for the return address save area.
    RASI = MF.getFrameInfo()->CreateFixedObject(isPPC64 ? 8 : 4, LROffset,
                                                true);
    // Save the result.
    FI->setReturnAddrSaveIndex(RASI);
  }
  return DAG.getFrameIndex(RASI, PtrVT);
}

SDValue
PPCTargetLowering::getFramePointerFrameIndex(SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  bool isPPC64 = Subtarget.isPPC64();
  bool isDarwinABI = Subtarget.isDarwinABI();
  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();

  // Get the current frame pointer save index.  The users of this index will
  // be primarily DYNALLOC instructions.
  PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
  int FPSI = FI->getFramePointerSaveIndex();

  // If the frame pointer save index hasn't been defined yet.
  if (!FPSI) {
    // Find out the fixed offset of the frame pointer save area.
    int FPOffset = PPCFrameLowering::getFramePointerSaveOffset(isPPC64,
                                                               isDarwinABI);
    // Allocate the frame index for the frame pointer save area.
    FPSI = MF.getFrameInfo()->CreateFixedObject(isPPC64 ? 8 : 4, FPOffset,
                                                true);
    // Save the result.
    FI->setFramePointerSaveIndex(FPSI);
  }
  return DAG.getFrameIndex(FPSI, PtrVT);
}

SDValue PPCTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                         SelectionDAG &DAG,
                                         const PPCSubtarget &Subtarget) const {
  // Get the inputs.
  SDValue Chain = Op.getOperand(0);
  SDValue Size = Op.getOperand(1);
  SDLoc dl(Op);

  // Get the correct type for pointers.
  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
  // Negate the size.
  SDValue NegSize = DAG.getNode(ISD::SUB, dl, PtrVT,
                                DAG.getConstant(0, PtrVT), Size);
  // Construct a node for the frame pointer save index.
  SDValue FPSIdx = getFramePointerFrameIndex(DAG);
  // Build a DYNALLOC node.
  SDValue Ops[3] = { Chain, NegSize, FPSIdx };
  SDVTList VTs = DAG.getVTList(PtrVT, MVT::Other);
  return DAG.getNode(PPCISD::DYNALLOC, dl, VTs, Ops);
}
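// For illustration: the PowerPC stack grows downward, so DYNALLOC receives
// the negated allocation size; the frame pointer save index gives the
// expansion of DYNALLOC a fixed slot through which the frame pointer is
// maintained around the dynamic adjustment.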

SDValue PPCTargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDLoc DL(Op);
  return DAG.getNode(PPCISD::EH_SJLJ_SETJMP, DL,
                     DAG.getVTList(MVT::i32, MVT::Other),
                     Op.getOperand(0), Op.getOperand(1));
}

SDValue PPCTargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDLoc DL(Op);
  return DAG.getNode(PPCISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
                     Op.getOperand(0), Op.getOperand(1));
}
|
|
|
|
|
2014-02-28 00:27:01 +00:00
|
|
|
SDValue PPCTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
|
|
|
|
assert(Op.getValueType() == MVT::i1 &&
|
|
|
|
"Custom lowering only for i1 loads");
|
|
|
|
|
|
|
|
// First, load the 8-bit value into a pointer-width register, then truncate to 1 bit.
|
|
|
|
|
|
|
|
SDLoc dl(Op);
|
|
|
|
LoadSDNode *LD = cast<LoadSDNode>(Op);
|
|
|
|
|
|
|
|
SDValue Chain = LD->getChain();
|
|
|
|
SDValue BasePtr = LD->getBasePtr();
|
|
|
|
MachineMemOperand *MMO = LD->getMemOperand();
|
|
|
|
|
|
|
|
SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, dl, getPointerTy(), Chain,
|
|
|
|
BasePtr, MVT::i8, MMO);
|
|
|
|
SDValue Result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewLD);
|
|
|
|
|
|
|
|
SDValue Ops[] = { Result, SDValue(NewLD.getNode(), 1) };
|
2014-04-27 19:20:57 +00:00
|
|
|
return DAG.getMergeValues(Ops, dl);
|
2014-02-28 00:27:01 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
SDValue PPCTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
|
|
|
|
assert(Op.getOperand(1).getValueType() == MVT::i1 &&
|
|
|
|
"Custom lowering only for i1 stores");
|
|
|
|
|
|
|
|
// First, zero extend to a full register, then use a truncating store to 8 bits.
|
|
|
|
|
|
|
|
SDLoc dl(Op);
|
|
|
|
StoreSDNode *ST = cast<StoreSDNode>(Op);
|
|
|
|
|
|
|
|
SDValue Chain = ST->getChain();
|
|
|
|
SDValue BasePtr = ST->getBasePtr();
|
|
|
|
SDValue Value = ST->getValue();
|
|
|
|
MachineMemOperand *MMO = ST->getMemOperand();
|
|
|
|
|
|
|
|
Value = DAG.getNode(ISD::ZERO_EXTEND, dl, getPointerTy(), Value);
|
|
|
|
return DAG.getTruncStore(Chain, dl, Value, BasePtr, MVT::i8, MMO);
|
|
|
|
}
|
|
|
|
|
|
|
|
// FIXME: Remove this once the ANDI glue bug is fixed:
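// (PPCISD::ANDIo_1_GT_BIT computes 'andi. x, 1' and extracts the GT bit of
// the CR0 result, which is set exactly when the low bit of x is one.)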
|
|
|
|
SDValue PPCTargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
|
|
|
|
assert(Op.getValueType() == MVT::i1 &&
|
|
|
|
"Custom lowering only for i1 results");
|
|
|
|
|
|
|
|
SDLoc DL(Op);
|
|
|
|
return DAG.getNode(PPCISD::ANDIo_1_GT_BIT, DL, MVT::i1,
|
|
|
|
Op.getOperand(0));
|
|
|
|
}
|
|
|
|
|
2006-04-14 06:01:58 +00:00
|
|
|
/// LowerSELECT_CC - Lower floating-point select_cc's into an fsel instruction when
|
|
|
|
/// possible.
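///
/// fsel FRT, FRA, FRC, FRB computes FRT = (FRA >= 0.0) ? FRC : FRB, so each
/// comparison below is rewritten as a sign test on a (possibly negated)
/// FP subtraction of the operands.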
|
2010-04-17 15:26:15 +00:00
|
|
|
SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
|
2006-04-14 06:01:58 +00:00
|
|
|
// Not FP? Not an fsel.
|
2008-06-06 12:08:01 +00:00
|
|
|
if (!Op.getOperand(0).getValueType().isFloatingPoint() ||
|
|
|
|
!Op.getOperand(2).getValueType().isFloatingPoint())
|
2009-05-28 04:31:08 +00:00
|
|
|
return Op;
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2013-04-07 22:11:09 +00:00
|
|
|
// We might be able to do better than this under some circumstances, but in
|
|
|
|
// general, fsel-based lowering of select is a finite-math-only optimization.
|
|
|
|
// For more information, see section F.3 of the 2.06 ISA specification.
|
|
|
|
if (!DAG.getTarget().Options.NoInfsFPMath ||
|
|
|
|
!DAG.getTarget().Options.NoNaNsFPMath)
|
|
|
|
return Op;
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2013-04-07 22:11:09 +00:00
|
|
|
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2009-08-10 22:56:29 +00:00
|
|
|
EVT ResVT = Op.getValueType();
|
|
|
|
EVT CmpVT = Op.getOperand(0).getValueType();
|
2008-07-27 21:46:04 +00:00
|
|
|
SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
|
|
|
|
SDValue TV = Op.getOperand(2), FV = Op.getOperand(3);
|
2013-05-25 02:42:55 +00:00
|
|
|
SDLoc dl(Op);
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2006-04-14 06:01:58 +00:00
|
|
|
// If the RHS of the comparison is a 0.0, we don't need to do the
|
|
|
|
// subtraction at all.
|
2013-04-07 22:11:09 +00:00
|
|
|
SDValue Sel1;
|
2006-04-14 06:01:58 +00:00
|
|
|
if (isFloatingPointZero(RHS))
|
|
|
|
switch (CC) {
|
|
|
|
default: break; // SETUO etc aren't handled by fsel.
|
2013-04-07 22:11:09 +00:00
|
|
|
case ISD::SETNE:
|
|
|
|
std::swap(TV, FV);
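// Fall through: with TV and FV swapped, SETNE reduces to SETEQ.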
|
|
|
|
case ISD::SETEQ:
|
|
|
|
if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits
|
|
|
|
LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS);
|
|
|
|
Sel1 = DAG.getNode(PPCISD::FSEL, dl, ResVT, LHS, TV, FV);
|
|
|
|
if (Sel1.getValueType() == MVT::f32) // Comparison is always 64-bits
|
|
|
|
Sel1 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Sel1);
|
|
|
|
return DAG.getNode(PPCISD::FSEL, dl, ResVT,
|
|
|
|
DAG.getNode(ISD::FNEG, dl, MVT::f64, LHS), Sel1, FV);
|
2006-04-14 06:01:58 +00:00
|
|
|
case ISD::SETULT:
|
|
|
|
case ISD::SETLT:
|
|
|
|
std::swap(TV, FV); // fsel is natively setge, swap operands for setlt
|
2006-05-24 00:06:44 +00:00
|
|
|
case ISD::SETOGE:
|
2006-04-14 06:01:58 +00:00
|
|
|
case ISD::SETGE:
|
2009-08-11 20:47:22 +00:00
|
|
|
if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits
|
|
|
|
LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS);
|
2009-02-06 21:50:26 +00:00
|
|
|
return DAG.getNode(PPCISD::FSEL, dl, ResVT, LHS, TV, FV);
|
2006-04-14 06:01:58 +00:00
|
|
|
case ISD::SETUGT:
|
|
|
|
case ISD::SETGT:
|
|
|
|
std::swap(TV, FV);  // fsel is natively setge, swap operands for setgt
|
2006-05-24 00:06:44 +00:00
|
|
|
case ISD::SETOLE:
|
2006-04-14 06:01:58 +00:00
|
|
|
case ISD::SETLE:
|
2009-08-11 20:47:22 +00:00
|
|
|
if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits
|
|
|
|
LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS);
|
2009-02-06 21:50:26 +00:00
|
|
|
return DAG.getNode(PPCISD::FSEL, dl, ResVT,
|
2009-08-11 20:47:22 +00:00
|
|
|
DAG.getNode(ISD::FNEG, dl, MVT::f64, LHS), TV, FV);
|
2006-04-14 06:01:58 +00:00
|
|
|
}
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2008-07-27 21:46:04 +00:00
|
|
|
SDValue Cmp;
|
2006-04-14 06:01:58 +00:00
|
|
|
switch (CC) {
|
|
|
|
default: break; // SETUO etc aren't handled by fsel.
|
2013-04-07 22:11:09 +00:00
|
|
|
case ISD::SETNE:
|
|
|
|
std::swap(TV, FV);
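// Fall through to the SETEQ handling with the operands swapped.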
|
|
|
|
case ISD::SETEQ:
|
|
|
|
Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS);
|
|
|
|
if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
|
|
|
|
Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
|
|
|
|
Sel1 = DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV);
|
|
|
|
if (Sel1.getValueType() == MVT::f32) // Comparison is always 64-bits
|
|
|
|
Sel1 = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Sel1);
|
|
|
|
return DAG.getNode(PPCISD::FSEL, dl, ResVT,
|
|
|
|
DAG.getNode(ISD::FNEG, dl, MVT::f64, Cmp), Sel1, FV);
|
2006-04-14 06:01:58 +00:00
|
|
|
case ISD::SETULT:
|
|
|
|
case ISD::SETLT:
|
2009-02-06 21:50:26 +00:00
|
|
|
Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS);
|
2009-08-11 20:47:22 +00:00
|
|
|
if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
|
|
|
|
Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
|
2013-04-07 22:11:09 +00:00
|
|
|
return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV);
|
2006-05-24 00:06:44 +00:00
|
|
|
case ISD::SETOGE:
|
2006-04-14 06:01:58 +00:00
|
|
|
case ISD::SETGE:
|
2009-02-06 21:50:26 +00:00
|
|
|
Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS);
|
2009-08-11 20:47:22 +00:00
|
|
|
if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
|
|
|
|
Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
|
2013-04-07 22:11:09 +00:00
|
|
|
return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV);
|
2006-04-14 06:01:58 +00:00
|
|
|
case ISD::SETUGT:
|
|
|
|
case ISD::SETGT:
|
2009-02-06 21:50:26 +00:00
|
|
|
Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS);
|
2009-08-11 20:47:22 +00:00
|
|
|
if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
|
|
|
|
Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
|
2013-04-07 22:11:09 +00:00
|
|
|
return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV);
|
2006-05-24 00:06:44 +00:00
|
|
|
case ISD::SETOLE:
|
2006-04-14 06:01:58 +00:00
|
|
|
case ISD::SETLE:
|
2009-02-06 21:50:26 +00:00
|
|
|
Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS);
|
2009-08-11 20:47:22 +00:00
|
|
|
if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits
|
|
|
|
Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
|
2013-04-07 22:11:09 +00:00
|
|
|
return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV);
|
2006-04-14 06:01:58 +00:00
|
|
|
}
|
2009-05-28 04:31:08 +00:00
|
|
|
return Op;
|
2006-04-14 06:01:58 +00:00
|
|
|
}
|
|
|
|
|
2007-11-28 18:44:47 +00:00
|
|
|
// FIXME: Split this code up when LegalizeDAGTypes lands.
|
2009-06-04 20:53:52 +00:00
|
|
|
SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
|
2013-05-25 02:42:55 +00:00
|
|
|
SDLoc dl) const {
|
2008-06-06 12:08:01 +00:00
|
|
|
assert(Op.getOperand(0).getValueType().isFloatingPoint());
|
2008-07-27 21:46:04 +00:00
|
|
|
SDValue Src = Op.getOperand(0);
|
2009-08-11 20:47:22 +00:00
|
|
|
if (Src.getValueType() == MVT::f32)
|
|
|
|
Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src);
|
2008-07-19 16:26:02 +00:00
|
|
|
|
2008-07-27 21:46:04 +00:00
|
|
|
SDValue Tmp;
|
2013-08-15 02:33:50 +00:00
|
|
|
switch (Op.getSimpleValueType().SimpleTy) {
|
2009-07-14 16:55:14 +00:00
|
|
|
default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!");
|
2009-08-11 20:47:22 +00:00
|
|
|
case MVT::i32:
|
2009-06-04 20:53:52 +00:00
|
|
|
Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIWZ :
|
2014-06-12 22:38:18 +00:00
|
|
|
(Subtarget.hasFPCVT() ? PPCISD::FCTIWUZ :
|
2013-04-01 17:52:07 +00:00
|
|
|
PPCISD::FCTIDZ),
|
2009-08-11 20:47:22 +00:00
|
|
|
dl, MVT::f64, Src);
|
2006-04-14 06:01:58 +00:00
|
|
|
break;
|
2009-08-11 20:47:22 +00:00
|
|
|
case MVT::i64:
|
2014-06-12 22:38:18 +00:00
|
|
|
assert((Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT()) &&
|
2013-04-01 18:42:58 +00:00
|
|
|
"i64 FP_TO_UINT is supported only with FPCVT");
|
2013-04-01 17:52:07 +00:00
|
|
|
Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIDZ :
|
|
|
|
PPCISD::FCTIDUZ,
|
|
|
|
dl, MVT::f64, Src);
|
2006-04-14 06:01:58 +00:00
|
|
|
break;
|
|
|
|
}
|
2008-07-19 16:26:02 +00:00
|
|
|
|
2006-04-14 06:01:58 +00:00
|
|
|
// Convert the FP value to an int value through memory.
|
2014-06-12 22:38:18 +00:00
|
|
|
bool i32Stack = Op.getValueType() == MVT::i32 && Subtarget.hasSTFIWX() &&
|
|
|
|
(Op.getOpcode() == ISD::FP_TO_SINT || Subtarget.hasFPCVT());
|
2013-04-01 17:52:07 +00:00
|
|
|
SDValue FIPtr = DAG.CreateStackTemporary(i32Stack ? MVT::i32 : MVT::f64);
|
|
|
|
int FI = cast<FrameIndexSDNode>(FIPtr)->getIndex();
|
|
|
|
MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(FI);
|
2008-07-19 16:26:02 +00:00
|
|
|
|
2007-10-15 20:14:52 +00:00
|
|
|
// Emit a store to the stack slot.
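// (With STFIWX the i32 result can be stored directly from the FP register;
// otherwise the whole f64 is stored and the desired word is loaded back,
// with a bias, below.)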
|
2013-04-01 17:52:07 +00:00
|
|
|
SDValue Chain;
|
|
|
|
if (i32Stack) {
|
|
|
|
MachineFunction &MF = DAG.getMachineFunction();
|
|
|
|
MachineMemOperand *MMO =
|
|
|
|
MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 4, 4);
|
|
|
|
SDValue Ops[] = { DAG.getEntryNode(), Tmp, FIPtr };
|
|
|
|
Chain = DAG.getMemIntrinsicNode(PPCISD::STFIWX, dl,
|
2014-04-26 19:29:41 +00:00
|
|
|
DAG.getVTList(MVT::Other), Ops, MVT::i32, MMO);
|
2013-04-01 17:52:07 +00:00
|
|
|
} else
|
|
|
|
Chain = DAG.getStore(DAG.getEntryNode(), dl, Tmp, FIPtr,
|
|
|
|
MPI, false, false, 0);
|
2007-10-15 20:14:52 +00:00
|
|
|
|
|
|
|
// Result is a load from the stack slot. If loading 4 bytes, make sure to
|
|
|
|
// add in a 4-byte bias to reach the low word of the big-endian f64 slot.
|
2013-04-01 17:52:07 +00:00
|
|
|
if (Op.getValueType() == MVT::i32 && !i32Stack) {
|
2009-02-04 20:06:27 +00:00
|
|
|
FIPtr = DAG.getNode(ISD::ADD, dl, FIPtr.getValueType(), FIPtr,
|
2007-10-15 20:14:52 +00:00
|
|
|
DAG.getConstant(4, FIPtr.getValueType()));
|
2013-04-01 17:52:07 +00:00
|
|
|
MPI = MachinePointerInfo();
|
|
|
|
}
|
|
|
|
|
|
|
|
return DAG.getLoad(Op.getValueType(), dl, Chain, FIPtr, MPI,
|
2011-11-08 18:42:53 +00:00
|
|
|
false, false, false, 0);
|
2006-04-14 06:01:58 +00:00
|
|
|
}
|
|
|
|
|
2013-04-01 17:52:07 +00:00
|
|
|
SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
|
2010-04-17 15:26:15 +00:00
|
|
|
SelectionDAG &DAG) const {
|
2013-05-25 02:42:55 +00:00
|
|
|
SDLoc dl(Op);
|
2008-03-11 01:59:03 +00:00
|
|
|
// Don't handle ppc_fp128 here; let it be lowered to a libcall.
|
2009-08-11 20:47:22 +00:00
|
|
|
if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64)
|
2008-07-27 21:46:04 +00:00
|
|
|
return SDValue();
|
2008-03-11 01:59:03 +00:00
|
|
|
|
2014-03-05 22:14:00 +00:00
|
|
|
if (Op.getOperand(0).getValueType() == MVT::i1)
|
|
|
|
return DAG.getNode(ISD::SELECT, dl, Op.getValueType(), Op.getOperand(0),
|
|
|
|
DAG.getConstantFP(1.0, Op.getValueType()),
|
|
|
|
DAG.getConstantFP(0.0, Op.getValueType()));
|
|
|
|
|
2014-06-12 22:38:18 +00:00
|
|
|
assert((Op.getOpcode() == ISD::SINT_TO_FP || Subtarget.hasFPCVT()) &&
|
2013-04-01 17:52:07 +00:00
|
|
|
"UINT_TO_FP is supported only with FPCVT");
|
|
|
|
|
|
|
|
// If we have FCFIDS, then use it when converting to single-precision.
|
2013-04-02 03:29:51 +00:00
|
|
|
// Otherwise, convert to double-precision and then round.
|
2014-06-12 22:38:18 +00:00
|
|
|
unsigned FCFOp = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) ?
|
2013-04-01 17:52:07 +00:00
|
|
|
(Op.getOpcode() == ISD::UINT_TO_FP ?
|
|
|
|
PPCISD::FCFIDUS : PPCISD::FCFIDS) :
|
|
|
|
(Op.getOpcode() == ISD::UINT_TO_FP ?
|
|
|
|
PPCISD::FCFIDU : PPCISD::FCFID);
|
2014-06-12 22:38:18 +00:00
|
|
|
MVT FCFTy = (Subtarget.hasFPCVT() && Op.getValueType() == MVT::f32) ?
|
2013-04-01 17:52:07 +00:00
|
|
|
MVT::f32 : MVT::f64;
|
|
|
|
|
2009-08-11 20:47:22 +00:00
|
|
|
if (Op.getOperand(0).getValueType() == MVT::i64) {
|
2012-10-18 13:16:11 +00:00
|
|
|
SDValue SINT = Op.getOperand(0);
|
|
|
|
// When converting to single-precision, we actually need to convert
|
|
|
|
// to double-precision first and then round to single-precision.
|
|
|
|
// To avoid double-rounding effects during that operation, we have
|
|
|
|
// to prepare the input operand. Bits that might be truncated when
|
|
|
|
// converting to double-precision are replaced by a bit that won't
|
|
|
|
// be lost at this stage, but is below the single-precision rounding
|
|
|
|
// position.
|
|
|
|
//
|
|
|
|
// However, if -enable-unsafe-fp-math is in effect, accept double
|
|
|
|
// rounding to avoid the extra overhead.
|
|
|
|
if (Op.getValueType() == MVT::f32 &&
|
2014-06-12 22:38:18 +00:00
|
|
|
!Subtarget.hasFPCVT() &&
|
2012-10-18 13:16:11 +00:00
|
|
|
!DAG.getTarget().Options.UnsafeFPMath) {
|
|
|
|
|
|
|
|
// Twiddle input to make sure the low 11 bits are zero. (If this
|
|
|
|
// is the case, we are guaranteed the value will fit into the 53 bit
|
|
|
|
// mantissa of an IEEE double-precision value without rounding.)
|
|
|
|
// If any of those low 11 bits were not zero originally, make sure
|
|
|
|
// bit 12 (value 2048) is set instead, so that the final rounding
|
|
|
|
// to single-precision gets the correct result.
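// Concretely: Round = (SINT & 2047) + 2047 lands in [2048, 4094] exactly
// when some of the low 11 bits of SINT are set, so its bit 11 records
// "low bits were nonzero". OR-ing SINT back in and masking with -2048
// then yields SINT with bits 0-10 cleared and bit 11 (value 2048) forced
// on if any of them were set.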
|
|
|
|
SDValue Round = DAG.getNode(ISD::AND, dl, MVT::i64,
|
|
|
|
SINT, DAG.getConstant(2047, MVT::i64));
|
|
|
|
Round = DAG.getNode(ISD::ADD, dl, MVT::i64,
|
|
|
|
Round, DAG.getConstant(2047, MVT::i64));
|
|
|
|
Round = DAG.getNode(ISD::OR, dl, MVT::i64, Round, SINT);
|
|
|
|
Round = DAG.getNode(ISD::AND, dl, MVT::i64,
|
|
|
|
Round, DAG.getConstant(-2048, MVT::i64));
|
|
|
|
|
|
|
|
// However, we cannot use that value unconditionally: if the magnitude
|
|
|
|
// of the input value is small, the bit-twiddling we did above might
|
|
|
|
// end up visibly changing the output. Fortunately, in that case, we
|
|
|
|
// don't need to twiddle bits since the original input will convert
|
|
|
|
// exactly to double-precision floating-point already. Therefore,
|
|
|
|
// construct a conditional to use the original value if the top 11
|
|
|
|
// bits are all sign-bit copies, and use the rounded value computed
|
|
|
|
// above otherwise.
|
|
|
|
SDValue Cond = DAG.getNode(ISD::SRA, dl, MVT::i64,
|
|
|
|
SINT, DAG.getConstant(53, MVT::i32));
|
|
|
|
Cond = DAG.getNode(ISD::ADD, dl, MVT::i64,
|
|
|
|
Cond, DAG.getConstant(1, MVT::i64));
|
|
|
|
Cond = DAG.getSetCC(dl, MVT::i32,
|
|
|
|
Cond, DAG.getConstant(1, MVT::i64), ISD::SETUGT);
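// (After the arithmetic shift by 53, values whose top 11 bits are all
// sign-bit copies become 0 or -1; adding 1 maps those to 1 or 0, so the
// unsigned > 1 test is true exactly when the rounding fixup is needed.)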
|
|
|
|
|
|
|
|
SINT = DAG.getNode(ISD::SELECT, dl, MVT::i64, Cond, Round, SINT);
|
|
|
|
}
|
2013-04-01 17:52:07 +00:00
|
|
|
|
2012-10-18 13:16:11 +00:00
|
|
|
SDValue Bits = DAG.getNode(ISD::BITCAST, dl, MVT::f64, SINT);
|
2013-04-01 17:52:07 +00:00
|
|
|
SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Bits);
|
|
|
|
|
2014-06-12 22:38:18 +00:00
|
|
|
if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT())
|
2009-02-17 22:15:04 +00:00
|
|
|
FP = DAG.getNode(ISD::FP_ROUND, dl,
|
2009-08-11 20:47:22 +00:00
|
|
|
MVT::f32, FP, DAG.getIntPtrConstant(0));
|
2006-04-14 06:01:58 +00:00
|
|
|
return FP;
|
|
|
|
}
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2009-08-11 20:47:22 +00:00
|
|
|
assert(Op.getOperand(0).getValueType() == MVT::i32 &&
|
2013-04-01 17:52:07 +00:00
|
|
|
"Unhandled INT_TO_FP type in custom expander!");
|
2006-04-14 06:01:58 +00:00
|
|
|
// Since we only generate this in 64-bit mode, we can take advantage of
|
|
|
|
// 64-bit registers. In particular, sign extend the input value into the
|
|
|
|
// 64-bit register with extsw, store the WHOLE 64-bit value into the stack
|
|
|
|
// slot, then lfd it and fcfid it.
|
2009-09-25 20:36:54 +00:00
|
|
|
MachineFunction &MF = DAG.getMachineFunction();
|
|
|
|
MachineFrameInfo *FrameInfo = MF.getFrameInfo();
|
2009-08-10 22:56:29 +00:00
|
|
|
EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2013-03-31 10:12:51 +00:00
|
|
|
SDValue Ld;
|
2014-06-12 22:38:18 +00:00
|
|
|
if (Subtarget.hasLFIWAX() || Subtarget.hasFPCVT()) {
|
2013-03-31 10:12:51 +00:00
|
|
|
int FrameIdx = FrameInfo->CreateStackObject(4, 4, false);
|
|
|
|
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2013-03-31 10:12:51 +00:00
|
|
|
SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0), FIdx,
|
|
|
|
MachinePointerInfo::getFixedStack(FrameIdx),
|
|
|
|
false, false, 0);
|
2013-03-31 01:58:02 +00:00
|
|
|
|
2013-03-31 10:12:51 +00:00
|
|
|
assert(cast<StoreSDNode>(Store)->getMemoryVT() == MVT::i32 &&
|
|
|
|
"Expected an i32 store");
|
|
|
|
MachineMemOperand *MMO =
|
|
|
|
MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx),
|
|
|
|
MachineMemOperand::MOLoad, 4, 4);
|
|
|
|
SDValue Ops[] = { Store, FIdx };
|
2013-04-01 17:52:07 +00:00
|
|
|
Ld = DAG.getMemIntrinsicNode(Op.getOpcode() == ISD::UINT_TO_FP ?
|
|
|
|
PPCISD::LFIWZX : PPCISD::LFIWAX,
|
|
|
|
dl, DAG.getVTList(MVT::f64, MVT::Other),
|
2014-04-26 19:29:41 +00:00
|
|
|
Ops, MVT::i32, MMO);
|
2013-03-31 10:12:51 +00:00
|
|
|
} else {
|
2014-06-12 22:38:18 +00:00
|
|
|
assert(Subtarget.isPPC64() &&
|
2013-04-01 17:52:07 +00:00
|
|
|
"i32->FP without LFIWAX supported only on PPC64");
|
|
|
|
|
2013-03-31 10:12:51 +00:00
|
|
|
int FrameIdx = FrameInfo->CreateStackObject(8, 8, false);
|
|
|
|
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
|
|
|
|
|
|
|
|
SDValue Ext64 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i64,
|
|
|
|
Op.getOperand(0));
|
|
|
|
|
|
|
|
// STD the extended value into the stack slot.
|
|
|
|
SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Ext64, FIdx,
|
|
|
|
MachinePointerInfo::getFixedStack(FrameIdx),
|
|
|
|
false, false, 0);
|
|
|
|
|
|
|
|
// Load the value as a double.
|
|
|
|
Ld = DAG.getLoad(MVT::f64, dl, Store, FIdx,
|
|
|
|
MachinePointerInfo::getFixedStack(FrameIdx),
|
|
|
|
false, false, false, 0);
|
|
|
|
}
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2006-04-14 06:01:58 +00:00
|
|
|
// FCFID it and return it.
|
2013-04-01 17:52:07 +00:00
|
|
|
SDValue FP = DAG.getNode(FCFOp, dl, FCFTy, Ld);
|
2014-06-12 22:38:18 +00:00
|
|
|
if (Op.getValueType() == MVT::f32 && !Subtarget.hasFPCVT())
|
2009-08-11 20:47:22 +00:00
|
|
|
FP = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, FP, DAG.getIntPtrConstant(0));
|
2006-04-14 06:01:58 +00:00
|
|
|
return FP;
|
|
|
|
}
|
|
|
|
|
2010-04-17 15:26:15 +00:00
|
|
|
SDValue PPCTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
|
|
|
|
SelectionDAG &DAG) const {
|
2013-05-25 02:42:55 +00:00
|
|
|
SDLoc dl(Op);
|
2008-01-18 19:55:37 +00:00
|
|
|
/*
|
|
|
|
The rounding mode is in bits 30:31 of FPSCR, and has the following
|
|
|
|
settings:
|
|
|
|
00 Round to nearest
|
|
|
|
01 Round to 0
|
|
|
|
10 Round to +inf
|
|
|
|
11 Round to -inf
|
|
|
|
|
|
|
|
FLT_ROUNDS, on the other hand, expects the following:
|
|
|
|
-1 Undefined
|
|
|
|
0 Round to 0
|
|
|
|
1 Round to nearest
|
|
|
|
2 Round to +inf
|
|
|
|
3 Round to -inf
|
|
|
|
|
|
|
|
To perform the conversion, we do:
|
|
|
|
((FPSCR & 0x3) ^ ((~FPSCR & 0x3) >> 1))
|
|
|
|
*/
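// Checking the formula on all four FPSCR encodings gives 0 -> 1, 1 -> 0,
// 2 -> 2, 3 -> 3, which is exactly the FLT_ROUNDS mapping described above.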
|
|
|
|
|
|
|
|
MachineFunction &MF = DAG.getMachineFunction();
|
2009-08-10 22:56:29 +00:00
|
|
|
EVT VT = Op.getValueType();
|
|
|
|
EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
|
2008-01-18 19:55:37 +00:00
|
|
|
|
|
|
|
// Save FP Control Word to register
|
2013-03-07 20:33:29 +00:00
|
|
|
EVT NodeTys[] = {
|
|
|
|
MVT::f64, // return register
|
|
|
|
MVT::Glue // unused in this context
|
|
|
|
};
|
2014-04-30 07:17:30 +00:00
|
|
|
SDValue Chain = DAG.getNode(PPCISD::MFFS, dl, NodeTys, None);
|
2008-01-18 19:55:37 +00:00
|
|
|
|
|
|
|
// Save FP register to stack slot
|
2009-11-12 20:49:22 +00:00
|
|
|
int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8, false);
|
2008-07-27 21:46:04 +00:00
|
|
|
SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
|
2009-02-04 20:06:27 +00:00
|
|
|
SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Chain,
|
2010-09-21 18:41:36 +00:00
|
|
|
StackSlot, MachinePointerInfo(), false, false,0);
|
2008-01-18 19:55:37 +00:00
|
|
|
|
|
|
|
// Load FP Control Word from low 32 bits of stack slot.
|
2008-07-27 21:46:04 +00:00
|
|
|
SDValue Four = DAG.getConstant(4, PtrVT);
|
2009-02-04 20:06:27 +00:00
|
|
|
SDValue Addr = DAG.getNode(ISD::ADD, dl, PtrVT, StackSlot, Four);
|
2010-09-21 06:44:06 +00:00
|
|
|
SDValue CWD = DAG.getLoad(MVT::i32, dl, Store, Addr, MachinePointerInfo(),
|
2011-11-08 18:42:53 +00:00
|
|
|
false, false, false, 0);
|
2008-01-18 19:55:37 +00:00
|
|
|
|
|
|
|
// Transform as necessary
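// CWD1 = FPSCR & 3, and CWD2 = ((FPSCR ^ 3) & 3) >> 1 = ((~FPSCR & 3) >> 1),
// so the final XOR implements the conversion formula given above.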
|
2008-07-27 21:46:04 +00:00
|
|
|
SDValue CWD1 =
|
2009-08-11 20:47:22 +00:00
|
|
|
DAG.getNode(ISD::AND, dl, MVT::i32,
|
|
|
|
CWD, DAG.getConstant(3, MVT::i32));
|
2008-07-27 21:46:04 +00:00
|
|
|
SDValue CWD2 =
|
2009-08-11 20:47:22 +00:00
|
|
|
DAG.getNode(ISD::SRL, dl, MVT::i32,
|
|
|
|
DAG.getNode(ISD::AND, dl, MVT::i32,
|
|
|
|
DAG.getNode(ISD::XOR, dl, MVT::i32,
|
|
|
|
CWD, DAG.getConstant(3, MVT::i32)),
|
|
|
|
DAG.getConstant(3, MVT::i32)),
|
|
|
|
DAG.getConstant(1, MVT::i32));
|
2008-01-18 19:55:37 +00:00
|
|
|
|
2008-07-27 21:46:04 +00:00
|
|
|
SDValue RetVal =
|
2009-08-11 20:47:22 +00:00
|
|
|
DAG.getNode(ISD::XOR, dl, MVT::i32, CWD1, CWD2);
|
2008-01-18 19:55:37 +00:00
|
|
|
|
2008-06-06 12:08:01 +00:00
|
|
|
return DAG.getNode((VT.getSizeInBits() < 16 ?
|
2009-02-04 20:06:27 +00:00
|
|
|
ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal);
|
2008-01-18 19:55:37 +00:00
|
|
|
}
|
|
|
|
|
2010-04-17 15:26:15 +00:00
|
|
|
SDValue PPCTargetLowering::LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const {
|
2009-08-10 22:56:29 +00:00
|
|
|
EVT VT = Op.getValueType();
|
2008-06-06 12:08:01 +00:00
|
|
|
unsigned BitWidth = VT.getSizeInBits();
|
2013-05-25 02:42:55 +00:00
|
|
|
SDLoc dl(Op);
|
2008-03-07 20:36:53 +00:00
|
|
|
assert(Op.getNumOperands() == 3 &&
|
|
|
|
VT == Op.getOperand(1).getValueType() &&
|
|
|
|
"Unexpected SHL!");
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2006-09-20 03:47:40 +00:00
|
|
|
// Expand into a bunch of logical ops. Note that these ops
|
2006-04-14 06:01:58 +00:00
|
|
|
// depend on the PPC behavior for oversized shift amounts.
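// (The PPC shift nodes used here produce zero for shift amounts in
// [BitWidth, 2*BitWidth), so OutHi below is effectively
// (Hi << Amt) | (Lo >> (BitWidth-Amt)) | (Lo << (Amt-BitWidth)),
// with at most one of the two Lo terms nonzero for any given Amt.)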
|
2008-07-27 21:46:04 +00:00
|
|
|
SDValue Lo = Op.getOperand(0);
|
|
|
|
SDValue Hi = Op.getOperand(1);
|
|
|
|
SDValue Amt = Op.getOperand(2);
|
2009-08-10 22:56:29 +00:00
|
|
|
EVT AmtVT = Amt.getValueType();
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2009-02-05 00:20:09 +00:00
|
|
|
SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT,
|
2008-10-30 19:28:32 +00:00
|
|
|
DAG.getConstant(BitWidth, AmtVT), Amt);
|
2009-02-05 00:20:09 +00:00
|
|
|
SDValue Tmp2 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Amt);
|
|
|
|
SDValue Tmp3 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Tmp1);
|
|
|
|
SDValue Tmp4 = DAG.getNode(ISD::OR , dl, VT, Tmp2, Tmp3);
|
|
|
|
SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt,
|
2008-10-30 19:28:32 +00:00
|
|
|
DAG.getConstant(-BitWidth, AmtVT));
|
2009-02-05 00:20:09 +00:00
|
|
|
SDValue Tmp6 = DAG.getNode(PPCISD::SHL, dl, VT, Lo, Tmp5);
|
|
|
|
SDValue OutHi = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp6);
|
|
|
|
SDValue OutLo = DAG.getNode(PPCISD::SHL, dl, VT, Lo, Amt);
|
2008-07-27 21:46:04 +00:00
|
|
|
SDValue OutOps[] = { OutLo, OutHi };
|
2014-04-27 19:20:57 +00:00
|
|
|
return DAG.getMergeValues(OutOps, dl);
|
2006-04-14 06:01:58 +00:00
|
|
|
}
|
|
|
|
|
2010-04-17 15:26:15 +00:00
|
|
|
SDValue PPCTargetLowering::LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const {
|
2009-08-10 22:56:29 +00:00
|
|
|
EVT VT = Op.getValueType();
|
2013-05-25 02:42:55 +00:00
|
|
|
SDLoc dl(Op);
|
2008-06-06 12:08:01 +00:00
|
|
|
unsigned BitWidth = VT.getSizeInBits();
|
2008-03-07 20:36:53 +00:00
|
|
|
assert(Op.getNumOperands() == 3 &&
|
|
|
|
VT == Op.getOperand(1).getValueType() &&
|
|
|
|
"Unexpected SRL!");
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2008-03-07 20:36:53 +00:00
|
|
|
// Expand into a bunch of logical ops. Note that these ops
|
2006-04-14 06:01:58 +00:00
|
|
|
// depend on the PPC behavior for oversized shift amounts.
|
2008-07-27 21:46:04 +00:00
|
|
|
SDValue Lo = Op.getOperand(0);
|
|
|
|
SDValue Hi = Op.getOperand(1);
|
|
|
|
SDValue Amt = Op.getOperand(2);
|
2009-08-10 22:56:29 +00:00
|
|
|
EVT AmtVT = Amt.getValueType();
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2009-02-05 00:20:09 +00:00
|
|
|
SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT,
|
2008-10-30 19:28:32 +00:00
|
|
|
DAG.getConstant(BitWidth, AmtVT), Amt);
|
2009-02-05 00:20:09 +00:00
|
|
|
SDValue Tmp2 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Amt);
|
|
|
|
SDValue Tmp3 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Tmp1);
|
|
|
|
SDValue Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);
|
|
|
|
SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt,
|
2008-10-30 19:28:32 +00:00
|
|
|
DAG.getConstant(-BitWidth, AmtVT));
|
2009-02-05 00:20:09 +00:00
|
|
|
SDValue Tmp6 = DAG.getNode(PPCISD::SRL, dl, VT, Hi, Tmp5);
|
|
|
|
SDValue OutLo = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp6);
|
|
|
|
SDValue OutHi = DAG.getNode(PPCISD::SRL, dl, VT, Hi, Amt);
|
2008-07-27 21:46:04 +00:00
|
|
|
SDValue OutOps[] = { OutLo, OutHi };
|
2014-04-27 19:20:57 +00:00
|
|
|
return DAG.getMergeValues(OutOps, dl);
|
2006-04-14 06:01:58 +00:00
|
|
|
}
|
|
|
|
|
2010-04-17 15:26:15 +00:00
|
|
|
SDValue PPCTargetLowering::LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const {
|
2013-05-25 02:42:55 +00:00
|
|
|
SDLoc dl(Op);
|
2009-08-10 22:56:29 +00:00
|
|
|
EVT VT = Op.getValueType();
|
2008-06-06 12:08:01 +00:00
|
|
|
unsigned BitWidth = VT.getSizeInBits();
|
2008-03-07 20:36:53 +00:00
|
|
|
assert(Op.getNumOperands() == 3 &&
|
|
|
|
VT == Op.getOperand(1).getValueType() &&
|
|
|
|
"Unexpected SRA!");
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2008-03-07 20:36:53 +00:00
|
|
|
// Expand into a bunch of logical ops, followed by a select_cc.
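// Unlike SRL, an arithmetic shift must fill the low result with sign bits
// once Amt exceeds BitWidth, so the OR-of-shifts form is not enough: the
// select_cc below switches OutLo to an arithmetic shift of Hi by
// (Amt - BitWidth) in that case.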
|
2008-07-27 21:46:04 +00:00
|
|
|
SDValue Lo = Op.getOperand(0);
|
|
|
|
SDValue Hi = Op.getOperand(1);
|
|
|
|
SDValue Amt = Op.getOperand(2);
|
2009-08-10 22:56:29 +00:00
|
|
|
EVT AmtVT = Amt.getValueType();
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2009-02-04 01:48:28 +00:00
|
|
|
SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT,
|
2008-10-30 19:28:32 +00:00
|
|
|
DAG.getConstant(BitWidth, AmtVT), Amt);
|
2009-02-04 01:48:28 +00:00
|
|
|
SDValue Tmp2 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Amt);
|
|
|
|
SDValue Tmp3 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Tmp1);
|
|
|
|
SDValue Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);
|
|
|
|
SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt,
|
2008-10-30 19:28:32 +00:00
|
|
|
DAG.getConstant(-BitWidth, AmtVT));
|
2009-02-04 01:48:28 +00:00
|
|
|
SDValue Tmp6 = DAG.getNode(PPCISD::SRA, dl, VT, Hi, Tmp5);
|
|
|
|
SDValue OutHi = DAG.getNode(PPCISD::SRA, dl, VT, Hi, Amt);
|
|
|
|
SDValue OutLo = DAG.getSelectCC(dl, Tmp5, DAG.getConstant(0, AmtVT),
|
2008-10-30 19:28:32 +00:00
|
|
|
Tmp4, Tmp6, ISD::SETLE);
|
2008-07-27 21:46:04 +00:00
|
|
|
SDValue OutOps[] = { OutLo, OutHi };
|
2014-04-27 19:20:57 +00:00
|
|
|
return DAG.getMergeValues(OutOps, dl);
|
2006-04-14 06:01:58 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
// Vector related lowering.
|
|
|
|
//
|
|
|
|
|
2006-04-17 06:00:21 +00:00
|
|
|
/// BuildSplatI - Build a canonical splati of Val with an element size of
|
|
|
|
/// SplatSize. Cast the result to VT.
|
2009-08-10 22:56:29 +00:00
|
|
|
static SDValue BuildSplatI(int Val, unsigned SplatSize, EVT VT,
|
2013-05-25 02:42:55 +00:00
|
|
|
SelectionDAG &DAG, SDLoc dl) {
|
2006-04-17 06:00:21 +00:00
|
|
|
assert(Val >= -16 && Val <= 15 && "vsplti is out of range!");
|
2006-12-01 01:45:39 +00:00
|
|
|
|
2009-08-10 22:56:29 +00:00
|
|
|
static const EVT VTys[] = { // canonical VT to use for each size.
|
2009-08-11 20:47:22 +00:00
|
|
|
MVT::v16i8, MVT::v8i16, MVT::Other, MVT::v4i32
|
2006-04-17 06:00:21 +00:00
|
|
|
};
|
2006-12-01 01:45:39 +00:00
|
|
|
|
2009-08-11 20:47:22 +00:00
|
|
|
EVT ReqVT = VT != MVT::Other ? VT : VTys[SplatSize-1];
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2006-12-01 01:45:39 +00:00
|
|
|
// Force vspltis[hw] -1 to vspltisb -1 to canonicalize.
|
|
|
|
if (Val == -1)
|
|
|
|
SplatSize = 1;
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2009-08-10 22:56:29 +00:00
|
|
|
EVT CanonicalVT = VTys[SplatSize-1];
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2006-04-17 06:00:21 +00:00
|
|
|
// Build a canonical splat for this value.
|
2009-08-11 20:47:22 +00:00
|
|
|
SDValue Elt = DAG.getConstant(Val, MVT::i32);
|
2008-07-27 21:46:04 +00:00
|
|
|
SmallVector<SDValue, 8> Ops;
|
2008-06-06 12:08:01 +00:00
|
|
|
Ops.assign(CanonicalVT.getVectorNumElements(), Elt);
|
2014-04-26 18:35:24 +00:00
|
|
|
SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, dl, CanonicalVT, Ops);
|
2010-11-23 03:31:01 +00:00
|
|
|
return DAG.getNode(ISD::BITCAST, dl, ReqVT, Res);
|
2006-04-17 06:00:21 +00:00
|
|
|
}
|
|
|
|
|
2013-05-24 23:00:14 +00:00
|
|
|
/// BuildIntrinsicOp - Return a unary operator intrinsic node with the
|
|
|
|
/// specified intrinsic ID.
|
|
|
|
static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op,
|
2013-05-25 02:42:55 +00:00
|
|
|
SelectionDAG &DAG, SDLoc dl,
|
2013-05-24 23:00:14 +00:00
|
|
|
EVT DestVT = MVT::Other) {
|
|
|
|
if (DestVT == MVT::Other) DestVT = Op.getValueType();
|
|
|
|
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT,
|
|
|
|
DAG.getConstant(IID, MVT::i32), Op);
|
|
|
|
}
|
|
|
|
|
2006-04-18 03:24:30 +00:00
|
|
|
/// BuildIntrinsicOp - Return a binary operator intrinsic node with the
|
2006-04-17 06:58:41 +00:00
|
|
|
/// specified intrinsic ID.
|
2008-07-27 21:46:04 +00:00
|
|
|
static SDValue BuildIntrinsicOp(unsigned IID, SDValue LHS, SDValue RHS,
|
2013-05-25 02:42:55 +00:00
|
|
|
SelectionDAG &DAG, SDLoc dl,
|
2009-08-11 20:47:22 +00:00
|
|
|
EVT DestVT = MVT::Other) {
|
|
|
|
if (DestVT == MVT::Other) DestVT = LHS.getValueType();
|
2009-02-06 01:31:28 +00:00
|
|
|
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT,
|
2009-08-11 20:47:22 +00:00
|
|
|
DAG.getConstant(IID, MVT::i32), LHS, RHS);
|
2006-04-17 06:58:41 +00:00
|
|
|
}
|
|
|
|
|
2006-04-18 03:24:30 +00:00
|
|
|
/// BuildIntrinsicOp - Return a ternary operator intrinsic node with the
|
|
|
|
/// specified intrinsic ID.
|
2008-07-27 21:46:04 +00:00
|
|
|
static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op0, SDValue Op1,
|
2009-02-06 01:31:28 +00:00
|
|
|
SDValue Op2, SelectionDAG &DAG,
|
2013-05-25 02:42:55 +00:00
|
|
|
SDLoc dl, EVT DestVT = MVT::Other) {
|
2009-08-11 20:47:22 +00:00
|
|
|
if (DestVT == MVT::Other) DestVT = Op0.getValueType();
|
2009-02-06 01:31:28 +00:00
|
|
|
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT,
|
2009-08-11 20:47:22 +00:00
|
|
|
DAG.getConstant(IID, MVT::i32), Op0, Op1, Op2);
|
2006-04-18 03:24:30 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2006-04-17 17:55:10 +00:00
|
|
|
/// BuildVSLDOI - Return a VECTOR_SHUFFLE that is a vsldoi of the specified
|
|
|
|
/// amount. The result has the specified value type.
|
2008-07-27 21:46:04 +00:00
|
|
|
static SDValue BuildVSLDOI(SDValue LHS, SDValue RHS, unsigned Amt,
|
2013-05-25 02:42:55 +00:00
|
|
|
EVT VT, SelectionDAG &DAG, SDLoc dl) {
|
2006-04-17 17:55:10 +00:00
|
|
|
// Force LHS/RHS to be the right type.
|
2010-11-23 03:31:01 +00:00
|
|
|
LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, LHS);
|
|
|
|
RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, RHS);
|
2008-07-21 10:20:31 +00:00
|
|
|
|
2009-04-27 18:41:29 +00:00
|
|
|
int Ops[16];
|
2006-04-17 17:55:10 +00:00
|
|
|
for (unsigned i = 0; i != 16; ++i)
|
2009-04-27 18:41:29 +00:00
|
|
|
Ops[i] = i + Amt;
|
2009-08-11 20:47:22 +00:00
|
|
|
SDValue T = DAG.getVectorShuffle(MVT::v16i8, dl, LHS, RHS, Ops);
|
2010-11-23 03:31:01 +00:00
|
|
|
return DAG.getNode(ISD::BITCAST, dl, VT, T);
|
2006-04-17 17:55:10 +00:00
|
|
|
}
|
|
|
|
|
2006-04-14 05:19:18 +00:00
|
|
|
// If this is a case we can't handle, return null and let the default
|
|
|
|
// expansion code take care of it. If we CAN select this case, and if it
|
|
|
|
// selects to a single instruction, return Op. Otherwise, if we can codegen
|
|
|
|
// this case more efficiently than a constant pool load, lower it to the
|
|
|
|
// sequence of ops that should be used.
|
2010-04-17 15:26:15 +00:00
|
|
|
SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
|
|
|
|
SelectionDAG &DAG) const {
|
2013-05-25 02:42:55 +00:00
|
|
|
SDLoc dl(Op);
|
2009-03-01 01:13:55 +00:00
|
|
|
BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
|
2014-04-28 04:05:08 +00:00
|
|
|
assert(BVN && "Expected a BuildVectorSDNode in LowerBUILD_VECTOR");
|
2009-02-25 03:12:50 +00:00
|
|
|
|
2009-03-02 23:24:16 +00:00
|
|
|
// Check if this is a splat of a constant value.
|
|
|
|
APInt APSplatBits, APSplatUndef;
|
|
|
|
unsigned SplatBitSize;
|
2009-03-01 01:13:55 +00:00
|
|
|
bool HasAnyUndefs;
|
2009-03-03 19:26:27 +00:00
|
|
|
if (! BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
|
2009-11-13 01:45:18 +00:00
|
|
|
HasAnyUndefs, 0, true) || SplatBitSize > 32)
|
2009-03-03 19:26:27 +00:00
|
|
|
return SDValue();
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2009-03-03 19:26:27 +00:00
|
|
|
unsigned SplatBits = APSplatBits.getZExtValue();
|
|
|
|
unsigned SplatUndef = APSplatUndef.getZExtValue();
|
|
|
|
unsigned SplatSize = SplatBitSize / 8;
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2009-03-03 19:26:27 +00:00
|
|
|
// First, handle single instruction cases.
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2009-03-03 19:26:27 +00:00
|
|
|
// All zeros?
|
|
|
|
if (SplatBits == 0) {
|
|
|
|
// Canonicalize all zero vectors to be v4i32.
|
2009-08-11 20:47:22 +00:00
|
|
|
if (Op.getValueType() != MVT::v4i32 || HasAnyUndefs) {
|
|
|
|
SDValue Z = DAG.getConstant(0, MVT::i32);
|
|
|
|
Z = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Z, Z, Z, Z);
|
2010-11-23 03:31:01 +00:00
|
|
|
Op = DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Z);
|
2006-04-17 06:00:21 +00:00
|
|
|
}
|
2009-03-03 19:26:27 +00:00
|
|
|
return Op;
|
|
|
|
}
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2009-03-03 19:26:27 +00:00
|
|
|
// If the sign extended value is in the range [-16,15], use VSPLTI[bhw].
|
|
|
|
int32_t SextVal = (int32_t(SplatBits << (32-SplatBitSize)) >>
|
|
|
|
(32-SplatBitSize));
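// (The shift pair above sign-extends the SplatBitSize-bit splat constant
// to a full 32 bits.)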
|
|
|
|
if (SextVal >= -16 && SextVal <= 15)
|
|
|
|
return BuildSplatI(SextVal, SplatSize, Op.getValueType(), DAG, dl);
|
2009-02-17 22:15:04 +00:00
|
|
|
|
|
|
|
|
2009-03-03 19:26:27 +00:00
|
|
|
// Two instruction sequences.
|
2006-04-17 06:58:41 +00:00
|
|
|
|
2009-03-03 19:26:27 +00:00
|
|
|
// If this value is in the range [-32,30] and is even, use:
|
2013-02-20 20:41:42 +00:00
|
|
|
// VSPLTI[bhw](val/2) + VSPLTI[bhw](val/2)
|
|
|
|
// If this value is in the range [17,31] and is odd, use:
|
|
|
|
// VSPLTI[bhw](val-16) - VSPLTI[bhw](-16)
|
|
|
|
// If this value is in the range [-31,-17] and is odd, use:
|
|
|
|
// VSPLTI[bhw](val+16) + VSPLTI[bhw](-16)
|
|
|
|
// Note the last two are three-instruction sequences.
|
|
|
|
if (SextVal >= -32 && SextVal <= 31) {
|
|
|
|
// To avoid having these optimizations undone by constant folding,
|
|
|
|
// we convert to a pseudo that will be expanded later into one of
|
|
|
|
// the above forms.
|
|
|
|
SDValue Elt = DAG.getConstant(SextVal, MVT::i32);
|
2014-05-27 15:57:51 +00:00
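// Note that the splat may have been detected at a different element width
// than the BUILD_VECTOR's own element type (e.g. a v16i8 node whose bytes
// form a v8i16 splat of 16), so the pseudo's type must be derived from
// SplatSize rather than taken from Op's value type.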
|
|
|
EVT VT = (SplatSize == 1 ? MVT::v16i8 :
|
|
|
|
(SplatSize == 2 ? MVT::v8i16 : MVT::v4i32));
|
|
|
|
SDValue EltSize = DAG.getConstant(SplatSize, MVT::i32);
|
|
|
|
SDValue RetVal = DAG.getNode(PPCISD::VADD_SPLAT, dl, VT, Elt, EltSize);
|
|
|
|
if (VT == Op.getValueType())
|
|
|
|
return RetVal;
|
|
|
|
else
|
|
|
|
return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), RetVal);
|
2009-03-03 19:26:27 +00:00
|
|
|
}
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2009-03-03 19:26:27 +00:00
|
|
|
// If this is 0x8000_0000 x 4, turn into vspltisw + vslw. If it is
|
|
|
|
// 0x7FFF_FFFF x 4, turn it into not(0x8000_0000). This is important
|
|
|
|
// for fneg/fabs.
|
|
|
|
if (SplatSize == 4 && SplatBits == (0x7FFFFFFF&~SplatUndef)) {
|
|
|
|
// Make -1 and vspltisw -1:
|
2009-08-11 20:47:22 +00:00
|
|
|
SDValue OnesV = BuildSplatI(-1, 4, MVT::v4i32, DAG, dl);
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2009-03-03 19:26:27 +00:00
|
|
|
// Make the VSLW intrinsic, computing 0x8000_0000.
|
|
|
|
SDValue Res = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, OnesV,
|
|
|
|
OnesV, DAG, dl);
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2009-03-03 19:26:27 +00:00
|
|
|
// xor by OnesV to invert it.
|
2009-08-11 20:47:22 +00:00
|
|
|
Res = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Res, OnesV);
|
2010-11-23 03:31:01 +00:00
|
|
|
return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
|
2009-03-03 19:26:27 +00:00
|
|
|
}
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2014-06-06 14:06:26 +00:00
|
|
|
// The remaining cases assume either big endian element order or
|
|
|
|
// a splat-size that equates to the element size of the vector
|
|
|
|
// to be built. An example that doesn't work for little endian is
|
|
|
|
// {0, -1, 0, -1, 0, -1, 0, -1} which has a splat size of 32 bits
|
|
|
|
// and a vector element size of 16 bits. The code below will
|
|
|
|
// produce the vector in big endian element order, which for little
|
|
|
|
// endian is {-1, 0, -1, 0, -1, 0, -1, 0}.
|
|
|
|
|
|
|
|
// For now, just avoid these optimizations in that case.
|
|
|
|
// FIXME: Develop correct optimizations for LE with mismatched
|
|
|
|
// splat and element sizes.
|
|
|
|
|
2014-06-12 22:38:18 +00:00
|
|
|
if (Subtarget.isLittleEndian() &&
|
2014-06-06 14:06:26 +00:00
|
|
|
SplatSize != Op.getValueType().getVectorElementType().getSizeInBits())
|
|
|
|
return SDValue();
|
|
|
|
|
2009-03-03 19:26:27 +00:00
|
|
|
// Check to see if this is one of a wide variety of 'vsplti* + binop self' cases.
|
|
|
|
static const signed char SplatCsts[] = {
|
|
|
|
-1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7,
|
|
|
|
-8, 8, -9, 9, -10, 10, -11, 11, -12, 12, -13, 13, 14, -14, 15, -15, -16
|
|
|
|
};
|
2006-04-17 17:55:10 +00:00
|
|
|
|
2009-03-03 19:26:27 +00:00
|
|
|
for (unsigned idx = 0; idx < array_lengthof(SplatCsts); ++idx) {
|
|
|
|
// Indirect through the SplatCsts array so that we favor 'vsplti -1' for
|
|
|
|
// cases which are ambiguous (e.g. formation of 0x8000_0000).
|
|
|
|
int i = SplatCsts[idx];
|
|
|
|
|
|
|
|
// Figure out what shift amount will be used by AltiVec if shifted by i in
|
|
|
|
// this splat size.
|
|
|
|
unsigned TypeShiftAmt = i & (SplatBitSize-1);
|
|
|
|
|
|
|
|
// vsplti + shl self.
|
2012-08-24 23:29:28 +00:00
|
|
|
if (SextVal == (int)((unsigned)i << TypeShiftAmt)) {
|
2009-08-11 20:47:22 +00:00
|
|
|
SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
|
2009-03-03 19:26:27 +00:00
|
|
|
static const unsigned IIDs[] = { // Intrinsic to use for each size.
|
|
|
|
Intrinsic::ppc_altivec_vslb, Intrinsic::ppc_altivec_vslh, 0,
|
|
|
|
Intrinsic::ppc_altivec_vslw
|
|
|
|
};
|
|
|
|
Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
|
2010-11-23 03:31:01 +00:00
|
|
|
return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
|
2009-03-03 19:26:27 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// vsplti + srl self.
|
|
|
|
if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) {
|
2009-08-11 20:47:22 +00:00
|
|
|
SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
|
2009-03-03 19:26:27 +00:00
|
|
|
static const unsigned IIDs[] = { // Intrinsic to use for each size.
|
|
|
|
Intrinsic::ppc_altivec_vsrb, Intrinsic::ppc_altivec_vsrh, 0,
|
|
|
|
Intrinsic::ppc_altivec_vsrw
|
|
|
|
};
|
|
|
|
Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
|
2010-11-23 03:31:01 +00:00
|
|
|
return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
|
2009-03-03 19:26:27 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// vsplti + sra self.
|
|
|
|
if (SextVal == (int)(i >> TypeShiftAmt)) { // arithmetic shift, unlike srl above
|
2009-08-11 20:47:22 +00:00
|
|
|
SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
|
2009-03-03 19:26:27 +00:00
|
|
|
static const unsigned IIDs[] = { // Intrinsic to use for each size.
|
|
|
|
Intrinsic::ppc_altivec_vsrab, Intrinsic::ppc_altivec_vsrah, 0,
|
|
|
|
Intrinsic::ppc_altivec_vsraw
|
|
|
|
};
|
|
|
|
Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
|
2010-11-23 03:31:01 +00:00
|
|
|
return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
|
2006-04-17 06:58:41 +00:00
|
|
|
}
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2009-03-03 19:26:27 +00:00
|
|
|
// vsplti + rol self.
|
|
|
|
if (SextVal == (int)(((unsigned)i << TypeShiftAmt) |
|
|
|
|
((unsigned)i >> (SplatBitSize-TypeShiftAmt)))) {
|
2009-08-11 20:47:22 +00:00
|
|
|
SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
|
2009-03-03 19:26:27 +00:00
|
|
|
static const unsigned IIDs[] = { // Intrinsic to use for each size.
|
|
|
|
Intrinsic::ppc_altivec_vrlb, Intrinsic::ppc_altivec_vrlh, 0,
|
|
|
|
Intrinsic::ppc_altivec_vrlw
|
|
|
|
};
|
|
|
|
Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
|
2010-11-23 03:31:01 +00:00
|
|
|
return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Res);
|
2009-03-03 19:26:27 +00:00
|
|
|
}
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2009-03-03 19:26:27 +00:00
|
|
|
// t = vsplti c, result = vsldoi t, t, 1
|
2012-08-24 23:29:28 +00:00
|
|
|
if (SextVal == (int)(((unsigned)i << 8) | (i < 0 ? 0xFF : 0))) {
|
2009-08-11 20:47:22 +00:00
|
|
|
SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
|
2009-03-03 19:26:27 +00:00
|
|
|
return BuildVSLDOI(T, T, 1, Op.getValueType(), DAG, dl);
|
|
|
|
}
|
|
|
|
// t = vsplti c, result = vsldoi t, t, 2
|
2012-08-24 23:29:28 +00:00
|
|
|
if (SextVal == (int)(((unsigned)i << 16) | (i < 0 ? 0xFFFF : 0))) {
|
2009-08-11 20:47:22 +00:00
|
|
|
SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
|
2009-03-03 19:26:27 +00:00
|
|
|
return BuildVSLDOI(T, T, 2, Op.getValueType(), DAG, dl);
|
2006-04-17 18:09:22 +00:00
|
|
|
}
|
2009-03-03 19:26:27 +00:00
|
|
|
// t = vsplti c, result = vsldoi t, t, 3
|
2012-08-24 23:29:28 +00:00
|
|
|
if (SextVal == (int)(((unsigned)i << 24) | (i < 0 ? 0xFFFFFF : 0))) {
|
2009-08-11 20:47:22 +00:00
|
|
|
SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
|
2009-03-03 19:26:27 +00:00
|
|
|
return BuildVSLDOI(T, T, 3, Op.getValueType(), DAG, dl);
|
2006-04-17 06:58:41 +00:00
|
|
|
}
|
2006-04-14 05:19:18 +00:00
|
|
|
}
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2008-07-27 21:46:04 +00:00
|
|
|
return SDValue();
|
2006-04-14 05:19:18 +00:00
|
|
|
}
|
|
|
|
|
2006-04-17 05:28:54 +00:00
|
|
|
/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
|
|
|
|
/// the specified operations to build the shuffle.
|
2008-07-27 21:46:04 +00:00
|
|
|
static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
|
2009-02-17 22:15:04 +00:00
|
|
|
SDValue RHS, SelectionDAG &DAG,
|
2013-05-25 02:42:55 +00:00
|
|
|
SDLoc dl) {
|
2006-04-17 05:28:54 +00:00
|
|
|
unsigned OpNum = (PFEntry >> 26) & 0x0F;
|
2008-09-17 00:30:57 +00:00
|
|
|
unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
|
2006-04-17 05:28:54 +00:00
|
|
|
unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2006-04-17 05:28:54 +00:00
|
|
|
enum {
|
2006-05-16 04:20:24 +00:00
|
|
|
OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
|
2006-04-17 05:28:54 +00:00
|
|
|
OP_VMRGHW,
|
|
|
|
OP_VMRGLW,
|
|
|
|
OP_VSPLTISW0,
|
|
|
|
OP_VSPLTISW1,
|
|
|
|
OP_VSPLTISW2,
|
|
|
|
OP_VSPLTISW3,
|
|
|
|
OP_VSLDOI4,
|
|
|
|
OP_VSLDOI8,
|
2006-05-24 17:04:05 +00:00
|
|
|
OP_VSLDOI12
|
2006-04-17 05:28:54 +00:00
|
|
|
};
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2006-04-17 05:28:54 +00:00
|
|
|
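// LHSID/RHSID encode four source-element indices (0-7, with 8 meaning
// undef) in base 9: (1*9+2)*9+3 decodes to <0,1,2,3>, a plain copy of LHS,
// and ((4*9+5)*9+6)*9+7 to <4,5,6,7>, a copy of RHS.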
if (OpNum == OP_COPY) {
|
|
|
|
if (LHSID == (1*9+2)*9+3) return LHS;
|
|
|
|
assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
|
|
|
|
return RHS;
|
|
|
|
}
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2008-07-27 21:46:04 +00:00
|
|
|
SDValue OpLHS, OpRHS;
|
2009-02-06 01:31:28 +00:00
|
|
|
OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
|
|
|
|
OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2009-04-27 18:41:29 +00:00
|
|
|
int ShufIdxs[16];
|
2006-04-17 05:28:54 +00:00
|
|
|
switch (OpNum) {
|
2009-07-14 16:55:14 +00:00
|
|
|
default: llvm_unreachable("Unknown i32 permute!");
|
2006-04-17 05:28:54 +00:00
|
|
|
case OP_VMRGHW:
|
|
|
|
ShufIdxs[ 0] = 0; ShufIdxs[ 1] = 1; ShufIdxs[ 2] = 2; ShufIdxs[ 3] = 3;
|
|
|
|
ShufIdxs[ 4] = 16; ShufIdxs[ 5] = 17; ShufIdxs[ 6] = 18; ShufIdxs[ 7] = 19;
|
|
|
|
ShufIdxs[ 8] = 4; ShufIdxs[ 9] = 5; ShufIdxs[10] = 6; ShufIdxs[11] = 7;
|
|
|
|
ShufIdxs[12] = 20; ShufIdxs[13] = 21; ShufIdxs[14] = 22; ShufIdxs[15] = 23;
|
|
|
|
break;
|
|
|
|
case OP_VMRGLW:
|
|
|
|
ShufIdxs[ 0] = 8; ShufIdxs[ 1] = 9; ShufIdxs[ 2] = 10; ShufIdxs[ 3] = 11;
|
|
|
|
ShufIdxs[ 4] = 24; ShufIdxs[ 5] = 25; ShufIdxs[ 6] = 26; ShufIdxs[ 7] = 27;
|
|
|
|
ShufIdxs[ 8] = 12; ShufIdxs[ 9] = 13; ShufIdxs[10] = 14; ShufIdxs[11] = 15;
|
|
|
|
ShufIdxs[12] = 28; ShufIdxs[13] = 29; ShufIdxs[14] = 30; ShufIdxs[15] = 31;
|
|
|
|
break;
|
|
|
|
case OP_VSPLTISW0:
|
|
|
|
for (unsigned i = 0; i != 16; ++i)
|
|
|
|
ShufIdxs[i] = (i&3)+0;
|
|
|
|
break;
|
|
|
|
case OP_VSPLTISW1:
|
|
|
|
for (unsigned i = 0; i != 16; ++i)
|
|
|
|
ShufIdxs[i] = (i&3)+4;
|
|
|
|
break;
|
|
|
|
case OP_VSPLTISW2:
|
|
|
|
for (unsigned i = 0; i != 16; ++i)
|
|
|
|
ShufIdxs[i] = (i&3)+8;
|
|
|
|
break;
|
|
|
|
case OP_VSPLTISW3:
|
|
|
|
for (unsigned i = 0; i != 16; ++i)
|
|
|
|
ShufIdxs[i] = (i&3)+12;
|
|
|
|
break;
|
|
|
|
case OP_VSLDOI4:
|
2009-02-06 01:31:28 +00:00
|
|
|
return BuildVSLDOI(OpLHS, OpRHS, 4, OpLHS.getValueType(), DAG, dl);
|
2006-04-17 05:28:54 +00:00
|
|
|
case OP_VSLDOI8:
|
2009-02-06 01:31:28 +00:00
|
|
|
return BuildVSLDOI(OpLHS, OpRHS, 8, OpLHS.getValueType(), DAG, dl);
|
2006-04-17 05:28:54 +00:00
|
|
|
case OP_VSLDOI12:
|
2009-02-06 01:31:28 +00:00
|
|
|
return BuildVSLDOI(OpLHS, OpRHS, 12, OpLHS.getValueType(), DAG, dl);
|
2006-04-17 05:28:54 +00:00
|
|
|
}
|
2009-08-10 22:56:29 +00:00
|
|
|
EVT VT = OpLHS.getValueType();
|
2010-11-23 03:31:01 +00:00
|
|
|
OpLHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpLHS);
|
|
|
|
OpRHS = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpRHS);
|
2009-08-11 20:47:22 +00:00
|
|
|
SDValue T = DAG.getVectorShuffle(MVT::v16i8, dl, OpLHS, OpRHS, ShufIdxs);
|
2010-11-23 03:31:01 +00:00
|
|
|
return DAG.getNode(ISD::BITCAST, dl, VT, T);
|
2006-04-17 05:28:54 +00:00
|
|
|
}
|
|
|
|
|
2006-04-14 05:19:18 +00:00
|
|
|
/// LowerVECTOR_SHUFFLE - Return the code we lower for VECTOR_SHUFFLE. If this
|
|
|
|
/// is a shuffle we can handle in a single instruction, return it. Otherwise,
|
|
|
|
/// return the code it can be lowered into. Worst case, it can always be
|
|
|
|
/// lowered into a vperm.
|
2009-02-17 22:15:04 +00:00
|
|
|
SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
|
2010-04-17 15:26:15 +00:00
|
|
|
SelectionDAG &DAG) const {
|
2013-05-25 02:42:55 +00:00
|
|
|
SDLoc dl(Op);
|
2008-07-27 21:46:04 +00:00
|
|
|
SDValue V1 = Op.getOperand(0);
|
|
|
|
SDValue V2 = Op.getOperand(1);
|
2009-04-27 18:41:29 +00:00
|
|
|
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
|
2009-08-10 22:56:29 +00:00
|
|
|
EVT VT = Op.getValueType();
|
2014-06-12 22:38:18 +00:00
|
|
|
bool isLittleEndian = Subtarget.isLittleEndian();
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2006-04-14 05:19:18 +00:00
|
|
|
// Cases that are handled by instructions that take permute immediates
|
|
|
|
// (such as vsplt*) should be left as VECTOR_SHUFFLE nodes so they can be
|
|
|
|
// selected by the instruction selector.
|
|
|
|
if (V2.getOpcode() == ISD::UNDEF) {
|
2009-04-27 18:41:29 +00:00
|
|
|
if (PPC::isSplatShuffleMask(SVOp, 1) ||
|
|
|
|
PPC::isSplatShuffleMask(SVOp, 2) ||
|
|
|
|
PPC::isSplatShuffleMask(SVOp, 4) ||
|
2014-06-10 14:35:01 +00:00
|
|
|
PPC::isVPKUWUMShuffleMask(SVOp, true, DAG) ||
|
|
|
|
PPC::isVPKUHUMShuffleMask(SVOp, true, DAG) ||
|
|
|
|
PPC::isVSLDOIShuffleMask(SVOp, true, DAG) != -1 ||
|
|
|
|
PPC::isVMRGLShuffleMask(SVOp, 1, true, DAG) ||
|
|
|
|
PPC::isVMRGLShuffleMask(SVOp, 2, true, DAG) ||
|
|
|
|
PPC::isVMRGLShuffleMask(SVOp, 4, true, DAG) ||
|
|
|
|
PPC::isVMRGHShuffleMask(SVOp, 1, true, DAG) ||
|
|
|
|
PPC::isVMRGHShuffleMask(SVOp, 2, true, DAG) ||
|
|
|
|
PPC::isVMRGHShuffleMask(SVOp, 4, true, DAG)) {
|
2006-04-14 05:19:18 +00:00
|
|
|
return Op;
|
|
|
|
}
|
|
|
|
}
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2006-04-14 05:19:18 +00:00
|
|
|
// Altivec has a variety of "shuffle immediates" that take two vector inputs
|
|
|
|
// and produce a fixed permutation. If any of these match, do not lower to
|
|
|
|
// VPERM.
|
2014-06-10 14:35:01 +00:00
|
|
|
if (PPC::isVPKUWUMShuffleMask(SVOp, false, DAG) ||
|
|
|
|
PPC::isVPKUHUMShuffleMask(SVOp, false, DAG) ||
|
|
|
|
PPC::isVSLDOIShuffleMask(SVOp, false, DAG) != -1 ||
|
|
|
|
PPC::isVMRGLShuffleMask(SVOp, 1, false, DAG) ||
|
|
|
|
PPC::isVMRGLShuffleMask(SVOp, 2, false, DAG) ||
|
|
|
|
PPC::isVMRGLShuffleMask(SVOp, 4, false, DAG) ||
|
|
|
|
PPC::isVMRGHShuffleMask(SVOp, 1, false, DAG) ||
|
|
|
|
PPC::isVMRGHShuffleMask(SVOp, 2, false, DAG) ||
|
|
|
|
PPC::isVMRGHShuffleMask(SVOp, 4, false, DAG))
|
2006-04-14 05:19:18 +00:00
|
|
|
return Op;
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2006-04-17 05:28:54 +00:00
|
|
|
// Check to see if this is a shuffle of 4-byte values. If so, we can use our
|
|
|
|
// perfect shuffle table to emit an optimal matching sequence.
|
2012-01-15 13:16:05 +00:00
|
|
|
ArrayRef<int> PermMask = SVOp->getMask();
|
2010-11-23 03:31:01 +00:00
|
|
|
|
2006-04-17 05:28:54 +00:00
|
|
|
unsigned PFIndexes[4];
|
|
|
|
bool isFourElementShuffle = true;
|
|
|
|
for (unsigned i = 0; i != 4 && isFourElementShuffle; ++i) { // Element number
|
|
|
|
unsigned EltNo = 8; // Start out undef.
|
|
|
|
for (unsigned j = 0; j != 4; ++j) { // Intra-element byte.
|
2009-04-27 18:41:29 +00:00
|
|
|
if (PermMask[i*4+j] < 0)
|
2006-04-17 05:28:54 +00:00
|
|
|
continue; // Undef, ignore it.
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2009-04-27 18:41:29 +00:00
|
|
|
unsigned ByteSource = PermMask[i*4+j];
|
2006-04-17 05:28:54 +00:00
|
|
|
if ((ByteSource & 3) != j) {
|
|
|
|
isFourElementShuffle = false;
|
|
|
|
break;
|
|
|
|
}
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2006-04-17 05:28:54 +00:00
|
|
|
if (EltNo == 8) {
|
|
|
|
EltNo = ByteSource/4;
|
|
|
|
} else if (EltNo != ByteSource/4) {
|
|
|
|
isFourElementShuffle = false;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
PFIndexes[i] = EltNo;
|
|
|
|
}
|
2009-02-17 22:15:04 +00:00
|
|
|
|
|
|
|
// If this shuffle can be expressed as a shuffle of 4-byte elements, use the
|
2006-04-17 05:28:54 +00:00
|
|
|
// perfect shuffle vector to determine if it is cost effective to do this as
|
|
|
|
// discrete instructions, or whether we should use a vperm.
|
2014-06-10 14:35:01 +00:00
|
|
|
// For now, we skip this for little endian until such time as we have a
|
|
|
|
// little-endian perfect shuffle table.
|
|
|
|
if (isFourElementShuffle && !isLittleEndian) {
|
2006-04-17 05:28:54 +00:00
|
|
|
// Compute the index in the perfect shuffle table.
|
2009-02-17 22:15:04 +00:00
|
|
|
unsigned PFTableIndex =
|
2006-04-17 05:28:54 +00:00
|
|
|
PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2006-04-17 05:28:54 +00:00
|
|
|
unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
|
|
|
|
unsigned Cost = (PFEntry >> 30);
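// For illustration (assuming the packing used by PPCPerfectShuffle.h): each
// 32-bit PFEntry carries its cost in the top two bits, so Cost here is in
// the range 0-3, with the operation kind and the two source shuffle IDs
// packed in the lower bits. The table index is just the four base-9 element
// digits computed above; e.g. the identity mask <0,1,2,3> yields
// 0*729 + 1*81 + 2*9 + 3 == 102.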
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2006-04-17 05:28:54 +00:00
|
|
|
// Determining when to avoid vperm is tricky. Many things affect the cost
|
|
|
|
// of vperm, particularly how many times the perm mask needs to be computed.
|
|
|
|
// For example, if the perm mask can be hoisted out of a loop or is already
|
|
|
|
// used (perhaps because there are multiple permutes with the same shuffle
|
|
|
|
// mask?) the vperm has a cost of 1. OTOH, hoisting the permute mask out of
|
|
|
|
// the loop requires an extra register.
|
|
|
|
//
|
|
|
|
// As a compromise, we only emit discrete instructions if the shuffle can be
|
2009-02-17 22:15:04 +00:00
|
|
|
// generated in 3 or fewer operations. When we have loop information
|
2006-04-17 05:28:54 +00:00
|
|
|
// available, if this block is within a loop, we should avoid using vperm
|
|
|
|
// for 3-operation perms and use a constant pool load instead.
|
2009-02-17 22:15:04 +00:00
|
|
|
if (Cost < 3)
|
2009-02-06 01:31:28 +00:00
|
|
|
return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
|
2006-04-17 05:28:54 +00:00
|
|
|
}
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2006-04-14 05:19:18 +00:00
|
|
|
// Lower this to a VPERM(V1, V2, V3) expression, where V3 is a constant
|
|
|
|
// vector that will get spilled to the constant pool.
|
|
|
|
if (V2.getOpcode() == ISD::UNDEF) V2 = V1;
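// (vperm needs two defined inputs; with V2 == V1, any mask byte in the
// 16..31 range simply reads V1 a second time, so the mask built below stays
// well-defined.)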
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2006-04-14 05:19:18 +00:00
|
|
|
// The SHUFFLE_VECTOR mask is almost exactly what we want for vperm, except
|
|
|
|
// that it is in input element units, not in bytes. Convert now.
|
2014-06-06 14:06:26 +00:00
|
|
|
|
|
|
|
// For little endian, the order of the input vectors is reversed, and
|
|
|
|
// the permutation mask is complemented with respect to 31. This is
|
|
|
|
// necessary to produce proper semantics with the big-endian-biased vperm
|
|
|
|
// instruction.
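// As a concrete example (for v4i32, where BytesPerElement == 4): element i
// drawn from source element SrcElt contributes the mask bytes SrcElt*4+j for
// j = 0..3 on big-endian targets; on little-endian targets the same byte
// becomes 31 - (SrcElt*4+j), which simultaneously swaps the two 16-byte
// inputs and reverses the byte numbering within them.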
|
2009-08-10 22:56:29 +00:00
|
|
|
EVT EltVT = V1.getValueType().getVectorElementType();
|
2008-06-06 12:08:01 +00:00
|
|
|
unsigned BytesPerElement = EltVT.getSizeInBits()/8;
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2008-07-27 21:46:04 +00:00
|
|
|
SmallVector<SDValue, 16> ResultMask;
|
2009-04-27 18:41:29 +00:00
|
|
|
for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
|
|
|
|
unsigned SrcElt = PermMask[i] < 0 ? 0 : PermMask[i];
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2006-04-14 05:19:18 +00:00
|
|
|
for (unsigned j = 0; j != BytesPerElement; ++j)
|
2014-06-06 14:06:26 +00:00
|
|
|
if (isLittleEndian)
|
|
|
|
ResultMask.push_back(DAG.getConstant(31 - (SrcElt*BytesPerElement+j),
|
|
|
|
MVT::i32));
|
|
|
|
else
|
|
|
|
ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement+j,
|
|
|
|
MVT::i32));
|
2006-04-14 05:19:18 +00:00
|
|
|
}
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2009-08-11 20:47:22 +00:00
|
|
|
SDValue VPermMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i8,
|
2014-04-26 18:35:24 +00:00
|
|
|
ResultMask);
|
2014-06-06 14:06:26 +00:00
|
|
|
if (isLittleEndian)
|
|
|
|
return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(),
|
|
|
|
V2, V1, VPermMask);
|
|
|
|
else
|
|
|
|
return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(),
|
|
|
|
V1, V2, VPermMask);
|
2006-04-14 05:19:18 +00:00
|
|
|
}
|
|
|
|
|
Implement an important entry from README_ALTIVEC:
If an altivec predicate compare is used immediately by a branch, don't
use a (serializing) MFCR instruction to read the CR6 register, which requires
a compare to get it back to CR's. Instead, just branch on CR6 directly. :)
For example, for:
void foo2(vector float *A, vector float *B) {
if (!vec_any_eq(*A, *B))
*B = (vector float){0,0,0,0};
}
We now generate:
_foo2:
mfspr r2, 256
oris r5, r2, 12288
mtspr 256, r5
lvx v2, 0, r4
lvx v3, 0, r3
vcmpeqfp. v2, v3, v2
bne cr6, LBB1_2 ; UnifiedReturnBlock
LBB1_1: ; cond_true
vxor v2, v2, v2
stvx v2, 0, r4
mtspr 256, r2
blr
LBB1_2: ; UnifiedReturnBlock
mtspr 256, r2
blr
instead of:
_foo2:
mfspr r2, 256
oris r5, r2, 12288
mtspr 256, r5
lvx v2, 0, r4
lvx v3, 0, r3
vcmpeqfp. v2, v3, v2
mfcr r3, 2
rlwinm r3, r3, 27, 31, 31
cmpwi cr0, r3, 0
beq cr0, LBB1_2 ; UnifiedReturnBlock
LBB1_1: ; cond_true
vxor v2, v2, v2
stvx v2, 0, r4
mtspr 256, r2
blr
LBB1_2: ; UnifiedReturnBlock
mtspr 256, r2
blr
This implements CodeGen/PowerPC/vec_br_cmp.ll.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@27804 91177308-0d34-0410-b5e6-96231b3b80d8
2006-04-18 17:59:36 +00:00
|
|
|
/// getAltivecCompareInfo - Given an intrinsic, return false if it is not an
|
|
|
|
/// altivec comparison. If it is, return true and fill in Opc/isDot with
|
|
|
|
/// information about the intrinsic.
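/// The CompareOpc values below appear to be the VC-form extended-opcode
/// fields of the corresponding vcmp* instructions (e.g. 198 for vcmpeqfp),
/// with the "_p" predicate intrinsics mapping to the dot (record) forms that
/// also set CR6.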
|
2008-07-27 21:46:04 +00:00
|
|
|
static bool getAltivecCompareInfo(SDValue Intrin, int &CompareOpc,
|
2006-04-18 17:59:36 +00:00
|
|
|
bool &isDot) {
|
2008-09-12 16:56:44 +00:00
|
|
|
unsigned IntrinsicID =
|
|
|
|
cast<ConstantSDNode>(Intrin.getOperand(0))->getZExtValue();
|
2006-04-18 17:59:36 +00:00
|
|
|
CompareOpc = -1;
|
|
|
|
isDot = false;
|
|
|
|
switch (IntrinsicID) {
|
|
|
|
default: return false;
|
|
|
|
// Comparison predicates.
|
2006-04-14 06:01:58 +00:00
|
|
|
case Intrinsic::ppc_altivec_vcmpbfp_p: CompareOpc = 966; isDot = 1; break;
|
|
|
|
case Intrinsic::ppc_altivec_vcmpeqfp_p: CompareOpc = 198; isDot = 1; break;
|
|
|
|
case Intrinsic::ppc_altivec_vcmpequb_p: CompareOpc = 6; isDot = 1; break;
|
|
|
|
case Intrinsic::ppc_altivec_vcmpequh_p: CompareOpc = 70; isDot = 1; break;
|
|
|
|
case Intrinsic::ppc_altivec_vcmpequw_p: CompareOpc = 134; isDot = 1; break;
|
|
|
|
case Intrinsic::ppc_altivec_vcmpgefp_p: CompareOpc = 454; isDot = 1; break;
|
|
|
|
case Intrinsic::ppc_altivec_vcmpgtfp_p: CompareOpc = 710; isDot = 1; break;
|
|
|
|
case Intrinsic::ppc_altivec_vcmpgtsb_p: CompareOpc = 774; isDot = 1; break;
|
|
|
|
case Intrinsic::ppc_altivec_vcmpgtsh_p: CompareOpc = 838; isDot = 1; break;
|
|
|
|
case Intrinsic::ppc_altivec_vcmpgtsw_p: CompareOpc = 902; isDot = 1; break;
|
|
|
|
case Intrinsic::ppc_altivec_vcmpgtub_p: CompareOpc = 518; isDot = 1; break;
|
|
|
|
case Intrinsic::ppc_altivec_vcmpgtuh_p: CompareOpc = 582; isDot = 1; break;
|
|
|
|
case Intrinsic::ppc_altivec_vcmpgtuw_p: CompareOpc = 646; isDot = 1; break;
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2006-04-14 06:01:58 +00:00
|
|
|
// Normal Comparisons.
|
|
|
|
case Intrinsic::ppc_altivec_vcmpbfp: CompareOpc = 966; isDot = 0; break;
|
|
|
|
case Intrinsic::ppc_altivec_vcmpeqfp: CompareOpc = 198; isDot = 0; break;
|
|
|
|
case Intrinsic::ppc_altivec_vcmpequb: CompareOpc = 6; isDot = 0; break;
|
|
|
|
case Intrinsic::ppc_altivec_vcmpequh: CompareOpc = 70; isDot = 0; break;
|
|
|
|
case Intrinsic::ppc_altivec_vcmpequw: CompareOpc = 134; isDot = 0; break;
|
|
|
|
case Intrinsic::ppc_altivec_vcmpgefp: CompareOpc = 454; isDot = 0; break;
|
|
|
|
case Intrinsic::ppc_altivec_vcmpgtfp: CompareOpc = 710; isDot = 0; break;
|
|
|
|
case Intrinsic::ppc_altivec_vcmpgtsb: CompareOpc = 774; isDot = 0; break;
|
|
|
|
case Intrinsic::ppc_altivec_vcmpgtsh: CompareOpc = 838; isDot = 0; break;
|
|
|
|
case Intrinsic::ppc_altivec_vcmpgtsw: CompareOpc = 902; isDot = 0; break;
|
|
|
|
case Intrinsic::ppc_altivec_vcmpgtub: CompareOpc = 518; isDot = 0; break;
|
|
|
|
case Intrinsic::ppc_altivec_vcmpgtuh: CompareOpc = 582; isDot = 0; break;
|
|
|
|
case Intrinsic::ppc_altivec_vcmpgtuw: CompareOpc = 646; isDot = 0; break;
|
2005-09-06 22:03:27 +00:00
|
|
|
}
|
2006-04-18 17:59:36 +00:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// LowerINTRINSIC_WO_CHAIN - If this is an intrinsic that we want to custom
|
|
|
|
/// lower, do it, otherwise return null.
|
2009-02-17 22:15:04 +00:00
|
|
|
SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
|
2010-04-17 15:26:15 +00:00
|
|
|
SelectionDAG &DAG) const {
|
2006-04-18 17:59:36 +00:00
|
|
|
// If this is a lowered altivec predicate compare, CompareOpc is set to the
|
|
|
|
// opcode number of the comparison.
|
2013-05-25 02:42:55 +00:00
|
|
|
SDLoc dl(Op);
|
2006-04-18 17:59:36 +00:00
|
|
|
int CompareOpc;
|
|
|
|
bool isDot;
|
|
|
|
if (!getAltivecCompareInfo(Op, CompareOpc, isDot))
|
2008-07-27 21:46:04 +00:00
|
|
|
return SDValue(); // Don't custom lower most intrinsics.
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2006-04-18 17:59:36 +00:00
|
|
|
// If this is a non-dot comparison, make the VCMP node and we are done.
|
2006-04-14 06:01:58 +00:00
|
|
|
if (!isDot) {
|
2009-02-05 22:07:54 +00:00
|
|
|
SDValue Tmp = DAG.getNode(PPCISD::VCMP, dl, Op.getOperand(2).getValueType(),
|
2010-03-14 22:44:11 +00:00
|
|
|
Op.getOperand(1), Op.getOperand(2),
|
|
|
|
DAG.getConstant(CompareOpc, MVT::i32));
|
2010-11-23 03:31:01 +00:00
|
|
|
return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Tmp);
|
2006-04-14 06:01:58 +00:00
|
|
|
}
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2006-04-14 06:01:58 +00:00
|
|
|
// Create the PPCISD altivec 'dot' comparison node.
|
2008-07-27 21:46:04 +00:00
|
|
|
SDValue Ops[] = {
|
2006-08-11 17:18:05 +00:00
|
|
|
Op.getOperand(2), // LHS
|
|
|
|
Op.getOperand(3), // RHS
|
2009-08-11 20:47:22 +00:00
|
|
|
DAG.getConstant(CompareOpc, MVT::i32)
|
2006-08-11 17:18:05 +00:00
|
|
|
};
|
2013-03-07 20:33:29 +00:00
|
|
|
EVT VTs[] = { Op.getOperand(2).getValueType(), MVT::Glue };
|
2014-04-26 18:35:24 +00:00
|
|
|
SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops);
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2006-04-14 06:01:58 +00:00
|
|
|
// Now that we have the comparison, emit a copy from the CR to a GPR.
|
|
|
|
// This is flagged to the above dot comparison.
|
[PowerPC] Always use mfocrf if available
When accessing just a single CR register, it is always preferable to
use mfocrf instead of mfcr, if the former is available on the CPU.
Current code makes that distinction in many, but not all places
where a single CR register value is retrieved. One missing
location is PPCRegisterInfo::lowerCRSpilling.
To fix this and make this simpler in the future, this patch changes
the bulk of the back-end to always assume mfocrf is available and
simply generate it when needed.
On machines that actually do not support mfocrf, the instruction
is replaced by mfcr at the very end, in EmitInstruction.
This has the additional benefit that we no longer need the
MFCRpseud hack, since before EmitInstruction we always have
a MFOCRF instruction pattern, which already models data flow
as required.
The patch also adds the MFOCRF8 version of the instruction,
which was missing so far.
Except for the PPCRegisterInfo::lowerCRSpilling case, no change
in generated code intended.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@185556 91177308-0d34-0410-b5e6-96231b3b80d8
2013-07-03 17:05:42 +00:00
|
|
|
SDValue Flags = DAG.getNode(PPCISD::MFOCRF, dl, MVT::i32,
|
2009-08-11 20:47:22 +00:00
|
|
|
DAG.getRegister(PPC::CR6, MVT::i32),
|
2009-02-17 22:15:04 +00:00
|
|
|
CompNode.getValue(1));
|
|
|
|
|
2006-04-14 06:01:58 +00:00
|
|
|
// Unpack the result based on how the target uses it.
|
|
|
|
unsigned BitNo; // Bit # of CR6.
|
|
|
|
bool InvertBit; // Invert result?
|
2008-09-12 16:56:44 +00:00
|
|
|
switch (cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()) {
|
2006-04-14 06:01:58 +00:00
|
|
|
default: // Can't happen, don't crash on invalid number though.
|
|
|
|
case 0: // Return the value of the EQ bit of CR6.
|
|
|
|
BitNo = 0; InvertBit = false;
|
2006-03-24 07:53:47 +00:00
|
|
|
break;
|
2006-04-14 06:01:58 +00:00
|
|
|
case 1: // Return the inverted value of the EQ bit of CR6.
|
|
|
|
BitNo = 0; InvertBit = true;
|
2005-08-31 21:09:52 +00:00
|
|
|
break;
|
2006-04-14 06:01:58 +00:00
|
|
|
case 2: // Return the value of the LT bit of CR6.
|
|
|
|
BitNo = 2; InvertBit = false;
|
|
|
|
break;
|
|
|
|
case 3: // Return the inverted value of the LT bit of CR6.
|
|
|
|
BitNo = 2; InvertBit = true;
|
Codegen
bool %test(int %X) {
%Y = seteq int %X, 13
ret bool %Y
}
as
_test:
addi r2, r3, -13
cntlzw r2, r2
srwi r3, r2, 5
blr
rather than
_test:
cmpwi cr7, r3, 13
mfcr r2
rlwinm r3, r2, 31, 31, 31
blr
This has very little effect on most code, but speeds up analyzer 23% and
mason 11%
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@25848 91177308-0d34-0410-b5e6-96231b3b80d8
2006-01-31 08:17:29 +00:00
|
|
|
break;
|
|
|
|
}
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2006-04-14 06:01:58 +00:00
|
|
|
// Shift the bit into the low position.
|
2009-08-11 20:47:22 +00:00
|
|
|
Flags = DAG.getNode(ISD::SRL, dl, MVT::i32, Flags,
|
|
|
|
DAG.getConstant(8-(3-BitNo), MVT::i32));
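// To see why the shift amount is 8-(3-BitNo): MFOCRF leaves the four CR6
// bits in bits 4-7 of the result (counting from the LSB), with LT at bit 7
// down to SO at bit 4. Shifting right by 5+BitNo therefore brings the
// requested bit to position 0: BitNo == 0 (EQ) shifts by 5, and BitNo == 2
// (LT) shifts by 7. (This is a reading of the constants above, not new
// logic.)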
|
2006-04-14 06:01:58 +00:00
|
|
|
// Isolate the bit.
|
2009-08-11 20:47:22 +00:00
|
|
|
Flags = DAG.getNode(ISD::AND, dl, MVT::i32, Flags,
|
|
|
|
DAG.getConstant(1, MVT::i32));
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2006-04-14 06:01:58 +00:00
|
|
|
// If we are supposed to, toggle the bit.
|
|
|
|
if (InvertBit)
|
2009-08-11 20:47:22 +00:00
|
|
|
Flags = DAG.getNode(ISD::XOR, dl, MVT::i32, Flags,
|
|
|
|
DAG.getConstant(1, MVT::i32));
|
2006-04-14 06:01:58 +00:00
|
|
|
return Flags;
|
|
|
|
}
|
|
|
|
|
2014-03-30 13:22:59 +00:00
|
|
|
SDValue PPCTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
|
|
|
|
SelectionDAG &DAG) const {
|
|
|
|
SDLoc dl(Op);
|
|
|
|
// For v2i64 (VSX), we can pattern match the v2i32 case (using fp <-> int
|
|
|
|
// instructions), but for smaller types, we need to first extend up to v2i32
|
|
|
|
// before going any further.
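// For example (a sketch of the intent, not additional lowering): extending
// the low i16 of each v2i64 lane is done as a v4i32 SIGN_EXTEND_INREG from
// v4i16, widening i16 -> i32 within each 32-bit half, followed by a v2i64
// SIGN_EXTEND_INREG from v2i32 to propagate the sign through the upper word.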
|
|
|
|
if (Op.getValueType() == MVT::v2i64) {
|
|
|
|
EVT ExtVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
|
|
|
|
if (ExtVT != MVT::v2i32) {
|
|
|
|
Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op.getOperand(0));
|
|
|
|
Op = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, Op,
|
|
|
|
DAG.getValueType(EVT::getVectorVT(*DAG.getContext(),
|
|
|
|
ExtVT.getVectorElementType(), 4)));
|
|
|
|
Op = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Op);
|
|
|
|
Op = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v2i64, Op,
|
|
|
|
DAG.getValueType(MVT::v2i32));
|
|
|
|
}
|
|
|
|
|
|
|
|
return Op;
|
|
|
|
}
|
|
|
|
|
|
|
|
return SDValue();
|
|
|
|
}
|
|
|
|
|
2009-02-17 22:15:04 +00:00
|
|
|
SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
|
2010-04-17 15:26:15 +00:00
|
|
|
SelectionDAG &DAG) const {
|
2013-05-25 02:42:55 +00:00
|
|
|
SDLoc dl(Op);
|
2006-04-14 06:01:58 +00:00
|
|
|
// Create a stack slot that is 16-byte aligned.
|
|
|
|
MachineFrameInfo *FrameInfo = DAG.getMachineFunction().getFrameInfo();
|
2009-11-12 20:49:22 +00:00
|
|
|
int FrameIdx = FrameInfo->CreateStackObject(16, 16, false);
|
2010-05-03 22:59:34 +00:00
|
|
|
EVT PtrVT = getPointerTy();
|
2008-07-27 21:46:04 +00:00
|
|
|
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2006-04-14 06:01:58 +00:00
|
|
|
// Store the input value into Value#0 of the stack slot.
|
2009-02-04 20:06:27 +00:00
|
|
|
SDValue Store = DAG.getStore(DAG.getEntryNode(), dl,
|
2010-09-21 18:41:36 +00:00
|
|
|
Op.getOperand(0), FIdx, MachinePointerInfo(),
|
2010-02-15 16:56:53 +00:00
|
|
|
false, false, 0);
|
2006-04-14 06:01:58 +00:00
|
|
|
// Load it out.
|
2010-09-21 06:44:06 +00:00
|
|
|
return DAG.getLoad(Op.getValueType(), dl, Store, FIdx, MachinePointerInfo(),
|
2011-11-08 18:42:53 +00:00
|
|
|
false, false, false, 0);
|
2006-04-14 06:01:58 +00:00
|
|
|
}
|
|
|
|
|
2010-04-17 15:26:15 +00:00
|
|
|
SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
|
2013-05-25 02:42:55 +00:00
|
|
|
SDLoc dl(Op);
|
2009-08-11 20:47:22 +00:00
|
|
|
if (Op.getValueType() == MVT::v4i32) {
|
2008-07-27 21:46:04 +00:00
|
|
|
SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2009-08-11 20:47:22 +00:00
|
|
|
SDValue Zero = BuildSplatI( 0, 1, MVT::v4i32, DAG, dl);
|
|
|
|
SDValue Neg16 = BuildSplatI(-16, 4, MVT::v4i32, DAG, dl); // +16 as shift amt.
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2008-07-27 21:46:04 +00:00
|
|
|
SDValue RHSSwap = // = vrlw RHS, 16
|
2009-02-06 01:31:28 +00:00
|
|
|
BuildIntrinsicOp(Intrinsic::ppc_altivec_vrlw, RHS, Neg16, DAG, dl);
|
2009-02-17 22:15:04 +00:00
|
|
|
|
Lower v8i16 multiply into this code:
li r5, lo16(LCPI1_0)
lis r6, ha16(LCPI1_0)
lvx v4, r6, r5
vmulouh v5, v3, v2
vmuleuh v2, v3, v2
vperm v2, v2, v5, v4
where v4 is:
LCPI1_0: ; <16 x ubyte>
.byte 2
.byte 3
.byte 18
.byte 19
.byte 6
.byte 7
.byte 22
.byte 23
.byte 10
.byte 11
.byte 26
.byte 27
.byte 14
.byte 15
.byte 30
.byte 31
This is 5.07x faster on the G5 (measured) than lowering to scalar code +
loads/stores.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@27789 91177308-0d34-0410-b5e6-96231b3b80d8
2006-04-18 03:43:48 +00:00
|
|
|
// Shrinkify inputs to v8i16.
|
2010-11-23 03:31:01 +00:00
|
|
|
LHS = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, LHS);
|
|
|
|
RHS = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, RHS);
|
|
|
|
RHSSwap = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, RHSSwap);
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2006-04-18 03:43:48 +00:00
|
|
|
// Low parts multiplied together, generating 32-bit results (we ignore the
|
|
|
|
// top parts).
|
2008-07-27 21:46:04 +00:00
|
|
|
SDValue LoProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmulouh,
|
2009-08-11 20:47:22 +00:00
|
|
|
LHS, RHS, DAG, dl, MVT::v4i32);
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2008-07-27 21:46:04 +00:00
|
|
|
SDValue HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmsumuhm,
|
2009-08-11 20:47:22 +00:00
|
|
|
LHS, RHSSwap, Zero, DAG, dl, MVT::v4i32);
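// The decomposition at work: writing each 32-bit lane as a = ah*2^16 + al
// and b = bh*2^16 + bl, the truncated product is
//   a*b mod 2^32 == al*bl + ((al*bh + ah*bl) << 16).
// vmulouh supplies the al*bl terms, and vmsumuhm against the
// halfword-rotated RHS supplies al*bh + ah*bl in a single instruction.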
|
2006-04-18 03:43:48 +00:00
|
|
|
// Shift the high parts up 16 bits.
|
2009-02-17 22:15:04 +00:00
|
|
|
HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, HiProd,
|
2009-02-06 01:31:28 +00:00
|
|
|
Neg16, DAG, dl);
|
2009-08-11 20:47:22 +00:00
|
|
|
return DAG.getNode(ISD::ADD, dl, MVT::v4i32, LoProd, HiProd);
|
|
|
|
} else if (Op.getValueType() == MVT::v8i16) {
|
2008-07-27 21:46:04 +00:00
|
|
|
SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2009-08-11 20:47:22 +00:00
|
|
|
SDValue Zero = BuildSplatI(0, 1, MVT::v8i16, DAG, dl);
|
2006-04-18 04:28:57 +00:00
|
|
|
|
|
|
|
return BuildIntrinsicOp(Intrinsic::ppc_altivec_vmladduhm,
|
2009-02-06 01:31:28 +00:00
|
|
|
LHS, RHS, Zero, DAG, dl);
|
2009-08-11 20:47:22 +00:00
|
|
|
} else if (Op.getValueType() == MVT::v16i8) {
|
2008-07-27 21:46:04 +00:00
|
|
|
SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
|
2014-06-12 22:38:18 +00:00
|
|
|
bool isLittleEndian = Subtarget.isLittleEndian();
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2006-04-18 03:57:35 +00:00
|
|
|
// Multiply the even 8-bit parts, producing 16-bit sums.
|
2008-07-27 21:46:04 +00:00
|
|
|
SDValue EvenParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuleub,
|
2009-08-11 20:47:22 +00:00
|
|
|
LHS, RHS, DAG, dl, MVT::v8i16);
|
2010-11-23 03:31:01 +00:00
|
|
|
EvenParts = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, EvenParts);
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2006-04-18 03:57:35 +00:00
|
|
|
// Multiply the odd 8-bit parts, producing 16-bit sums.
|
2008-07-27 21:46:04 +00:00
|
|
|
SDValue OddParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuloub,
|
2009-08-11 20:47:22 +00:00
|
|
|
LHS, RHS, DAG, dl, MVT::v8i16);
|
2010-11-23 03:31:01 +00:00
|
|
|
OddParts = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OddParts);
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2014-06-09 16:06:29 +00:00
|
|
|
// Merge the results together. Because vmuleub and vmuloub are
|
|
|
|
// instructions with a big-endian bias, we must reverse the
|
|
|
|
// element numbering and reverse the meaning of "odd" and "even"
|
|
|
|
// when generating little endian code.
|
2009-04-27 18:41:29 +00:00
|
|
|
int Ops[16];
|
2006-04-18 03:57:35 +00:00
|
|
|
for (unsigned i = 0; i != 8; ++i) {
|
2014-06-09 16:06:29 +00:00
|
|
|
if (isLittleEndian) {
|
|
|
|
Ops[i*2 ] = 2*i;
|
|
|
|
Ops[i*2+1] = 2*i+16;
|
|
|
|
} else {
|
|
|
|
Ops[i*2 ] = 2*i+1;
|
|
|
|
Ops[i*2+1] = 2*i+1+16;
|
|
|
|
}
|
2006-04-18 03:57:35 +00:00
|
|
|
}
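// Reading off the mask: each 16-bit product keeps its 8-bit result in its
// low byte, which on a big-endian target is byte 2*i+1 of the product
// vector, so Ops interleaves byte 2*i+1 of EvenParts with byte 2*i+1+16 of
// OddParts. The little-endian case picks bytes 2*i instead and swaps which
// input vector plays the "even" role.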
|
2014-06-09 16:06:29 +00:00
|
|
|
if (isLittleEndian)
|
|
|
|
return DAG.getVectorShuffle(MVT::v16i8, dl, OddParts, EvenParts, Ops);
|
|
|
|
else
|
|
|
|
return DAG.getVectorShuffle(MVT::v16i8, dl, EvenParts, OddParts, Ops);
|
2006-04-18 03:43:48 +00:00
|
|
|
} else {
|
2009-07-14 16:55:14 +00:00
|
|
|
llvm_unreachable("Unknown mul to lower!");
|
2006-04-18 03:43:48 +00:00
|
|
|
}
|
2006-04-18 03:24:30 +00:00
|
|
|
}
|
|
|
|
|
2006-04-14 06:01:58 +00:00
|
|
|
/// LowerOperation - Provide custom lowering hooks for some operations.
|
|
|
|
///
|
2010-04-17 15:26:15 +00:00
|
|
|
SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
|
2006-04-14 06:01:58 +00:00
|
|
|
switch (Op.getOpcode()) {
|
2009-07-14 16:55:14 +00:00
|
|
|
default: llvm_unreachable("Wasn't expecting to be able to lower this!");
|
2006-04-14 06:01:58 +00:00
|
|
|
case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
|
2009-11-04 21:31:18 +00:00
|
|
|
case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
|
2006-04-14 06:01:58 +00:00
|
|
|
case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
|
2012-06-04 17:36:38 +00:00
|
|
|
case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
|
2006-04-22 18:53:45 +00:00
|
|
|
case ISD::JumpTable: return LowerJumpTable(Op, DAG);
|
2006-04-14 06:01:58 +00:00
|
|
|
case ISD::SETCC: return LowerSETCC(Op, DAG);
|
2011-09-06 13:37:06 +00:00
|
|
|
case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
|
|
|
|
case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
|
2009-02-17 22:15:04 +00:00
|
|
|
case ISD::VASTART:
|
2014-06-12 22:38:18 +00:00
|
|
|
return LowerVASTART(Op, DAG, Subtarget);
|
2009-02-17 22:15:04 +00:00
|
|
|
|
|
|
|
case ISD::VAARG:
|
2014-06-12 22:38:18 +00:00
|
|
|
return LowerVAARG(Op, DAG, Subtarget);
|
2007-04-03 13:59:52 +00:00
|
|
|
|
2013-07-25 21:36:47 +00:00
|
|
|
case ISD::VACOPY:
|
2014-06-12 22:38:18 +00:00
|
|
|
return LowerVACOPY(Op, DAG, Subtarget);
|
2013-07-25 21:36:47 +00:00
|
|
|
|
2014-06-12 22:38:18 +00:00
|
|
|
case ISD::STACKRESTORE: return LowerSTACKRESTORE(Op, DAG, Subtarget);
|
2007-02-25 05:34:32 +00:00
|
|
|
case ISD::DYNAMIC_STACKALLOC:
|
2014-06-12 22:38:18 +00:00
|
|
|
return LowerDYNAMIC_STACKALLOC(Op, DAG, Subtarget);
|
2008-04-19 01:30:48 +00:00
|
|
|
|
2013-03-21 21:37:52 +00:00
|
|
|
case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
|
|
|
|
case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
|
|
|
|
|
2014-02-28 00:27:01 +00:00
|
|
|
case ISD::LOAD: return LowerLOAD(Op, DAG);
|
|
|
|
case ISD::STORE: return LowerSTORE(Op, DAG);
|
|
|
|
case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
|
2006-04-14 06:01:58 +00:00
|
|
|
case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
|
2009-06-04 20:53:52 +00:00
|
|
|
case ISD::FP_TO_UINT:
|
|
|
|
case ISD::FP_TO_SINT: return LowerFP_TO_INT(Op, DAG,
|
2013-05-25 02:42:55 +00:00
|
|
|
SDLoc(Op));
|
2013-04-01 17:52:07 +00:00
|
|
|
case ISD::UINT_TO_FP:
|
|
|
|
case ISD::SINT_TO_FP: return LowerINT_TO_FP(Op, DAG);
|
2008-01-31 00:41:03 +00:00
|
|
|
case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
|
2006-04-14 06:01:58 +00:00
|
|
|
|
|
|
|
// Lower 64-bit shifts.
|
Legalize is no longer limited to cleverness with just constant shift amounts.
Allow it to be clever when possible and fall back to the gross code when needed.
This allows us to compile:
long long foo1(long long X, int C) {
return X << (C|32);
}
long long foo2(long long X, int C) {
return X << (C&~32);
}
to:
_foo1:
rlwinm r2, r5, 0, 27, 31
slw r3, r4, r2
li r4, 0
blr
.globl _foo2
.align 4
_foo2:
rlwinm r2, r5, 0, 27, 25
subfic r5, r2, 32
slw r3, r3, r2
srw r5, r4, r5
or r3, r3, r5
slw r4, r4, r2
blr
instead of:
_foo1:
ori r2, r5, 32
subfic r5, r2, 32
addi r6, r2, -32
srw r5, r4, r5
slw r3, r3, r2
slw r6, r4, r6
or r3, r3, r5
slw r4, r4, r2
or r3, r3, r6
blr
.globl _foo2
.align 4
_foo2:
rlwinm r2, r5, 0, 27, 25
subfic r5, r2, 32
addi r6, r2, -32
srw r5, r4, r5
slw r3, r3, r2
slw r6, r4, r6
or r3, r3, r5
slw r4, r4, r2
or r3, r3, r6
blr
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@30507 91177308-0d34-0410-b5e6-96231b3b80d8
2006-09-20 03:47:40 +00:00
|
|
|
case ISD::SHL_PARTS: return LowerSHL_PARTS(Op, DAG);
|
|
|
|
case ISD::SRL_PARTS: return LowerSRL_PARTS(Op, DAG);
|
|
|
|
case ISD::SRA_PARTS: return LowerSRA_PARTS(Op, DAG);
|
2006-04-14 06:01:58 +00:00
|
|
|
|
|
|
|
// Vector-related lowering.
|
|
|
|
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
|
|
|
|
case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
|
|
|
|
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
|
|
|
|
case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG);
|
2014-03-30 13:22:59 +00:00
|
|
|
case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
|
2006-04-18 03:24:30 +00:00
|
|
|
case ISD::MUL: return LowerMUL(Op, DAG);
|
2009-02-17 22:15:04 +00:00
|
|
|
|
Implement PPC counter loops as a late IR-level pass
The old PPCCTRLoops pass, like the Hexagon pass version from which it was
derived, could only handle some simple loops in canonical form. We cannot
directly adapt the new Hexagon hardware loops pass, however, because the
Hexagon pass contains a fundamental assumption that non-constant-trip-count
loops will contain a guard, and this is not always true (the result being that
incorrect negative counts can be generated). With this commit, we replace the
pass with a late IR-level pass which makes use of SE to calculate the
backedge-taken counts and safely generate the loop-count expressions (including
any necessary max() parts). This IR level pass inserts custom intrinsics that
are lowered into the desired decrement-and-branch instructions.
The most fragile part of this new implementation is that interfering uses of
the counter register must be detected on the IR level (and, on PPC, this also
includes any indirect branches in addition to function calls). Also, to make
all of this work, we need a variant of the mtctr instruction that is marked
as having side effects. Without this, machine-code level CSE, DCE, etc.
illegally transform the resulting code. Hopefully, this can be improved
in the future.
This new pass is smaller than the original (and much smaller than the new
Hexagon hardware loops pass), and can handle many additional cases correctly.
In addition, the preheader-creation code has been copied from LoopSimplify, and
after we decide on where it belongs, this code will be refactored so that it
can be explicitly shared (making this implementation even smaller).
The new test-case files ctrloop-{le,lt,ne}.ll have been adapted from tests for
the new Hexagon pass. There are a few classes of loops that this pass does not
transform (noted by FIXMEs in the files), but these deficiencies can be
addressed within the SE infrastructure (thus helping many other passes as well).
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@181927 91177308-0d34-0410-b5e6-96231b3b80d8
2013-05-15 21:37:41 +00:00
|
|
|
// For counter-based loop handling.
|
|
|
|
case ISD::INTRINSIC_W_CHAIN: return SDValue();
|
|
|
|
|
2007-12-08 06:59:59 +00:00
|
|
|
// Frame & Return address.
|
|
|
|
case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
|
2007-03-01 13:11:38 +00:00
|
|
|
case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
|
2005-08-31 20:23:54 +00:00
|
|
|
}
|
2005-08-26 00:52:45 +00:00
|
|
|
}
|
|
|
|
|
2008-12-01 11:39:25 +00:00
|
|
|
void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
|
|
|
|
SmallVectorImpl<SDValue> &Results,
|
2010-04-17 15:26:15 +00:00
|
|
|
SelectionDAG &DAG) const {
|
2011-06-28 15:30:42 +00:00
|
|
|
const TargetMachine &TM = getTargetMachine();
|
2013-05-25 02:42:55 +00:00
|
|
|
SDLoc dl(N);
|
2007-11-28 18:44:47 +00:00
|
|
|
switch (N->getOpcode()) {
|
2008-10-28 15:00:32 +00:00
|
|
|
default:
|
2012-02-07 02:50:20 +00:00
|
|
|
llvm_unreachable("Do not know how to custom type legalize this operation!");
|
2013-05-15 21:37:41 +00:00
|
|
|
case ISD::INTRINSIC_W_CHAIN: {
|
|
|
|
if (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() !=
|
|
|
|
Intrinsic::ppc_is_decremented_ctr_nonzero)
|
|
|
|
break;
|
|
|
|
|
|
|
|
assert(N->getValueType(0) == MVT::i1 &&
|
|
|
|
"Unexpected result type for CTR decrement intrinsic");
|
2013-05-18 00:21:46 +00:00
|
|
|
EVT SVT = getSetCCResultType(*DAG.getContext(), N->getValueType(0));
|
2013-05-15 21:37:41 +00:00
|
|
|
SDVTList VTs = DAG.getVTList(SVT, MVT::Other);
|
|
|
|
SDValue NewInt = DAG.getNode(N->getOpcode(), dl, VTs, N->getOperand(0),
|
|
|
|
N->getOperand(1));
|
|
|
|
|
|
|
|
Results.push_back(NewInt);
|
|
|
|
Results.push_back(NewInt.getValue(1));
|
|
|
|
break;
|
|
|
|
}
|
2011-06-28 15:30:42 +00:00
|
|
|
case ISD::VAARG: {
|
|
|
|
if (!TM.getSubtarget<PPCSubtarget>().isSVR4ABI()
|
|
|
|
|| TM.getSubtarget<PPCSubtarget>().isPPC64())
|
|
|
|
return;
|
|
|
|
|
|
|
|
EVT VT = N->getValueType(0);
|
|
|
|
|
|
|
|
if (VT == MVT::i64) {
|
2014-06-12 22:38:18 +00:00
|
|
|
SDValue NewNode = LowerVAARG(SDValue(N, 1), DAG, Subtarget);
|
2011-06-28 15:30:42 +00:00
|
|
|
|
|
|
|
Results.push_back(NewNode);
|
|
|
|
Results.push_back(NewNode.getValue(1));
|
|
|
|
}
|
|
|
|
return;
|
|
|
|
}
|
2008-12-01 11:39:25 +00:00
|
|
|
case ISD::FP_ROUND_INREG: {
|
2009-08-11 20:47:22 +00:00
|
|
|
assert(N->getValueType(0) == MVT::ppcf128);
|
|
|
|
assert(N->getOperand(0).getValueType() == MVT::ppcf128);
|
2009-02-17 22:15:04 +00:00
|
|
|
SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl,
|
2009-08-11 20:47:22 +00:00
|
|
|
MVT::f64, N->getOperand(0),
|
2008-12-01 11:39:25 +00:00
|
|
|
DAG.getIntPtrConstant(0));
|
2009-02-05 22:07:54 +00:00
|
|
|
SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl,
|
2009-08-11 20:47:22 +00:00
|
|
|
MVT::f64, N->getOperand(0),
|
2008-12-01 11:39:25 +00:00
|
|
|
DAG.getIntPtrConstant(1));
|
|
|
|
|
2013-03-26 10:56:22 +00:00
|
|
|
// Add the two halves of the long double in round-to-zero mode.
|
|
|
|
SDValue FPreg = DAG.getNode(PPCISD::FADDRTZ, dl, MVT::f64, Lo, Hi);
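// (ppcf128 is a double-double whose value is exactly Hi + Lo, so re-adding
// the halves collapses it to one f64. Round-to-zero is used so the collapse
// truncates rather than rounds up, which is presumably what the callers,
// e.g. fp-to-int lowering, rely on.)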
|
2008-12-01 11:39:25 +00:00
|
|
|
|
|
|
|
// We know the low half is about to be thrown away, so just use something
|
|
|
|
// convenient.
|
2009-08-11 20:47:22 +00:00
|
|
|
Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::ppcf128,
|
2009-02-05 22:07:54 +00:00
|
|
|
FPreg, FPreg));
|
2008-12-01 11:39:25 +00:00
|
|
|
return;
|
|
|
|
}
|
|
|
|
case ISD::FP_TO_SINT:
|
2013-07-09 18:50:20 +00:00
|
|
|
// LowerFP_TO_INT() can only handle f32 and f64.
|
|
|
|
if (N->getOperand(0).getValueType() == MVT::ppcf128)
|
|
|
|
return;
|
2009-06-04 20:53:52 +00:00
|
|
|
Results.push_back(LowerFP_TO_INT(SDValue(N, 0), DAG, dl));
|
2008-12-01 11:39:25 +00:00
|
|
|
return;
|
2007-11-28 18:44:47 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2006-04-14 06:01:58 +00:00
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
// Other Lowering Code
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
2008-08-25 22:34:37 +00:00
|
|
|
MachineBasicBlock *
|
|
|
|
PPCTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
|
2009-02-07 16:15:20 +00:00
|
|
|
bool is64bit, unsigned BinOpcode) const {
|
2008-08-29 18:29:46 +00:00
|
|
|
// This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
|
2008-08-25 22:34:37 +00:00
|
|
|
const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
|
|
|
|
|
|
|
|
const BasicBlock *LLVM_BB = BB->getBasicBlock();
|
|
|
|
MachineFunction *F = BB->getParent();
|
|
|
|
MachineFunction::iterator It = BB;
|
|
|
|
++It;
|
|
|
|
|
|
|
|
unsigned dest = MI->getOperand(0).getReg();
|
|
|
|
unsigned ptrA = MI->getOperand(1).getReg();
|
|
|
|
unsigned ptrB = MI->getOperand(2).getReg();
|
|
|
|
unsigned incr = MI->getOperand(3).getReg();
|
2009-02-13 02:27:39 +00:00
|
|
|
DebugLoc dl = MI->getDebugLoc();
|
2008-08-25 22:34:37 +00:00
|
|
|
|
|
|
|
MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB);
|
|
|
|
MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
|
|
|
|
F->insert(It, loopMBB);
|
|
|
|
F->insert(It, exitMBB);
|
2010-07-06 20:24:04 +00:00
|
|
|
exitMBB->splice(exitMBB->begin(), BB,
|
2014-03-02 12:27:27 +00:00
|
|
|
std::next(MachineBasicBlock::iterator(MI)), BB->end());
|
2010-07-06 20:24:04 +00:00
|
|
|
exitMBB->transferSuccessorsAndUpdatePHIs(BB);
|
2008-08-25 22:34:37 +00:00
|
|
|
|
|
|
|
MachineRegisterInfo &RegInfo = F->getRegInfo();
|
2008-08-29 18:29:46 +00:00
|
|
|
unsigned TmpReg = (!BinOpcode) ? incr :
|
|
|
|
RegInfo.createVirtualRegister(
|
2008-09-02 20:30:23 +00:00
|
|
|
is64bit ? (const TargetRegisterClass *) &PPC::G8RCRegClass :
|
|
|
|
(const TargetRegisterClass *) &PPC::GPRCRegClass);
|
2008-08-25 22:34:37 +00:00
|
|
|
|
|
|
|
// thisMBB:
|
|
|
|
// ...
|
|
|
|
// fallthrough --> loopMBB
|
|
|
|
BB->addSuccessor(loopMBB);
|
|
|
|
|
|
|
|
// loopMBB:
|
|
|
|
// l[wd]arx dest, ptr
|
|
|
|
// add r0, dest, incr
|
|
|
|
// st[wd]cx. r0, ptr
|
|
|
|
// bne- loopMBB
|
|
|
|
// fallthrough --> exitMBB
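// The l[wd]arx/st[wd]cx. pair implements PPC's load-reserve /
// store-conditional protocol: the store succeeds (setting CR0.EQ) only if
// the reservation taken by the load is still held, and the bne- above
// retries the whole loop on failure.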
|
|
|
|
BB = loopMBB;
|
2009-02-13 02:27:39 +00:00
|
|
|
BuildMI(BB, dl, TII->get(is64bit ? PPC::LDARX : PPC::LWARX), dest)
|
2008-08-25 22:34:37 +00:00
|
|
|
.addReg(ptrA).addReg(ptrB);
|
2008-08-29 18:29:46 +00:00
|
|
|
if (BinOpcode)
|
2009-02-13 02:27:39 +00:00
|
|
|
BuildMI(BB, dl, TII->get(BinOpcode), TmpReg).addReg(incr).addReg(dest);
|
|
|
|
BuildMI(BB, dl, TII->get(is64bit ? PPC::STDCX : PPC::STWCX))
|
2008-08-25 22:34:37 +00:00
|
|
|
.addReg(TmpReg).addReg(ptrA).addReg(ptrB);
|
2009-02-13 02:27:39 +00:00
|
|
|
BuildMI(BB, dl, TII->get(PPC::BCC))
|
2009-02-17 22:15:04 +00:00
|
|
|
.addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loopMBB);
|
2008-08-25 22:34:37 +00:00
|
|
|
BB->addSuccessor(loopMBB);
|
|
|
|
BB->addSuccessor(exitMBB);
|
|
|
|
|
|
|
|
// exitMBB:
|
|
|
|
// ...
|
|
|
|
BB = exitMBB;
|
|
|
|
return BB;
|
|
|
|
}
|
|
|
|
|
2008-08-28 17:53:09 +00:00
|
|
|
MachineBasicBlock *
|
2009-02-17 22:15:04 +00:00
|
|
|
PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr *MI,
|
2008-08-28 17:53:09 +00:00
|
|
|
MachineBasicBlock *BB,
|
|
|
|
bool is8bit, // operation
|
2009-02-07 16:15:20 +00:00
|
|
|
unsigned BinOpcode) const {
|
2008-08-29 18:29:46 +00:00
|
|
|
// This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
|
2008-08-28 17:53:09 +00:00
|
|
|
const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
|
|
|
|
// In 64-bit mode we have to use 64 bits for addresses, even though the
|
|
|
|
// lwarx/stwcx are 32 bits. With the 32-bit atomics we can use address
|
|
|
|
// registers without caring whether they're 32 or 64, but here we're
|
|
|
|
// doing actual arithmetic on the addresses.
|
2014-06-12 22:38:18 +00:00
|
|
|
bool is64bit = Subtarget.isPPC64();
|
2013-03-21 23:45:03 +00:00
|
|
|
unsigned ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO;
|
2008-08-28 17:53:09 +00:00
|
|
|
|
|
|
|
const BasicBlock *LLVM_BB = BB->getBasicBlock();
|
|
|
|
MachineFunction *F = BB->getParent();
|
|
|
|
MachineFunction::iterator It = BB;
|
|
|
|
++It;
|
|
|
|
|
|
|
|
unsigned dest = MI->getOperand(0).getReg();
|
|
|
|
unsigned ptrA = MI->getOperand(1).getReg();
|
|
|
|
unsigned ptrB = MI->getOperand(2).getReg();
|
|
|
|
unsigned incr = MI->getOperand(3).getReg();
|
2009-02-13 02:27:39 +00:00
|
|
|
DebugLoc dl = MI->getDebugLoc();
|
2008-08-28 17:53:09 +00:00
|
|
|
|
|
|
|
MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB);
|
|
|
|
MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
|
|
|
|
F->insert(It, loopMBB);
|
|
|
|
F->insert(It, exitMBB);
|
2010-07-06 20:24:04 +00:00
|
|
|
exitMBB->splice(exitMBB->begin(), BB,
|
2014-03-02 12:27:27 +00:00
|
|
|
std::next(MachineBasicBlock::iterator(MI)), BB->end());
|
2010-07-06 20:24:04 +00:00
|
|
|
exitMBB->transferSuccessorsAndUpdatePHIs(BB);
|
2008-08-28 17:53:09 +00:00
|
|
|
|
|
|
|
MachineRegisterInfo &RegInfo = F->getRegInfo();
|
2009-02-17 22:15:04 +00:00
|
|
|
const TargetRegisterClass *RC =
|
2008-09-02 20:30:23 +00:00
|
|
|
is64bit ? (const TargetRegisterClass *) &PPC::G8RCRegClass :
|
|
|
|
(const TargetRegisterClass *) &PPC::GPRCRegClass;
|
2008-08-28 17:53:09 +00:00
|
|
|
unsigned PtrReg = RegInfo.createVirtualRegister(RC);
|
|
|
|
unsigned Shift1Reg = RegInfo.createVirtualRegister(RC);
|
|
|
|
unsigned ShiftReg = RegInfo.createVirtualRegister(RC);
|
|
|
|
unsigned Incr2Reg = RegInfo.createVirtualRegister(RC);
|
|
|
|
unsigned MaskReg = RegInfo.createVirtualRegister(RC);
|
|
|
|
unsigned Mask2Reg = RegInfo.createVirtualRegister(RC);
|
|
|
|
unsigned Mask3Reg = RegInfo.createVirtualRegister(RC);
|
|
|
|
unsigned Tmp2Reg = RegInfo.createVirtualRegister(RC);
|
|
|
|
unsigned Tmp3Reg = RegInfo.createVirtualRegister(RC);
|
|
|
|
unsigned Tmp4Reg = RegInfo.createVirtualRegister(RC);
|
2008-08-29 18:29:46 +00:00
|
|
|
unsigned TmpDestReg = RegInfo.createVirtualRegister(RC);
|
2008-08-28 17:53:09 +00:00
|
|
|
unsigned Ptr1Reg;
|
2008-08-29 18:29:46 +00:00
|
|
|
unsigned TmpReg = (!BinOpcode) ? Incr2Reg : RegInfo.createVirtualRegister(RC);
|
2008-08-28 17:53:09 +00:00
|
|
|
|
|
|
|
// thisMBB:
|
|
|
|
// ...
|
|
|
|
// fallthrough --> loopMBB
|
|
|
|
BB->addSuccessor(loopMBB);
|
|
|
|
|
|
|
|
// The 4-byte load must be aligned, while a char or short may be
|
|
|
|
// anywhere in the word. Hence all this nasty bookkeeping code.
|
|
|
|
// add ptr1, ptrA, ptrB [copy if ptrA==0]
|
|
|
|
// rlwinm shift1, ptr1, 3, 27, 28 [3, 27, 27]
|
2008-09-02 20:30:23 +00:00
|
|
|
// xori shift, shift1, 24 [16]
|
2008-08-28 17:53:09 +00:00
|
|
|
// rlwinm ptr, ptr1, 0, 0, 29
|
|
|
|
// slw incr2, incr, shift
|
|
|
|
// li mask2, 255 [li mask3, 0; ori mask2, mask3, 65535]
|
|
|
|
// slw mask, mask2, shift
|
|
|
|
// loopMBB:
|
2008-08-30 00:08:53 +00:00
|
|
|
// lwarx tmpDest, ptr
|
2008-08-29 18:29:46 +00:00
|
|
|
// add tmp, tmpDest, incr2
|
|
|
|
// andc tmp2, tmpDest, mask
|
2008-08-28 17:53:09 +00:00
|
|
|
// and tmp3, tmp, mask
|
|
|
|
// or tmp4, tmp3, tmp2
|
2008-08-30 00:08:53 +00:00
|
|
|
// stwcx. tmp4, ptr
|
2008-08-28 17:53:09 +00:00
|
|
|
// bne- loopMBB
|
|
|
|
// fallthrough --> exitMBB
|
2008-08-29 18:29:46 +00:00
|
|
|
// srw dest, tmpDest, shift
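  // For example (big-endian): a byte at byte-offset 1 within its aligned
  // word yields shift1 = 8 and shift = 8 xor 24 = 16, so slw places the
  // operand in bits 16-23 of the word, which is exactly where that byte
  // lives, and the shifted mask selects only those bits.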
  if (ptrA != ZeroReg) {
    Ptr1Reg = RegInfo.createVirtualRegister(RC);
    BuildMI(BB, dl, TII->get(is64bit ? PPC::ADD8 : PPC::ADD4), Ptr1Reg)
      .addReg(ptrA).addReg(ptrB);
  } else {
    Ptr1Reg = ptrB;
  }
  BuildMI(BB, dl, TII->get(PPC::RLWINM), Shift1Reg).addReg(Ptr1Reg)
      .addImm(3).addImm(27).addImm(is8bit ? 28 : 27);
  BuildMI(BB, dl, TII->get(is64bit ? PPC::XORI8 : PPC::XORI), ShiftReg)
      .addReg(Shift1Reg).addImm(is8bit ? 24 : 16);
  if (is64bit)
    BuildMI(BB, dl, TII->get(PPC::RLDICR), PtrReg)
      .addReg(Ptr1Reg).addImm(0).addImm(61);
  else
    BuildMI(BB, dl, TII->get(PPC::RLWINM), PtrReg)
      .addReg(Ptr1Reg).addImm(0).addImm(0).addImm(29);
  BuildMI(BB, dl, TII->get(PPC::SLW), Incr2Reg)
      .addReg(incr).addReg(ShiftReg);
  if (is8bit)
    BuildMI(BB, dl, TII->get(PPC::LI), Mask2Reg).addImm(255);
  else {
    BuildMI(BB, dl, TII->get(PPC::LI), Mask3Reg).addImm(0);
    BuildMI(BB, dl, TII->get(PPC::ORI), Mask2Reg)
      .addReg(Mask3Reg).addImm(65535);
  }
  BuildMI(BB, dl, TII->get(PPC::SLW), MaskReg)
      .addReg(Mask2Reg).addReg(ShiftReg);

  BB = loopMBB;
  BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg)
    .addReg(ZeroReg).addReg(PtrReg);
  if (BinOpcode)
    BuildMI(BB, dl, TII->get(BinOpcode), TmpReg)
      .addReg(Incr2Reg).addReg(TmpDestReg);
  BuildMI(BB, dl, TII->get(is64bit ? PPC::ANDC8 : PPC::ANDC), Tmp2Reg)
    .addReg(TmpDestReg).addReg(MaskReg);
  BuildMI(BB, dl, TII->get(is64bit ? PPC::AND8 : PPC::AND), Tmp3Reg)
    .addReg(TmpReg).addReg(MaskReg);
  BuildMI(BB, dl, TII->get(is64bit ? PPC::OR8 : PPC::OR), Tmp4Reg)
    .addReg(Tmp3Reg).addReg(Tmp2Reg);
  BuildMI(BB, dl, TII->get(PPC::STWCX))
    .addReg(Tmp4Reg).addReg(ZeroReg).addReg(PtrReg);
  BuildMI(BB, dl, TII->get(PPC::BCC))
    .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loopMBB);
  BB->addSuccessor(loopMBB);
  BB->addSuccessor(exitMBB);

  //  exitMBB:
  //   ...
  BB = exitMBB;
  BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW), dest).addReg(TmpDestReg)
    .addReg(ShiftReg);
  return BB;
}

llvm::MachineBasicBlock*
PPCTargetLowering::emitEHSjLjSetJmp(MachineInstr *MI,
                                    MachineBasicBlock *MBB) const {
  DebugLoc DL = MI->getDebugLoc();
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();

  MachineFunction *MF = MBB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  const BasicBlock *BB = MBB->getBasicBlock();
  MachineFunction::iterator I = MBB;
  ++I;

  // Memory Reference
  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();

  unsigned DstReg = MI->getOperand(0).getReg();
  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
  assert(RC->hasType(MVT::i32) && "Invalid destination!");
  unsigned mainDstReg = MRI.createVirtualRegister(RC);
  unsigned restoreDstReg = MRI.createVirtualRegister(RC);

  MVT PVT = getPointerTy();
  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
         "Invalid Pointer Size!");
  // For v = setjmp(buf), we generate
  //
  // thisMBB:
  //  SjLjSetup mainMBB
  //  bl mainMBB
  //  v_restore = 1
  //  b sinkMBB
  //
  // mainMBB:
  //  buf[LabelOffset] = LR
  //  v_main = 0
  //
  // sinkMBB:
  //  v = phi(main, restore)
  //

  MachineBasicBlock *thisMBB = MBB;
  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
  MF->insert(I, mainMBB);
  MF->insert(I, sinkMBB);

  MachineInstrBuilder MIB;

  // Transfer the remainder of BB and its successor edges to sinkMBB.
  sinkMBB->splice(sinkMBB->begin(), MBB,
                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

  // Note that the structure of the jmp_buf used here is not compatible
  // with that used by libc, and is not designed to be.  Specifically, it
  // stores only those 'reserved' registers that LLVM does not otherwise
  // understand how to spill.  Also, by convention, by the time this
  // intrinsic is called, Clang has already stored the frame address in the
  // first slot of the buffer and the stack address in the third.  Following
  // the X86 target code, we'll store the jump address in the second slot.
  // We also need to save the TOC pointer (R2) to handle jumps between shared
  // libraries, and that will be stored in the fourth slot.  The thread
  // identifier (R13) is not affected.

  // thisMBB:
  const int64_t LabelOffset = 1 * PVT.getStoreSize();
  const int64_t TOCOffset   = 3 * PVT.getStoreSize();
  const int64_t BPOffset    = 4 * PVT.getStoreSize();
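  // The resulting buffer layout, in pointer-sized slots, is thus:
  //   buf[0] frame address (stored by the front end)
  //   buf[1] jump (return) address
  //   buf[2] stack address (stored by the front end)
  //   buf[3] TOC pointer
  //   buf[4] base pointer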

  // Prepare the IP in a register.
  const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
  unsigned LabelReg = MRI.createVirtualRegister(PtrRC);
  unsigned BufReg = MI->getOperand(1).getReg();

  if (Subtarget.isPPC64() && Subtarget.isSVR4ABI()) {
    MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::STD))
            .addReg(PPC::X2)
            .addImm(TOCOffset)
            .addReg(BufReg);
    MIB.setMemRefs(MMOBegin, MMOEnd);
  }

  // Naked functions never have a base pointer, and so we use r1. For all
  // other functions, this decision must be delayed until during PEI.
  unsigned BaseReg;
  if (MF->getFunction()->getAttributes().hasAttribute(
          AttributeSet::FunctionIndex, Attribute::Naked))
    BaseReg = Subtarget.isPPC64() ? PPC::X1 : PPC::R1;
  else
    BaseReg = Subtarget.isPPC64() ? PPC::BP8 : PPC::BP;

  MIB = BuildMI(*thisMBB, MI, DL,
                TII->get(Subtarget.isPPC64() ? PPC::STD : PPC::STW))
          .addReg(BaseReg)
          .addImm(BPOffset)
          .addReg(BufReg);
  MIB.setMemRefs(MMOBegin, MMOEnd);

  // Setup
  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::BCLalways)).addMBB(mainMBB);
  const PPCRegisterInfo *TRI =
    static_cast<const PPCRegisterInfo*>(getTargetMachine().getRegisterInfo());
  MIB.addRegMask(TRI->getNoPreservedMask());

  BuildMI(*thisMBB, MI, DL, TII->get(PPC::LI), restoreDstReg).addImm(1);

  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::EH_SjLj_Setup))
          .addMBB(mainMBB);
  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PPC::B)).addMBB(sinkMBB);

  thisMBB->addSuccessor(mainMBB, /* weight */ 0);
  thisMBB->addSuccessor(sinkMBB, /* weight */ 1);

  // mainMBB:
  //  mainDstReg = 0
  MIB = BuildMI(mainMBB, DL,
    TII->get(Subtarget.isPPC64() ? PPC::MFLR8 : PPC::MFLR), LabelReg);

  // Store IP
  if (Subtarget.isPPC64()) {
    MIB = BuildMI(mainMBB, DL, TII->get(PPC::STD))
            .addReg(LabelReg)
            .addImm(LabelOffset)
            .addReg(BufReg);
  } else {
    MIB = BuildMI(mainMBB, DL, TII->get(PPC::STW))
            .addReg(LabelReg)
            .addImm(LabelOffset)
            .addReg(BufReg);
  }

  MIB.setMemRefs(MMOBegin, MMOEnd);

  BuildMI(mainMBB, DL, TII->get(PPC::LI), mainDstReg).addImm(0);
  mainMBB->addSuccessor(sinkMBB);

  // sinkMBB:
  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
          TII->get(PPC::PHI), DstReg)
    .addReg(mainDstReg).addMBB(mainMBB)
    .addReg(restoreDstReg).addMBB(thisMBB);

  MI->eraseFromParent();
  return sinkMBB;
}

MachineBasicBlock *
PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr *MI,
                                     MachineBasicBlock *MBB) const {
  DebugLoc DL = MI->getDebugLoc();
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();

  MachineFunction *MF = MBB->getParent();
  MachineRegisterInfo &MRI = MF->getRegInfo();

  // Memory Reference
  MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
  MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();

  MVT PVT = getPointerTy();
  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
         "Invalid Pointer Size!");

  const TargetRegisterClass *RC =
    (PVT == MVT::i64) ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
  unsigned Tmp = MRI.createVirtualRegister(RC);
  // Since FP is only updated here but NOT referenced, it's treated as GPR.
  unsigned FP = (PVT == MVT::i64) ? PPC::X31 : PPC::R31;
  unsigned SP = (PVT == MVT::i64) ? PPC::X1 : PPC::R1;
  unsigned BP = (PVT == MVT::i64) ? PPC::X30 : PPC::R30;

  MachineInstrBuilder MIB;

  const int64_t LabelOffset = 1 * PVT.getStoreSize();
  const int64_t SPOffset    = 2 * PVT.getStoreSize();
  const int64_t TOCOffset   = 3 * PVT.getStoreSize();
  const int64_t BPOffset    = 4 * PVT.getStoreSize();

  unsigned BufReg = MI->getOperand(0).getReg();

  // Reload FP (the jumped-to function may not have had a
  // frame pointer, and if so, then its r31 will be restored
  // as necessary).
  if (PVT == MVT::i64) {
    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), FP)
            .addImm(0)
            .addReg(BufReg);
  } else {
    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), FP)
            .addImm(0)
            .addReg(BufReg);
  }
  MIB.setMemRefs(MMOBegin, MMOEnd);

  // Reload IP
  if (PVT == MVT::i64) {
    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), Tmp)
            .addImm(LabelOffset)
            .addReg(BufReg);
  } else {
    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), Tmp)
            .addImm(LabelOffset)
            .addReg(BufReg);
  }
  MIB.setMemRefs(MMOBegin, MMOEnd);

  // Reload SP
  if (PVT == MVT::i64) {
    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), SP)
            .addImm(SPOffset)
            .addReg(BufReg);
  } else {
    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), SP)
            .addImm(SPOffset)
            .addReg(BufReg);
  }
  MIB.setMemRefs(MMOBegin, MMOEnd);

  // Reload BP
  if (PVT == MVT::i64) {
    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), BP)
            .addImm(BPOffset)
            .addReg(BufReg);
  } else {
    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LWZ), BP)
            .addImm(BPOffset)
            .addReg(BufReg);
  }
  MIB.setMemRefs(MMOBegin, MMOEnd);

  // Reload TOC
  if (PVT == MVT::i64 && Subtarget.isSVR4ABI()) {
    MIB = BuildMI(*MBB, MI, DL, TII->get(PPC::LD), PPC::X2)
            .addImm(TOCOffset)
            .addReg(BufReg);

    MIB.setMemRefs(MMOBegin, MMOEnd);
  }

  // Jump
  BuildMI(*MBB, MI, DL,
          TII->get(PVT == MVT::i64 ? PPC::MTCTR8 : PPC::MTCTR)).addReg(Tmp);
  BuildMI(*MBB, MI, DL, TII->get(PVT == MVT::i64 ? PPC::BCTR8 : PPC::BCTR));

  MI->eraseFromParent();
  return MBB;
}

MachineBasicBlock *
PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
                                               MachineBasicBlock *BB) const {
  if (MI->getOpcode() == PPC::EH_SjLj_SetJmp32 ||
      MI->getOpcode() == PPC::EH_SjLj_SetJmp64) {
    return emitEHSjLjSetJmp(MI, BB);
  } else if (MI->getOpcode() == PPC::EH_SjLj_LongJmp32 ||
             MI->getOpcode() == PPC::EH_SjLj_LongJmp64) {
    return emitEHSjLjLongJmp(MI, BB);
  }

  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();

  // To "insert" these instructions we actually have to insert their
  // control-flow patterns.
  const BasicBlock *LLVM_BB = BB->getBasicBlock();
  MachineFunction::iterator It = BB;
  ++It;

  MachineFunction *F = BB->getParent();

  if (Subtarget.hasISEL() && (MI->getOpcode() == PPC::SELECT_CC_I4 ||
                              MI->getOpcode() == PPC::SELECT_CC_I8 ||
                              MI->getOpcode() == PPC::SELECT_I4 ||
                              MI->getOpcode() == PPC::SELECT_I8)) {
    SmallVector<MachineOperand, 2> Cond;
    if (MI->getOpcode() == PPC::SELECT_CC_I4 ||
        MI->getOpcode() == PPC::SELECT_CC_I8)
      Cond.push_back(MI->getOperand(4));
    else
      Cond.push_back(MachineOperand::CreateImm(PPC::PRED_BIT_SET));
    Cond.push_back(MI->getOperand(1));

    DebugLoc dl = MI->getDebugLoc();
    TII->insertSelect(*BB, MI, dl, MI->getOperand(0).getReg(),
                      Cond, MI->getOperand(2).getReg(),
                      MI->getOperand(3).getReg());
  } else if (MI->getOpcode() == PPC::SELECT_CC_I4 ||
             MI->getOpcode() == PPC::SELECT_CC_I8 ||
             MI->getOpcode() == PPC::SELECT_CC_F4 ||
             MI->getOpcode() == PPC::SELECT_CC_F8 ||
             MI->getOpcode() == PPC::SELECT_CC_VRRC ||
             MI->getOpcode() == PPC::SELECT_I4 ||
             MI->getOpcode() == PPC::SELECT_I8 ||
             MI->getOpcode() == PPC::SELECT_F4 ||
             MI->getOpcode() == PPC::SELECT_F8 ||
             MI->getOpcode() == PPC::SELECT_VRRC) {
    // The incoming instruction knows the destination vreg to set, the
    // condition code register to branch on, the true/false values to
    // select between, and a branch opcode to use.

    //  thisMBB:
    //  ...
    //   TrueVal = ...
    //   cmpTY ccX, r1, r2
    //   bCC copy1MBB
    //   fallthrough --> copy0MBB
    MachineBasicBlock *thisMBB = BB;
    MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
    MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
    DebugLoc dl = MI->getDebugLoc();
    F->insert(It, copy0MBB);
    F->insert(It, sinkMBB);

    // Transfer the remainder of BB and its successor edges to sinkMBB.
    sinkMBB->splice(sinkMBB->begin(), BB,
                    std::next(MachineBasicBlock::iterator(MI)), BB->end());
    sinkMBB->transferSuccessorsAndUpdatePHIs(BB);

    // Next, add the true and fallthrough blocks as its successors.
    BB->addSuccessor(copy0MBB);
    BB->addSuccessor(sinkMBB);

    if (MI->getOpcode() == PPC::SELECT_I4 ||
        MI->getOpcode() == PPC::SELECT_I8 ||
        MI->getOpcode() == PPC::SELECT_F4 ||
        MI->getOpcode() == PPC::SELECT_F8 ||
        MI->getOpcode() == PPC::SELECT_VRRC) {
      BuildMI(BB, dl, TII->get(PPC::BC))
        .addReg(MI->getOperand(1).getReg()).addMBB(sinkMBB);
    } else {
      unsigned SelectPred = MI->getOperand(4).getImm();
      BuildMI(BB, dl, TII->get(PPC::BCC))
        .addImm(SelectPred).addReg(MI->getOperand(1).getReg()).addMBB(sinkMBB);
    }

    //  copy0MBB:
    //   %FalseValue = ...
    //   # fallthrough to sinkMBB
    BB = copy0MBB;

    // Update machine-CFG edges
    BB->addSuccessor(sinkMBB);

    //  sinkMBB:
    //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
    //  ...
    BB = sinkMBB;
    BuildMI(*BB, BB->begin(), dl,
            TII->get(PPC::PHI), MI->getOperand(0).getReg())
      .addReg(MI->getOperand(3).getReg()).addMBB(copy0MBB)
      .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
  }
  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_ADD_I8)
    BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::ADD4);
  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_ADD_I16)
    BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::ADD4);
  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_ADD_I32)
    BB = EmitAtomicBinary(MI, BB, false, PPC::ADD4);
  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_ADD_I64)
    BB = EmitAtomicBinary(MI, BB, true, PPC::ADD8);

  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_AND_I8)
    BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::AND);
  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_AND_I16)
    BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::AND);
  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_AND_I32)
    BB = EmitAtomicBinary(MI, BB, false, PPC::AND);
  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_AND_I64)
    BB = EmitAtomicBinary(MI, BB, true, PPC::AND8);

  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_OR_I8)
    BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::OR);
  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_OR_I16)
    BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::OR);
  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_OR_I32)
    BB = EmitAtomicBinary(MI, BB, false, PPC::OR);
  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_OR_I64)
    BB = EmitAtomicBinary(MI, BB, true, PPC::OR8);

  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_XOR_I8)
    BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::XOR);
  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_XOR_I16)
    BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::XOR);
  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_XOR_I32)
    BB = EmitAtomicBinary(MI, BB, false, PPC::XOR);
  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_XOR_I64)
    BB = EmitAtomicBinary(MI, BB, true, PPC::XOR8);

  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I8)
    BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::NAND);
  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I16)
    BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::NAND);
  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I32)
    BB = EmitAtomicBinary(MI, BB, false, PPC::NAND);
  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I64)
    BB = EmitAtomicBinary(MI, BB, true, PPC::NAND8);

  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_SUB_I8)
    BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::SUBF);
  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_SUB_I16)
    BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::SUBF);
  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_SUB_I32)
    BB = EmitAtomicBinary(MI, BB, false, PPC::SUBF);
  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_SUB_I64)
    BB = EmitAtomicBinary(MI, BB, true, PPC::SUBF8);

  else if (MI->getOpcode() == PPC::ATOMIC_SWAP_I8)
    BB = EmitPartwordAtomicBinary(MI, BB, true, 0);
  else if (MI->getOpcode() == PPC::ATOMIC_SWAP_I16)
    BB = EmitPartwordAtomicBinary(MI, BB, false, 0);
  else if (MI->getOpcode() == PPC::ATOMIC_SWAP_I32)
    BB = EmitAtomicBinary(MI, BB, false, 0);
  else if (MI->getOpcode() == PPC::ATOMIC_SWAP_I64)
    BB = EmitAtomicBinary(MI, BB, true, 0);

  else if (MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I32 ||
           MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I64) {
    bool is64bit = MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I64;

    unsigned dest = MI->getOperand(0).getReg();
    unsigned ptrA = MI->getOperand(1).getReg();
    unsigned ptrB = MI->getOperand(2).getReg();
    unsigned oldval = MI->getOperand(3).getReg();
    unsigned newval = MI->getOperand(4).getReg();
    DebugLoc dl = MI->getDebugLoc();

    MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB);
    MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(LLVM_BB);
    MachineBasicBlock *midMBB = F->CreateMachineBasicBlock(LLVM_BB);
    MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
    F->insert(It, loop1MBB);
    F->insert(It, loop2MBB);
    F->insert(It, midMBB);
    F->insert(It, exitMBB);
    exitMBB->splice(exitMBB->begin(), BB,
                    std::next(MachineBasicBlock::iterator(MI)), BB->end());
    exitMBB->transferSuccessorsAndUpdatePHIs(BB);

    //  thisMBB:
    //   ...
    //   fallthrough --> loopMBB
    BB->addSuccessor(loop1MBB);

    // loop1MBB:
    //   l[wd]arx dest, ptr
    //   cmp[wd] dest, oldval
    //   bne- midMBB
    // loop2MBB:
    //   st[wd]cx. newval, ptr
    //   bne- loopMBB
    //   b exitBB
    // midMBB:
    //   st[wd]cx. dest, ptr
    // exitBB:
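    // (The st[wd]cx. in midMBB stores back the value just loaded; its
    // purpose is, presumably, only to cancel the reservation taken by the
    // l[wd]arx, since a conditional store to the reserved address clears
    // the reservation whether or not it succeeds.)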
    BB = loop1MBB;
    BuildMI(BB, dl, TII->get(is64bit ? PPC::LDARX : PPC::LWARX), dest)
      .addReg(ptrA).addReg(ptrB);
    BuildMI(BB, dl, TII->get(is64bit ? PPC::CMPD : PPC::CMPW), PPC::CR0)
      .addReg(oldval).addReg(dest);
    BuildMI(BB, dl, TII->get(PPC::BCC))
      .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(midMBB);
    BB->addSuccessor(loop2MBB);
    BB->addSuccessor(midMBB);

    BB = loop2MBB;
    BuildMI(BB, dl, TII->get(is64bit ? PPC::STDCX : PPC::STWCX))
      .addReg(newval).addReg(ptrA).addReg(ptrB);
    BuildMI(BB, dl, TII->get(PPC::BCC))
      .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loop1MBB);
    BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB);
    BB->addSuccessor(loop1MBB);
    BB->addSuccessor(exitMBB);

    BB = midMBB;
    BuildMI(BB, dl, TII->get(is64bit ? PPC::STDCX : PPC::STWCX))
      .addReg(dest).addReg(ptrA).addReg(ptrB);
    BB->addSuccessor(exitMBB);

    //  exitMBB:
    //   ...
    BB = exitMBB;
  } else if (MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I8 ||
             MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I16) {
    // We must use 64-bit registers for addresses when targeting 64-bit,
    // since we're actually doing arithmetic on them.  Other registers
    // can be 32-bit.
    bool is64bit = Subtarget.isPPC64();
    bool is8bit = MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I8;

    unsigned dest = MI->getOperand(0).getReg();
    unsigned ptrA = MI->getOperand(1).getReg();
    unsigned ptrB = MI->getOperand(2).getReg();
    unsigned oldval = MI->getOperand(3).getReg();
    unsigned newval = MI->getOperand(4).getReg();
    DebugLoc dl = MI->getDebugLoc();

    MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB);
    MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(LLVM_BB);
    MachineBasicBlock *midMBB = F->CreateMachineBasicBlock(LLVM_BB);
    MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
    F->insert(It, loop1MBB);
    F->insert(It, loop2MBB);
    F->insert(It, midMBB);
    F->insert(It, exitMBB);
    exitMBB->splice(exitMBB->begin(), BB,
                    std::next(MachineBasicBlock::iterator(MI)), BB->end());
    exitMBB->transferSuccessorsAndUpdatePHIs(BB);

    MachineRegisterInfo &RegInfo = F->getRegInfo();
    const TargetRegisterClass *RC =
      is64bit ? (const TargetRegisterClass *) &PPC::G8RCRegClass :
                (const TargetRegisterClass *) &PPC::GPRCRegClass;
    unsigned PtrReg = RegInfo.createVirtualRegister(RC);
    unsigned Shift1Reg = RegInfo.createVirtualRegister(RC);
    unsigned ShiftReg = RegInfo.createVirtualRegister(RC);
    unsigned NewVal2Reg = RegInfo.createVirtualRegister(RC);
    unsigned NewVal3Reg = RegInfo.createVirtualRegister(RC);
    unsigned OldVal2Reg = RegInfo.createVirtualRegister(RC);
    unsigned OldVal3Reg = RegInfo.createVirtualRegister(RC);
    unsigned MaskReg = RegInfo.createVirtualRegister(RC);
    unsigned Mask2Reg = RegInfo.createVirtualRegister(RC);
    unsigned Mask3Reg = RegInfo.createVirtualRegister(RC);
    unsigned Tmp2Reg = RegInfo.createVirtualRegister(RC);
    unsigned Tmp4Reg = RegInfo.createVirtualRegister(RC);
    unsigned TmpDestReg = RegInfo.createVirtualRegister(RC);
    unsigned Ptr1Reg;
    unsigned TmpReg = RegInfo.createVirtualRegister(RC);
    unsigned ZeroReg = is64bit ? PPC::ZERO8 : PPC::ZERO;
    //  thisMBB:
    //   ...
    //   fallthrough --> loopMBB
    BB->addSuccessor(loop1MBB);

    // The 4-byte load must be aligned, while a char or short may be
    // anywhere in the word.  Hence all this nasty bookkeeping code.
    //   add ptr1, ptrA, ptrB [copy if ptrA==0]
    //   rlwinm shift1, ptr1, 3, 27, 28 [3, 27, 27]
    //   xori shift, shift1, 24 [16]
    //   rlwinm ptr, ptr1, 0, 0, 29
    //   slw newval2, newval, shift
    //   slw oldval2, oldval, shift
    //   li mask2, 255 [li mask3, 0; ori mask2, mask3, 65535]
    //   slw mask, mask2, shift
    //   and newval3, newval2, mask
    //   and oldval3, oldval2, mask
    // loop1MBB:
    //   lwarx tmpDest, ptr
    //   and tmp, tmpDest, mask
    //   cmpw tmp, oldval3
    //   bne- midMBB
    // loop2MBB:
    //   andc tmp2, tmpDest, mask
    //   or tmp4, tmp2, newval3
    //   stwcx. tmp4, ptr
    //   bne- loop1MBB
    //   b exitBB
    // midMBB:
    //   stwcx. tmpDest, ptr
    // exitBB:
    //   srw dest, tmpDest, shift
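    // (On exit, dest is actually computed from the masked copy of the
    // loaded word, so the old partword value comes back zero-extended.)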
    if (ptrA != ZeroReg) {
      Ptr1Reg = RegInfo.createVirtualRegister(RC);
      BuildMI(BB, dl, TII->get(is64bit ? PPC::ADD8 : PPC::ADD4), Ptr1Reg)
        .addReg(ptrA).addReg(ptrB);
    } else {
      Ptr1Reg = ptrB;
    }
    BuildMI(BB, dl, TII->get(PPC::RLWINM), Shift1Reg).addReg(Ptr1Reg)
        .addImm(3).addImm(27).addImm(is8bit ? 28 : 27);
    BuildMI(BB, dl, TII->get(is64bit ? PPC::XORI8 : PPC::XORI), ShiftReg)
        .addReg(Shift1Reg).addImm(is8bit ? 24 : 16);
    if (is64bit)
      BuildMI(BB, dl, TII->get(PPC::RLDICR), PtrReg)
        .addReg(Ptr1Reg).addImm(0).addImm(61);
    else
      BuildMI(BB, dl, TII->get(PPC::RLWINM), PtrReg)
        .addReg(Ptr1Reg).addImm(0).addImm(0).addImm(29);
    BuildMI(BB, dl, TII->get(PPC::SLW), NewVal2Reg)
        .addReg(newval).addReg(ShiftReg);
    BuildMI(BB, dl, TII->get(PPC::SLW), OldVal2Reg)
        .addReg(oldval).addReg(ShiftReg);
    if (is8bit)
      BuildMI(BB, dl, TII->get(PPC::LI), Mask2Reg).addImm(255);
    else {
      BuildMI(BB, dl, TII->get(PPC::LI), Mask3Reg).addImm(0);
      BuildMI(BB, dl, TII->get(PPC::ORI), Mask2Reg)
        .addReg(Mask3Reg).addImm(65535);
    }
    BuildMI(BB, dl, TII->get(PPC::SLW), MaskReg)
        .addReg(Mask2Reg).addReg(ShiftReg);
    BuildMI(BB, dl, TII->get(PPC::AND), NewVal3Reg)
        .addReg(NewVal2Reg).addReg(MaskReg);
    BuildMI(BB, dl, TII->get(PPC::AND), OldVal3Reg)
        .addReg(OldVal2Reg).addReg(MaskReg);

    BB = loop1MBB;
    BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg)
      .addReg(ZeroReg).addReg(PtrReg);
    BuildMI(BB, dl, TII->get(PPC::AND), TmpReg)
      .addReg(TmpDestReg).addReg(MaskReg);
    BuildMI(BB, dl, TII->get(PPC::CMPW), PPC::CR0)
      .addReg(TmpReg).addReg(OldVal3Reg);
    BuildMI(BB, dl, TII->get(PPC::BCC))
      .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(midMBB);
    BB->addSuccessor(loop2MBB);
    BB->addSuccessor(midMBB);

    BB = loop2MBB;
    BuildMI(BB, dl, TII->get(PPC::ANDC), Tmp2Reg)
      .addReg(TmpDestReg).addReg(MaskReg);
    BuildMI(BB, dl, TII->get(PPC::OR), Tmp4Reg)
      .addReg(Tmp2Reg).addReg(NewVal3Reg);
    BuildMI(BB, dl, TII->get(PPC::STWCX)).addReg(Tmp4Reg)
      .addReg(ZeroReg).addReg(PtrReg);
    BuildMI(BB, dl, TII->get(PPC::BCC))
      .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loop1MBB);
    BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB);
    BB->addSuccessor(loop1MBB);
    BB->addSuccessor(exitMBB);

    BB = midMBB;
    BuildMI(BB, dl, TII->get(PPC::STWCX)).addReg(TmpDestReg)
      .addReg(ZeroReg).addReg(PtrReg);
    BB->addSuccessor(exitMBB);

    //  exitMBB:
    //   ...
    BB = exitMBB;
    BuildMI(*BB, BB->begin(), dl, TII->get(PPC::SRW), dest).addReg(TmpReg)
      .addReg(ShiftReg);
  } else if (MI->getOpcode() == PPC::FADDrtz) {
    // This pseudo performs an FADD with rounding mode temporarily forced
    // to round-to-zero.  We emit this via custom inserter since the FPSCR
    // is not modeled at the SelectionDAG level.
    unsigned Dest = MI->getOperand(0).getReg();
    unsigned Src1 = MI->getOperand(1).getReg();
    unsigned Src2 = MI->getOperand(2).getReg();
    DebugLoc dl = MI->getDebugLoc();

    MachineRegisterInfo &RegInfo = F->getRegInfo();
    unsigned MFFSReg = RegInfo.createVirtualRegister(&PPC::F8RCRegClass);

    // Save FPSCR value.
    BuildMI(*BB, MI, dl, TII->get(PPC::MFFS), MFFSReg);

    // Set rounding mode to round-to-zero.
    BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB1)).addImm(31);
    BuildMI(*BB, MI, dl, TII->get(PPC::MTFSB0)).addImm(30);
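    // (FPSCR bits 30-31 form the RN rounding-control field; RN = 0b01
    // selects round toward zero, hence setting bit 31 and clearing bit 30.)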

    // Perform addition.
    BuildMI(*BB, MI, dl, TII->get(PPC::FADD), Dest).addReg(Src1).addReg(Src2);

    // Restore FPSCR value.
    BuildMI(*BB, MI, dl, TII->get(PPC::MTFSF)).addImm(1).addReg(MFFSReg);
  } else if (MI->getOpcode() == PPC::ANDIo_1_EQ_BIT ||
             MI->getOpcode() == PPC::ANDIo_1_GT_BIT ||
             MI->getOpcode() == PPC::ANDIo_1_EQ_BIT8 ||
             MI->getOpcode() == PPC::ANDIo_1_GT_BIT8) {
    unsigned Opcode = (MI->getOpcode() == PPC::ANDIo_1_EQ_BIT8 ||
                       MI->getOpcode() == PPC::ANDIo_1_GT_BIT8) ?
                      PPC::ANDIo8 : PPC::ANDIo;
    bool isEQ = (MI->getOpcode() == PPC::ANDIo_1_EQ_BIT ||
                 MI->getOpcode() == PPC::ANDIo_1_EQ_BIT8);
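    // andi. compares (src & 1) against zero in CR0, so CR0[EQ] is set
    // exactly when the tested bit is 0 and CR0[GT] exactly when it is 1;
    // copying the appropriate CR bit below yields the desired i1 value.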

    MachineRegisterInfo &RegInfo = F->getRegInfo();
    unsigned Dest = RegInfo.createVirtualRegister(Opcode == PPC::ANDIo ?
                                                  &PPC::GPRCRegClass :
                                                  &PPC::G8RCRegClass);

    DebugLoc dl = MI->getDebugLoc();
    BuildMI(*BB, MI, dl, TII->get(Opcode), Dest)
      .addReg(MI->getOperand(1).getReg()).addImm(1);
    BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY),
            MI->getOperand(0).getReg())
      .addReg(isEQ ? PPC::CR0EQ : PPC::CR0GT);
  } else {
    llvm_unreachable("Unexpected instr type to insert");
  }

  MI->eraseFromParent();   // The pseudo instruction is gone now.
  return BB;
}

//===----------------------------------------------------------------------===//
// Target Optimization Hooks
//===----------------------------------------------------------------------===//

SDValue PPCTargetLowering::DAGCombineFastRecip(SDValue Op,
                                               DAGCombinerInfo &DCI) const {
  if (DCI.isAfterLegalizeVectorOps())
    return SDValue();

  EVT VT = Op.getValueType();

  if ((VT == MVT::f32 && Subtarget.hasFRES()) ||
      (VT == MVT::f64 && Subtarget.hasFRE()) ||
      (VT == MVT::v4f32 && Subtarget.hasAltivec()) ||
      (VT == MVT::v2f64 && Subtarget.hasVSX())) {

    // Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
    // For the reciprocal, we need to find the zero of the function:
    //   F(X) = A X - 1 [which has a zero at X = 1/A]
    //     =>
    //   X_{i+1} = X_i (2 - A X_i) = X_i + X_i (1 - A X_i) [this second form
    //     does not require additional intermediate precision]

    // Convergence is quadratic, so we essentially double the number of digits
    // correct after every iteration. The minimum architected relative
    // accuracy is 2^-5. When hasRecipPrec(), this is 2^-14. IEEE float has
    // 23 digits and double has 52 digits.
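    // For example, with A = 3 and initial estimate X_0 = 0.3:
    //   X_1 = 0.3    * (2 - 3 * 0.3)    = 0.33
    //   X_2 = 0.33   * (2 - 3 * 0.33)   = 0.3333
    //   X_3 = 0.3333 * (2 - 3 * 0.3333) = 0.33333333
    // i.e., the number of correct digits roughly doubles each step.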
|
2014-06-12 22:38:18 +00:00
|
|
|
int Iterations = Subtarget.hasRecipPrec() ? 1 : 3;
|
2013-04-03 17:44:56 +00:00
|
|
|
if (VT.getScalarType() == MVT::f64)
|
2013-04-03 04:01:11 +00:00
|
|
|
++Iterations;
|
|
|
|
|
|
|
|
SelectionDAG &DAG = DCI.DAG;
|
2013-05-25 02:42:55 +00:00
|
|
|
SDLoc dl(Op);
|
2013-04-03 04:01:11 +00:00
|
|
|
|
|
|
|
SDValue FPOne =
|
2013-04-03 17:44:56 +00:00
|
|
|
DAG.getConstantFP(1.0, VT.getScalarType());
|
|
|
|
if (VT.isVector()) {
|
|
|
|
assert(VT.getVectorNumElements() == 4 &&
|
2013-04-03 04:01:11 +00:00
|
|
|
"Unknown vector type");
|
2013-04-03 17:44:56 +00:00
|
|
|
FPOne = DAG.getNode(ISD::BUILD_VECTOR, dl, VT,
|
2013-04-03 04:01:11 +00:00
|
|
|
FPOne, FPOne, FPOne, FPOne);
|
|
|
|
}
|
|
|
|
|
2013-04-03 17:44:56 +00:00
|
|
|
SDValue Est = DAG.getNode(PPCISD::FRE, dl, VT, Op);
|
2013-04-03 04:01:11 +00:00
|
|
|
DCI.AddToWorklist(Est.getNode());
|
|
|
|
|
|
|
|
// Newton iterations: Est = Est + Est (1 - Arg * Est)
|
|
|
|
for (int i = 0; i < Iterations; ++i) {
|
2013-04-03 17:44:56 +00:00
|
|
|
SDValue NewEst = DAG.getNode(ISD::FMUL, dl, VT, Op, Est);
|
2013-04-03 04:01:11 +00:00
|
|
|
DCI.AddToWorklist(NewEst.getNode());
|
|
|
|
|
2013-04-03 17:44:56 +00:00
|
|
|
NewEst = DAG.getNode(ISD::FSUB, dl, VT, FPOne, NewEst);
|
2013-04-03 04:01:11 +00:00
|
|
|
DCI.AddToWorklist(NewEst.getNode());
|
|
|
|
|
2013-04-03 17:44:56 +00:00
|
|
|
NewEst = DAG.getNode(ISD::FMUL, dl, VT, Est, NewEst);
|
2013-04-03 04:01:11 +00:00
|
|
|
DCI.AddToWorklist(NewEst.getNode());
|
|
|
|
|
2013-04-03 17:44:56 +00:00
|
|
|
Est = DAG.getNode(ISD::FADD, dl, VT, Est, NewEst);
|
2013-04-03 04:01:11 +00:00
|
|
|
DCI.AddToWorklist(Est.getNode());
|
|
|
|
}
|
|
|
|
|
|
|
|
return Est;
|
|
|
|
}
|
|
|
|
|
|
|
|
return SDValue();
|
|
|
|
}
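
// A minimal scalar sketch of the refinement loop built above (illustrative
// only; the helper name is hypothetical). With a 2^-14-accurate seed
// (hasRecipPrec()), this matches the Iterations logic above: one step for
// f32, two for f64.
//
//   static double refineRecip(double A, double Est, int Iterations) {
//     for (int i = 0; i < Iterations; ++i)
//       Est = Est + Est * (1.0 - A * Est); // == Est * (2 - A * Est)
//     return Est;
//   }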

SDValue PPCTargetLowering::DAGCombineFastRecipFSQRT(SDValue Op,
                                                    DAGCombinerInfo &DCI) const {
  if (DCI.isAfterLegalizeVectorOps())
    return SDValue();

  EVT VT = Op.getValueType();

  if ((VT == MVT::f32 && Subtarget.hasFRSQRTES()) ||
      (VT == MVT::f64 && Subtarget.hasFRSQRTE()) ||
      (VT == MVT::v4f32 && Subtarget.hasAltivec()) ||
      (VT == MVT::v2f64 && Subtarget.hasVSX())) {

    // Newton iteration for a function: F(X) is X_{i+1} = X_i - F(X_i)/F'(X_i)
    // For the reciprocal sqrt, we need to find the zero of the function:
    //   F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)]
    //     =>
    //   X_{i+1} = X_i (1.5 - A X_i^2 / 2)
    // As a result, we precompute A/2 prior to the iteration loop.

    // Convergence is quadratic, so we essentially double the number of digits
    // correct after every iteration. The minimum architected relative
    // accuracy is 2^-5. When hasRecipPrec(), this is 2^-14. IEEE float has
    // 23 digits and double has 52 digits.
    int Iterations = Subtarget.hasRecipPrec() ? 1 : 3;
    if (VT.getScalarType() == MVT::f64)
      ++Iterations;

    SelectionDAG &DAG = DCI.DAG;
    SDLoc dl(Op);

    SDValue FPThreeHalves =
      DAG.getConstantFP(1.5, VT.getScalarType());
    if (VT.isVector()) {
      assert(VT.getVectorNumElements() == 4 &&
             "Unknown vector type");
      FPThreeHalves = DAG.getNode(ISD::BUILD_VECTOR, dl, VT,
                                  FPThreeHalves, FPThreeHalves,
                                  FPThreeHalves, FPThreeHalves);
    }

    SDValue Est = DAG.getNode(PPCISD::FRSQRTE, dl, VT, Op);
    DCI.AddToWorklist(Est.getNode());

    // We now need 0.5*Arg which we can write as (1.5*Arg - Arg) so that
    // this entire sequence requires only one FP constant.
    SDValue HalfArg = DAG.getNode(ISD::FMUL, dl, VT, FPThreeHalves, Op);
    DCI.AddToWorklist(HalfArg.getNode());

    HalfArg = DAG.getNode(ISD::FSUB, dl, VT, HalfArg, Op);
    DCI.AddToWorklist(HalfArg.getNode());

    // Newton iterations: Est = Est * (1.5 - HalfArg * Est * Est)
    for (int i = 0; i < Iterations; ++i) {
      SDValue NewEst = DAG.getNode(ISD::FMUL, dl, VT, Est, Est);
      DCI.AddToWorklist(NewEst.getNode());

      NewEst = DAG.getNode(ISD::FMUL, dl, VT, HalfArg, NewEst);
      DCI.AddToWorklist(NewEst.getNode());

      NewEst = DAG.getNode(ISD::FSUB, dl, VT, FPThreeHalves, NewEst);
      DCI.AddToWorklist(NewEst.getNode());

      Est = DAG.getNode(ISD::FMUL, dl, VT, Est, NewEst);
      DCI.AddToWorklist(Est.getNode());
    }

    return Est;
  }

  return SDValue();
}
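
// A minimal scalar sketch of the refinement loop built above (illustrative
// only; the helper name is hypothetical). Note how 1.5*A - A == A/2 lets the
// whole sequence reuse the single 1.5 constant:
//
//   static double refineRecipSqrt(double A, double Est, int Iterations) {
//     double HalfArg = 1.5 * A - A; // == A / 2
//     for (int i = 0; i < Iterations; ++i)
//       Est = Est * (1.5 - HalfArg * Est * Est);
//     return Est;
//   }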

// Like SelectionDAG::isConsecutiveLoad, but also works for stores, and does
// not enforce equality of the chain operands.
static bool isConsecutiveLS(LSBaseSDNode *LS, LSBaseSDNode *Base,
                            unsigned Bytes, int Dist,
                            SelectionDAG &DAG) {
  EVT VT = LS->getMemoryVT();
  if (VT.getSizeInBits() / 8 != Bytes)
    return false;

  SDValue Loc = LS->getBasePtr();
  SDValue BaseLoc = Base->getBasePtr();
  if (Loc.getOpcode() == ISD::FrameIndex) {
    if (BaseLoc.getOpcode() != ISD::FrameIndex)
      return false;
    const MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
    int FI  = cast<FrameIndexSDNode>(Loc)->getIndex();
    int BFI = cast<FrameIndexSDNode>(BaseLoc)->getIndex();
    int FS  = MFI->getObjectSize(FI);
    int BFS = MFI->getObjectSize(BFI);
    if (FS != BFS || FS != (int)Bytes) return false;
    return MFI->getObjectOffset(FI) == (MFI->getObjectOffset(BFI) + Dist*Bytes);
  }

  // Handle X+C
  if (DAG.isBaseWithConstantOffset(Loc) && Loc.getOperand(0) == BaseLoc &&
      cast<ConstantSDNode>(Loc.getOperand(1))->getSExtValue() == Dist*Bytes)
    return true;

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  const GlobalValue *GV1 = nullptr;
  const GlobalValue *GV2 = nullptr;
  int64_t Offset1 = 0;
  int64_t Offset2 = 0;
  bool isGA1 = TLI.isGAPlusOffset(Loc.getNode(), GV1, Offset1);
  bool isGA2 = TLI.isGAPlusOffset(BaseLoc.getNode(), GV2, Offset2);
  if (isGA1 && isGA2 && GV1 == GV2)
    return Offset1 == (Offset2 + Dist*Bytes);

  return false;
}
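
// For example (hypothetical nodes): given two 16-byte loads LD1 and LD2,
// isConsecutiveLS(LD2, LD1, 16, 1, DAG) returns true exactly when LD2's
// address is LD1's address plus 16; Dist == -1 would instead test for the
// 16 bytes immediately before LD1.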

// Return true if there is a nearby consecutive load to the one provided
// (regardless of alignment). We search up and down the chain, looking through
// token factors and other loads (but nothing else). As a result, a true
// result indicates that it is safe to create a new consecutive load adjacent
// to the load provided.
static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) {
  SDValue Chain = LD->getChain();
  EVT VT = LD->getMemoryVT();

  SmallSet<SDNode *, 16> LoadRoots;
  SmallVector<SDNode *, 8> Queue(1, Chain.getNode());
  SmallSet<SDNode *, 16> Visited;

  // First, search up the chain, branching to follow all token-factor operands.
  // If we find a consecutive load, then we're done, otherwise, record all
  // nodes just above the top-level loads and token factors.
  while (!Queue.empty()) {
    SDNode *ChainNext = Queue.pop_back_val();
    if (!Visited.insert(ChainNext))
      continue;

    if (LoadSDNode *ChainLD = dyn_cast<LoadSDNode>(ChainNext)) {
      if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG))
        return true;

      if (!Visited.count(ChainLD->getChain().getNode()))
        Queue.push_back(ChainLD->getChain().getNode());
    } else if (ChainNext->getOpcode() == ISD::TokenFactor) {
      for (const SDUse &O : ChainNext->ops())
        if (!Visited.count(O.getNode()))
          Queue.push_back(O.getNode());
    } else
      LoadRoots.insert(ChainNext);
  }

  // Second, search down the chain, starting from the top-level nodes recorded
  // in the first phase. These top-level nodes are the nodes just above all
  // loads and token factors. Starting with their uses, recursively look
  // through all loads (just the chain uses) and token factors to find a
  // consecutive load.
  Visited.clear();
  Queue.clear();

  for (SmallSet<SDNode *, 16>::iterator I = LoadRoots.begin(),
       IE = LoadRoots.end(); I != IE; ++I) {
    Queue.push_back(*I);

    while (!Queue.empty()) {
      SDNode *LoadRoot = Queue.pop_back_val();
      if (!Visited.insert(LoadRoot))
        continue;

      if (LoadSDNode *ChainLD = dyn_cast<LoadSDNode>(LoadRoot))
        if (isConsecutiveLS(ChainLD, LD, VT.getStoreSize(), 1, DAG))
          return true;

      for (SDNode::use_iterator UI = LoadRoot->use_begin(),
           UE = LoadRoot->use_end(); UI != UE; ++UI)
        if (((isa<LoadSDNode>(*UI) &&
              cast<LoadSDNode>(*UI)->getChain().getNode() == LoadRoot) ||
             UI->getOpcode() == ISD::TokenFactor) && !Visited.count(*UI))
          Queue.push_back(*UI);
    }
  }

  return false;
}
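
// (In the unaligned Altivec load expansion, a hit from findConsecutiveLoad
// means the decremented-offset trick is skipped, so the 'extra' load of one
// expansion can later be combined with the base load of the next.)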

SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(N);

  assert(Subtarget.useCRBits() &&
         "Expecting to be tracking CR bits");
  // If we're tracking CR bits, we need to be careful that we don't have:
  //   trunc(binary-ops(zext(x), zext(y)))
  // or
  //   trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
  // such that we're unnecessarily moving things into GPRs when it would be
  // better to keep them in CR bits.

  // Note that trunc here can be an actual i1 trunc, or can be the effective
  // truncation that comes from a setcc or select_cc.
  if (N->getOpcode() == ISD::TRUNCATE &&
      N->getValueType(0) != MVT::i1)
    return SDValue();

  if (N->getOperand(0).getValueType() != MVT::i32 &&
      N->getOperand(0).getValueType() != MVT::i64)
    return SDValue();

  if (N->getOpcode() == ISD::SETCC ||
      N->getOpcode() == ISD::SELECT_CC) {
    // If we're looking at a comparison, then we need to make sure that the
    // high bits (all except for the first) don't affect the result.
    ISD::CondCode CC =
      cast<CondCodeSDNode>(N->getOperand(
        N->getOpcode() == ISD::SETCC ? 2 : 4))->get();
    unsigned OpBits = N->getOperand(0).getValueSizeInBits();

    if (ISD::isSignedIntSetCC(CC)) {
      if (DAG.ComputeNumSignBits(N->getOperand(0)) != OpBits ||
          DAG.ComputeNumSignBits(N->getOperand(1)) != OpBits)
        return SDValue();
    } else if (ISD::isUnsignedIntSetCC(CC)) {
      if (!DAG.MaskedValueIsZero(N->getOperand(0),
                                 APInt::getHighBitsSet(OpBits, OpBits-1)) ||
          !DAG.MaskedValueIsZero(N->getOperand(1),
                                 APInt::getHighBitsSet(OpBits, OpBits-1)))
        return SDValue();
    } else {
      // This is neither a signed nor an unsigned comparison, just make sure
      // that the high bits are equal.
      APInt Op1Zero, Op1One;
      APInt Op2Zero, Op2One;
      DAG.computeKnownBits(N->getOperand(0), Op1Zero, Op1One);
      DAG.computeKnownBits(N->getOperand(1), Op2Zero, Op2One);

      // We don't really care about what is known about the first bit (if
      // anything), so clear it in all masks prior to comparing them.
      Op1Zero.clearBit(0); Op1One.clearBit(0);
      Op2Zero.clearBit(0); Op2One.clearBit(0);

      if (Op1Zero != Op2Zero || Op1One != Op2One)
        return SDValue();
    }
  }

  // We now know that the higher-order bits are irrelevant, we just need to
  // make sure that all of the intermediate operations are bit operations, and
  // all inputs are extensions.
  if (N->getOperand(0).getOpcode() != ISD::AND &&
      N->getOperand(0).getOpcode() != ISD::OR  &&
      N->getOperand(0).getOpcode() != ISD::XOR &&
      N->getOperand(0).getOpcode() != ISD::SELECT &&
      N->getOperand(0).getOpcode() != ISD::SELECT_CC &&
      N->getOperand(0).getOpcode() != ISD::TRUNCATE &&
      N->getOperand(0).getOpcode() != ISD::SIGN_EXTEND &&
      N->getOperand(0).getOpcode() != ISD::ZERO_EXTEND &&
      N->getOperand(0).getOpcode() != ISD::ANY_EXTEND)
    return SDValue();

  if ((N->getOpcode() == ISD::SETCC || N->getOpcode() == ISD::SELECT_CC) &&
      N->getOperand(1).getOpcode() != ISD::AND &&
      N->getOperand(1).getOpcode() != ISD::OR  &&
      N->getOperand(1).getOpcode() != ISD::XOR &&
      N->getOperand(1).getOpcode() != ISD::SELECT &&
      N->getOperand(1).getOpcode() != ISD::SELECT_CC &&
      N->getOperand(1).getOpcode() != ISD::TRUNCATE &&
      N->getOperand(1).getOpcode() != ISD::SIGN_EXTEND &&
      N->getOperand(1).getOpcode() != ISD::ZERO_EXTEND &&
      N->getOperand(1).getOpcode() != ISD::ANY_EXTEND)
    return SDValue();

  SmallVector<SDValue, 4> Inputs;
  SmallVector<SDValue, 8> BinOps, PromOps;
  SmallPtrSet<SDNode *, 16> Visited;

  for (unsigned i = 0; i < 2; ++i) {
    if (((N->getOperand(i).getOpcode() == ISD::SIGN_EXTEND ||
          N->getOperand(i).getOpcode() == ISD::ZERO_EXTEND ||
          N->getOperand(i).getOpcode() == ISD::ANY_EXTEND) &&
          N->getOperand(i).getOperand(0).getValueType() == MVT::i1) ||
        isa<ConstantSDNode>(N->getOperand(i)))
      Inputs.push_back(N->getOperand(i));
    else
      BinOps.push_back(N->getOperand(i));

    if (N->getOpcode() == ISD::TRUNCATE)
      break;
  }

  // Visit all inputs, collect all binary operations (and, or, xor and
  // select) that are all fed by extensions.
  while (!BinOps.empty()) {
    SDValue BinOp = BinOps.back();
    BinOps.pop_back();

    if (!Visited.insert(BinOp.getNode()))
      continue;

    PromOps.push_back(BinOp);

    for (unsigned i = 0, ie = BinOp.getNumOperands(); i != ie; ++i) {
      // The condition of the select is not promoted.
      if (BinOp.getOpcode() == ISD::SELECT && i == 0)
        continue;
      if (BinOp.getOpcode() == ISD::SELECT_CC && i != 2 && i != 3)
        continue;

      if (((BinOp.getOperand(i).getOpcode() == ISD::SIGN_EXTEND ||
            BinOp.getOperand(i).getOpcode() == ISD::ZERO_EXTEND ||
            BinOp.getOperand(i).getOpcode() == ISD::ANY_EXTEND) &&
            BinOp.getOperand(i).getOperand(0).getValueType() == MVT::i1) ||
          isa<ConstantSDNode>(BinOp.getOperand(i))) {
        Inputs.push_back(BinOp.getOperand(i));
      } else if (BinOp.getOperand(i).getOpcode() == ISD::AND ||
                 BinOp.getOperand(i).getOpcode() == ISD::OR  ||
                 BinOp.getOperand(i).getOpcode() == ISD::XOR ||
                 BinOp.getOperand(i).getOpcode() == ISD::SELECT ||
                 BinOp.getOperand(i).getOpcode() == ISD::SELECT_CC ||
                 BinOp.getOperand(i).getOpcode() == ISD::TRUNCATE ||
                 BinOp.getOperand(i).getOpcode() == ISD::SIGN_EXTEND ||
                 BinOp.getOperand(i).getOpcode() == ISD::ZERO_EXTEND ||
                 BinOp.getOperand(i).getOpcode() == ISD::ANY_EXTEND) {
        BinOps.push_back(BinOp.getOperand(i));
      } else {
        // We have an input that is not an extension or another binary
        // operation; we'll abort this transformation.
        return SDValue();
      }
    }
  }

  // Make sure that this is a self-contained cluster of operations (which
  // is not quite the same thing as saying that everything has only one
  // use).
  for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
    if (isa<ConstantSDNode>(Inputs[i]))
      continue;

    for (SDNode::use_iterator UI = Inputs[i].getNode()->use_begin(),
         UE = Inputs[i].getNode()->use_end();
         UI != UE; ++UI) {
      SDNode *User = *UI;
      if (User != N && !Visited.count(User))
        return SDValue();

      // Make sure that we're not going to promote the non-output-value
      // operand(s) or SELECT or SELECT_CC.
      // FIXME: Although we could sometimes handle this, and it does occur in
      // practice that one of the condition inputs to the select is also one of
      // the outputs, we currently can't deal with this.
      if (User->getOpcode() == ISD::SELECT) {
        if (User->getOperand(0) == Inputs[i])
          return SDValue();
      } else if (User->getOpcode() == ISD::SELECT_CC) {
        if (User->getOperand(0) == Inputs[i] ||
            User->getOperand(1) == Inputs[i])
          return SDValue();
      }
    }
  }

  for (unsigned i = 0, ie = PromOps.size(); i != ie; ++i) {
    for (SDNode::use_iterator UI = PromOps[i].getNode()->use_begin(),
         UE = PromOps[i].getNode()->use_end();
         UI != UE; ++UI) {
      SDNode *User = *UI;
      if (User != N && !Visited.count(User))
        return SDValue();

      // Make sure that we're not going to promote the non-output-value
      // operand(s) or SELECT or SELECT_CC.
      // FIXME: Although we could sometimes handle this, and it does occur in
      // practice that one of the condition inputs to the select is also one of
      // the outputs, we currently can't deal with this.
      if (User->getOpcode() == ISD::SELECT) {
        if (User->getOperand(0) == PromOps[i])
          return SDValue();
      } else if (User->getOpcode() == ISD::SELECT_CC) {
        if (User->getOperand(0) == PromOps[i] ||
            User->getOperand(1) == PromOps[i])
          return SDValue();
      }
    }
  }

  // Replace all inputs with the extension operand.
  for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
    // Constants may have users outside the cluster of to-be-promoted nodes,
    // and so we need to replace those as we do the promotions.
    if (isa<ConstantSDNode>(Inputs[i]))
      continue;
    else
      DAG.ReplaceAllUsesOfValueWith(Inputs[i], Inputs[i].getOperand(0));
  }

  // Replace all operations (these are all the same, but have a different
  // (i1) return type). DAG.getNode will validate that the types of
  // a binary operator match, so go through the list in reverse so that
  // we've likely promoted both operands first. Any intermediate truncations or
  // extensions disappear.
  while (!PromOps.empty()) {
    SDValue PromOp = PromOps.back();
    PromOps.pop_back();

    if (PromOp.getOpcode() == ISD::TRUNCATE ||
        PromOp.getOpcode() == ISD::SIGN_EXTEND ||
        PromOp.getOpcode() == ISD::ZERO_EXTEND ||
        PromOp.getOpcode() == ISD::ANY_EXTEND) {
      if (!isa<ConstantSDNode>(PromOp.getOperand(0)) &&
          PromOp.getOperand(0).getValueType() != MVT::i1) {
        // The operand is not yet ready (see comment below).
        PromOps.insert(PromOps.begin(), PromOp);
        continue;
      }

      SDValue RepValue = PromOp.getOperand(0);
      if (isa<ConstantSDNode>(RepValue))
        RepValue = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, RepValue);

      DAG.ReplaceAllUsesOfValueWith(PromOp, RepValue);
      continue;
    }

    unsigned C;
    switch (PromOp.getOpcode()) {
    default:             C = 0; break;
    case ISD::SELECT:    C = 1; break;
    case ISD::SELECT_CC: C = 2; break;
    }

    if ((!isa<ConstantSDNode>(PromOp.getOperand(C)) &&
         PromOp.getOperand(C).getValueType() != MVT::i1) ||
        (!isa<ConstantSDNode>(PromOp.getOperand(C+1)) &&
         PromOp.getOperand(C+1).getValueType() != MVT::i1)) {
      // The to-be-promoted operands of this node have not yet been
      // promoted (this should be rare because we're going through the
      // list backward, but if one of the operands has several users in
      // this cluster of to-be-promoted nodes, it is possible).
      PromOps.insert(PromOps.begin(), PromOp);
      continue;
    }

    SmallVector<SDValue, 3> Ops(PromOp.getNode()->op_begin(),
                                PromOp.getNode()->op_end());

    // If there are any constant inputs, make sure they're replaced now.
    for (unsigned i = 0; i < 2; ++i)
      if (isa<ConstantSDNode>(Ops[C+i]))
        Ops[C+i] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ops[C+i]);

    DAG.ReplaceAllUsesOfValueWith(PromOp,
      DAG.getNode(PromOp.getOpcode(), dl, MVT::i1, Ops));
  }

  // Now we're left with the initial truncation itself.
  if (N->getOpcode() == ISD::TRUNCATE)
    return N->getOperand(0);

  // Otherwise, this is a comparison. The operands to be compared have just
  // changed type (to i1), but everything else is the same.
  return SDValue(N, 0);
}
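
// As an illustration (hypothetical DAG shapes, not code in this file), this
// combine rewrites
//   trunc(or(zext(a), zext(b)))   ; a, b of type i1
// into
//   or(a, b)
// performed directly in i1, so the values can stay in CR bits instead of
// round-tripping through GPRs.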
|
|
|
|
|
|
|
|
SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
|
|
|
|
DAGCombinerInfo &DCI) const {
|
|
|
|
SelectionDAG &DAG = DCI.DAG;
|
|
|
|
SDLoc dl(N);
|
|
|
|
|
|
|
|
// If we're tracking CR bits, we need to be careful that we don't have:
|
|
|
|
// zext(binary-ops(trunc(x), trunc(y)))
|
|
|
|
// or
|
|
|
|
// zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
|
|
|
|
// such that we're unnecessarily moving things into CR bits that can more
|
|
|
|
// efficiently stay in GPRs. Note that if we're not certain that the high
|
|
|
|
// bits are set as required by the final extension, we still may need to do
|
|
|
|
// some masking to get the proper behavior.
|
|
|
|
|
Remove extra truncs/exts around i32 bit operations on PPC64
This generalizes the code to eliminate extra truncs/exts around i1 bit
operations to also do the same on PPC64 for i32 bit operations. This eliminates
a fairly prevalent code wart:
int foo(int a) {
return a == 5 ? 7 : 8;
}
On PPC64, because of the extension implied by the ABI, this would generate:
cmplwi 0, 3, 5
li 12, 8
li 4, 7
isel 3, 4, 12, 2
rldicl 3, 3, 0, 32
blr
where the 'rldicl 3, 3, 0, 32', the extension, is completely unnecessary. At
least for the single-BB case (which is all that the DAG combine mechanism can
handle), this unnecessary extension is no longer generated.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@202600 91177308-0d34-0410-b5e6-96231b3b80d8
2014-03-01 21:36:57 +00:00
|
|
|
// This same functionality is important on PPC64 when dealing with
|
|
|
|
// 32-to-64-bit extensions; these occur often when 32-bit values are used as
|
|
|
|
// the return values of functions. Because it is so similar, it is handled
|
|
|
|
// here as well.
|
|
|
|
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because the throughput of these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@202451 91177308-0d34-0410-b5e6-96231b3b80d8
2014-02-28 00:27:01 +00:00
|
|
|
if (N->getValueType(0) != MVT::i32 &&
|
|
|
|
N->getValueType(0) != MVT::i64)
|
|
|
|
return SDValue();
|
|
|
|
|
Remove extra truncs/exts around i32 bit operations on PPC64
This generalizes the code to eliminate extra truncs/exts around i1 bit
operations to also do the same on PPC64 for i32 bit operations. This eliminates
a fairly prevalent code wart:
int foo(int a) {
return a == 5 ? 7 : 8;
}
On PPC64, because of the extension implied by the ABI, this would generate:
cmplwi 0, 3, 5
li 12, 8
li 4, 7
isel 3, 4, 12, 2
rldicl 3, 3, 0, 32
blr
where the 'rldicl 3, 3, 0, 32', the extension, is completely unnecessary. At
least for the single-BB case (which is all that the DAG combine mechanism can
handle), this unnecessary extension is no longer generated.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@202600 91177308-0d34-0410-b5e6-96231b3b80d8
2014-03-01 21:36:57 +00:00
|
|
|
if (!((N->getOperand(0).getValueType() == MVT::i1 &&
|
2014-06-12 22:38:18 +00:00
|
|
|
Subtarget.useCRBits()) ||
|
Remove extra truncs/exts around i32 bit operations on PPC64
This generalizes the code to eliminate extra truncs/exts around i1 bit
operations to also do the same on PPC64 for i32 bit operations. This eliminates
a fairly prevalent code wart:
int foo(int a) {
return a == 5 ? 7 : 8;
}
On PPC64, because of the extension implied by the ABI, this would generate:
cmplwi 0, 3, 5
li 12, 8
li 4, 7
isel 3, 4, 12, 2
rldicl 3, 3, 0, 32
blr
where the 'rldicl 3, 3, 0, 32', the extension, is completely unnecessary. At
least for the single-BB case (which is all that the DAG combine mechanism can
handle), this unnecessary extension is no longer generated.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@202600 91177308-0d34-0410-b5e6-96231b3b80d8
2014-03-01 21:36:57 +00:00
|
|
|
(N->getOperand(0).getValueType() == MVT::i32 &&
|
2014-06-12 22:38:18 +00:00
|
|
|
Subtarget.isPPC64())))
|
Add CR-bit tracking to the PowerPC backend for i1 values
This change enables tracking i1 values in the PowerPC backend using the
condition register bits. These bits can be treated on PowerPC as separate
registers; individual bit operations (and, or, xor, etc.) are supported.
Tracking booleans in CR bits has several advantages:
- Reduction in register pressure (because we no longer need GPRs to store
boolean values).
- Logical operations on booleans can be handled more efficiently; we used to
have to move all results from comparisons into GPRs, perform promoted
logical operations in GPRs, and then move the result back into condition
register bits to be used by conditional branches. This can be very
inefficient, because the throughput of these CR <-> GPR moves have high
latency and low throughput (especially when other associated instructions
are accounted for).
- On the POWER7 and similar cores, we can increase total throughput by using
the CR bits. CR bit operations have a dedicated functional unit.
Most of this is more-or-less mechanical: Adjustments were needed in the
calling-convention code, support was added for spilling/restoring individual
condition-register bits, and conditional branch instruction definitions taking
specific CR bits were added (plus patterns and code for generating bit-level
operations).
This is enabled by default when running at -O2 and higher. For -O0 and -O1,
where the ability to debug is more important, this feature is disabled by
default. Individual CR bits do not have assigned DWARF register numbers,
and storing values in CR bits makes them invisible to the debugger.
It is critical, however, that we don't move i1 values that have been promoted
to larger values (such as those passed as function arguments) into bit
registers only to quickly turn around and move the values back into GPRs (such
as happens when values are returned by functions). A pair of target-specific
DAG combines are added to remove the trunc/extends in:
trunc(binary-ops(binary-ops(zext(x), zext(y)), ...)
and:
zext(binary-ops(binary-ops(trunc(x), trunc(y)), ...)
In short, we only want to use CR bits where some of the i1 values come from
comparisons or are used by conditional branches or selects. To put it another
way, if we can do the entire i1 computation in GPRs, then we probably should
(on the POWER7, the GPR-operation throughput is higher, and for all cores, the
CR <-> GPR moves are expensive).
POWER7 test-suite performance results (from 10 runs in each configuration):
SingleSource/Benchmarks/Misc/mandel-2: 35% speedup
MultiSource/Benchmarks/Prolangs-C++/city/city: 21% speedup
MultiSource/Benchmarks/MiBench/automotive-susan: 23% speedup
SingleSource/Benchmarks/CoyoteBench/huffbench: 13% speedup
SingleSource/Benchmarks/Misc-C++/Large/sphereflake: 13% speedup
SingleSource/Benchmarks/Misc-C++/mandel-text: 10% speedup
SingleSource/Benchmarks/Misc-C++-EH/spirit: 10% slowdown
MultiSource/Applications/lemon/lemon: 8% slowdown
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@202451 91177308-0d34-0410-b5e6-96231b3b80d8
2014-02-28 00:27:01 +00:00
    return SDValue();

  if (N->getOperand(0).getOpcode() != ISD::AND &&
      N->getOperand(0).getOpcode() != ISD::OR &&
      N->getOperand(0).getOpcode() != ISD::XOR &&
      N->getOperand(0).getOpcode() != ISD::SELECT &&
      N->getOperand(0).getOpcode() != ISD::SELECT_CC)
    return SDValue();

  SmallVector<SDValue, 4> Inputs;
  SmallVector<SDValue, 8> BinOps(1, N->getOperand(0)), PromOps;
  SmallPtrSet<SDNode *, 16> Visited;

  // Visit all inputs, collect all binary operations (and, or, xor and
  // select) that are all fed by truncations.
  while (!BinOps.empty()) {
    SDValue BinOp = BinOps.back();
    BinOps.pop_back();

    if (!Visited.insert(BinOp.getNode()))
      continue;

    PromOps.push_back(BinOp);

    for (unsigned i = 0, ie = BinOp.getNumOperands(); i != ie; ++i) {
      // The condition of the select is not promoted.
      if (BinOp.getOpcode() == ISD::SELECT && i == 0)
        continue;
      if (BinOp.getOpcode() == ISD::SELECT_CC && i != 2 && i != 3)
        continue;

      if (BinOp.getOperand(i).getOpcode() == ISD::TRUNCATE ||
          isa<ConstantSDNode>(BinOp.getOperand(i))) {
        Inputs.push_back(BinOp.getOperand(i));
      } else if (BinOp.getOperand(i).getOpcode() == ISD::AND ||
                 BinOp.getOperand(i).getOpcode() == ISD::OR ||
                 BinOp.getOperand(i).getOpcode() == ISD::XOR ||
                 BinOp.getOperand(i).getOpcode() == ISD::SELECT ||
                 BinOp.getOperand(i).getOpcode() == ISD::SELECT_CC) {
        BinOps.push_back(BinOp.getOperand(i));
      } else {
        // We have an input that is not a truncation or another binary
        // operation; we'll abort this transformation.
        return SDValue();
      }
    }
  }

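  // The loop above is a standard iterative DFS over the cluster of binary
  // operators. Stripped of the SelectionDAG details, the shape is (a sketch
  // with a hypothetical Node type, not LLVM API):
  //
  //   std::vector<Node *> Work = {Root};
  //   std::set<Node *> Seen;
  //   while (!Work.empty()) {
  //     Node *Cur = Work.back();
  //     Work.pop_back();
  //     if (!Seen.insert(Cur).second)
  //       continue;
  //     for (Node *Op : Cur->operands())
  //       Work.push_back(Op); // or record it as a leaf input
  //   }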
  // Make sure that this is a self-contained cluster of operations (which
  // is not quite the same thing as saying that everything has only one
  // use).
  for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
    if (isa<ConstantSDNode>(Inputs[i]))
      continue;

    for (SDNode::use_iterator UI = Inputs[i].getNode()->use_begin(),
         UE = Inputs[i].getNode()->use_end();
         UI != UE; ++UI) {
      SDNode *User = *UI;
      if (User != N && !Visited.count(User))
        return SDValue();
Remove extra truncs/exts around i32 bit operations on PPC64
This generalizes the code to eliminate extra truncs/exts around i1 bit
operations to also do the same on PPC64 for i32 bit operations. This eliminates
a fairly prevalent code wart:
int foo(int a) {
return a == 5 ? 7 : 8;
}
On PPC64, because of the extension implied by the ABI, this would generate:
cmplwi 0, 3, 5
li 12, 8
li 4, 7
isel 3, 4, 12, 2
rldicl 3, 3, 0, 32
blr
where the 'rldicl 3, 3, 0, 32', the extension, is completely unnecessary. At
least for the single-BB case (which is all that the DAG combine mechanism can
handle), this unnecessary extension is no longer generated.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@202600 91177308-0d34-0410-b5e6-96231b3b80d8
2014-03-01 21:36:57 +00:00
      // Make sure that we're not going to promote the non-output-value
      // operand(s) of SELECT or SELECT_CC.
      // FIXME: Although we could sometimes handle this, and it does occur in
      // practice that one of the condition inputs to the select is also one of
      // the outputs, we currently can't deal with this.
      if (User->getOpcode() == ISD::SELECT) {
        if (User->getOperand(0) == Inputs[i])
          return SDValue();
      } else if (User->getOpcode() == ISD::SELECT_CC) {
        if (User->getOperand(0) == Inputs[i] ||
            User->getOperand(1) == Inputs[i])
          return SDValue();
      }
    }
  }

  for (unsigned i = 0, ie = PromOps.size(); i != ie; ++i) {
    for (SDNode::use_iterator UI = PromOps[i].getNode()->use_begin(),
         UE = PromOps[i].getNode()->use_end();
         UI != UE; ++UI) {
      SDNode *User = *UI;
      if (User != N && !Visited.count(User))
        return SDValue();
      // Make sure that we're not going to promote the non-output-value
      // operand(s) of SELECT or SELECT_CC.
      // FIXME: Although we could sometimes handle this, and it does occur in
      // practice that one of the condition inputs to the select is also one of
      // the outputs, we currently can't deal with this.
      if (User->getOpcode() == ISD::SELECT) {
        if (User->getOperand(0) == PromOps[i])
          return SDValue();
      } else if (User->getOpcode() == ISD::SELECT_CC) {
        if (User->getOperand(0) == PromOps[i] ||
            User->getOperand(1) == PromOps[i])
          return SDValue();
      }
    }
  }

  unsigned PromBits = N->getOperand(0).getValueSizeInBits();
  bool ReallyNeedsExt = false;
  if (N->getOpcode() != ISD::ANY_EXTEND) {
    // If the inputs are not all already sign/zero extended, then
    // we'll still need to do that at the end.
    for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
      if (isa<ConstantSDNode>(Inputs[i]))
        continue;

      unsigned OpBits =
        Inputs[i].getOperand(0).getValueSizeInBits();
      assert(PromBits < OpBits && "Truncation not to a smaller bit count?");

      if ((N->getOpcode() == ISD::ZERO_EXTEND &&
           !DAG.MaskedValueIsZero(Inputs[i].getOperand(0),
                                  APInt::getHighBitsSet(OpBits,
                                                        OpBits-PromBits))) ||
          (N->getOpcode() == ISD::SIGN_EXTEND &&
           DAG.ComputeNumSignBits(Inputs[i].getOperand(0)) <
             (OpBits-(PromBits-1)))) {
        ReallyNeedsExt = true;
        break;
      }
    }
  }

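  // Concretely, for a zero extension of an i1 cluster to i32 (PromBits == 1,
  // OpBits == 32), the check above asks MaskedValueIsZero to prove that the
  // top 31 bits are already zero; for a sign extension it asks for at least
  // OpBits - (PromBits - 1) == 32 sign bits, i.e. every bit is a copy of the
  // sign bit. If the proof fails for some input, ReallyNeedsExt forces a
  // real extension at the end.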
  // Replace all inputs, either with the truncation operand, or a
  // truncation or extension to the final output type.
  for (unsigned i = 0, ie = Inputs.size(); i != ie; ++i) {
    // Constant inputs need to be replaced with the to-be-promoted nodes that
    // use them because they might have users outside of the cluster of
    // promoted nodes.
    if (isa<ConstantSDNode>(Inputs[i]))
      continue;

    SDValue InSrc = Inputs[i].getOperand(0);
    if (Inputs[i].getValueType() == N->getValueType(0))
      DAG.ReplaceAllUsesOfValueWith(Inputs[i], InSrc);
    else if (N->getOpcode() == ISD::SIGN_EXTEND)
      DAG.ReplaceAllUsesOfValueWith(Inputs[i],
        DAG.getSExtOrTrunc(InSrc, dl, N->getValueType(0)));
    else if (N->getOpcode() == ISD::ZERO_EXTEND)
      DAG.ReplaceAllUsesOfValueWith(Inputs[i],
        DAG.getZExtOrTrunc(InSrc, dl, N->getValueType(0)));
    else
      DAG.ReplaceAllUsesOfValueWith(Inputs[i],
        DAG.getAnyExtOrTrunc(InSrc, dl, N->getValueType(0)));
  }

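  // Note that DAG.getSExtOrTrunc (and its Zext/AnyExt variants) compare the
  // source and destination types themselves: they emit an extension, a
  // truncation, or effectively nothing when the types already match, so the
  // replacement above is safe regardless of which side of the promotion an
  // input sits on.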
  // Replace all operations (these are all the same, but have a different
  // (promoted) return type). DAG.getNode will validate that the types of
  // a binary operator match, so go through the list in reverse so that
  // we've likely promoted both operands first.
  while (!PromOps.empty()) {
    SDValue PromOp = PromOps.back();
    PromOps.pop_back();

    unsigned C;
    switch (PromOp.getOpcode()) {
    default:             C = 0; break;
    case ISD::SELECT:    C = 1; break;
    case ISD::SELECT_CC: C = 2; break;
    }

    if ((!isa<ConstantSDNode>(PromOp.getOperand(C)) &&
         PromOp.getOperand(C).getValueType() != N->getValueType(0)) ||
        (!isa<ConstantSDNode>(PromOp.getOperand(C+1)) &&
         PromOp.getOperand(C+1).getValueType() != N->getValueType(0))) {
      // The to-be-promoted operands of this node have not yet been
      // promoted (this should be rare because we're going through the
      // list backward, but if one of the operands has several users in
      // this cluster of to-be-promoted nodes, it is possible).
      PromOps.insert(PromOps.begin(), PromOp);
      continue;
    }

    SmallVector<SDValue, 3> Ops(PromOp.getNode()->op_begin(),
                                PromOp.getNode()->op_end());

    // If this node has constant inputs, then they'll need to be promoted here.
    for (unsigned i = 0; i < 2; ++i) {
      if (!isa<ConstantSDNode>(Ops[C+i]))
        continue;
      if (Ops[C+i].getValueType() == N->getValueType(0))
        continue;

      if (N->getOpcode() == ISD::SIGN_EXTEND)
        Ops[C+i] = DAG.getSExtOrTrunc(Ops[C+i], dl, N->getValueType(0));
      else if (N->getOpcode() == ISD::ZERO_EXTEND)
        Ops[C+i] = DAG.getZExtOrTrunc(Ops[C+i], dl, N->getValueType(0));
      else
        Ops[C+i] = DAG.getAnyExtOrTrunc(Ops[C+i], dl, N->getValueType(0));
    }

    DAG.ReplaceAllUsesOfValueWith(PromOp,
      DAG.getNode(PromOp.getOpcode(), dl, N->getValueType(0), Ops));
  }

  // Now we're left with the initial extension itself.
  if (!ReallyNeedsExt)
    return N->getOperand(0);

  // To zero extend, just mask off everything except for the first bit (in the
  // i1 case).
  if (N->getOpcode() == ISD::ZERO_EXTEND)
    return DAG.getNode(ISD::AND, dl, N->getValueType(0), N->getOperand(0),
                       DAG.getConstant(APInt::getLowBitsSet(
                                         N->getValueSizeInBits(0), PromBits),
                                       N->getValueType(0)));
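  // For the common i1-to-i32 case this mask is APInt::getLowBitsSet(32, 1),
  // i.e. the constant 1, so the zero extension of the promoted value becomes
  // a simple (and x, 1).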

  assert(N->getOpcode() == ISD::SIGN_EXTEND &&
         "Invalid extension type");
  EVT ShiftAmountTy = getShiftAmountTy(N->getValueType(0));
  SDValue ShiftCst =
    DAG.getConstant(N->getValueSizeInBits(0)-PromBits, ShiftAmountTy);
  return DAG.getNode(ISD::SRA, dl, N->getValueType(0),
                     DAG.getNode(ISD::SHL, dl, N->getValueType(0),
                                 N->getOperand(0), ShiftCst), ShiftCst);
}

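// The shl/sra pair above is the usual in-register sign extension. At the C
// level (an illustration only, assuming an arithmetic right shift as on PPC),
// sign-extending bit 0 of a 32-bit value is:
//
//   int32_t sext_i1(int32_t x) { return (x << 31) >> 31; }  // 0 or -1
//
// with PromBits == 1 giving the shift amount 32 - 1 == 31.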
SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  const TargetMachine &TM = getTargetMachine();
  SelectionDAG &DAG = DCI.DAG;
  SDLoc dl(N);
  switch (N->getOpcode()) {
  default: break;
Fold the PPCISD shifts when presented with 0 inputs. This occurs for code
like:
long long test(long long X, int Y) {
return 1ULL << Y;
}
long long test2(long long X, int Y) {
return -1LL << Y;
}
which we used to compile to:
_test:
li r2, 1
subfic r3, r5, 32
li r4, 0
addi r6, r5, -32
srw r3, r2, r3
slw r4, r4, r5
slw r6, r2, r6
or r3, r4, r3
slw r4, r2, r5
or r3, r3, r6
blr
_test2:
li r2, -1
subfic r3, r5, 32
addi r6, r5, -32
srw r3, r2, r3
slw r4, r2, r5
slw r2, r2, r6
or r3, r4, r3
or r3, r3, r2
blr
Now we produce:
_test:
li r2, 1
addi r3, r5, -32
subfic r4, r5, 32
slw r3, r2, r3
srw r4, r2, r4
or r3, r4, r3
slw r4, r2, r5
blr
_test2:
li r2, -1
subfic r3, r5, 32
addi r6, r5, -32
srw r3, r2, r3
slw r4, r2, r5
slw r2, r2, r6
or r3, r4, r3
or r3, r3, r2
blr
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@30479 91177308-0d34-0410-b5e6-96231b3b80d8
2006-09-19 05:22:59 +00:00
  case PPCISD::SHL:
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
      if (C->isNullValue()) // 0 << V -> 0.
        return N->getOperand(0);
    }
    break;
  case PPCISD::SRL:
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
      if (C->isNullValue()) // 0 >>u V -> 0.
        return N->getOperand(0);
    }
    break;
  case PPCISD::SRA:
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
      if (C->isNullValue() ||   // 0 >>s V -> 0.
          C->isAllOnesValue()) // -1 >>s V -> -1.
        return N->getOperand(0);
    }
    break;
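  // These three folds are pure algebraic identities on the PPC-specific
  // 64-bit-in-32-bit shift nodes: 0 shifted by anything is 0, and -1
  // arithmetically shifted right by anything is still -1, so the constant
  // operand can simply be forwarded.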
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
    return DAGCombineExtBoolTrunc(N, DCI);
  case ISD::TRUNCATE:
  case ISD::SETCC:
  case ISD::SELECT_CC:
    return DAGCombineTruncBoolExt(N, DCI);
  case ISD::FDIV: {
    assert(TM.Options.UnsafeFPMath &&
           "Reciprocal estimates require UnsafeFPMath");

    if (N->getOperand(1).getOpcode() == ISD::FSQRT) {
      SDValue RV =
        DAGCombineFastRecipFSQRT(N->getOperand(1).getOperand(0), DCI);
      if (RV.getNode()) {
        DCI.AddToWorklist(RV.getNode());
        return DAG.getNode(ISD::FMUL, dl, N->getValueType(0),
                           N->getOperand(0), RV);
      }
    } else if (N->getOperand(1).getOpcode() == ISD::FP_EXTEND &&
               N->getOperand(1).getOperand(0).getOpcode() == ISD::FSQRT) {
      SDValue RV =
        DAGCombineFastRecipFSQRT(N->getOperand(1).getOperand(0).getOperand(0),
                                 DCI);
      if (RV.getNode()) {
        DCI.AddToWorklist(RV.getNode());
        RV = DAG.getNode(ISD::FP_EXTEND, SDLoc(N->getOperand(1)),
                         N->getValueType(0), RV);
        DCI.AddToWorklist(RV.getNode());
        return DAG.getNode(ISD::FMUL, dl, N->getValueType(0),
                           N->getOperand(0), RV);
      }
    } else if (N->getOperand(1).getOpcode() == ISD::FP_ROUND &&
               N->getOperand(1).getOperand(0).getOpcode() == ISD::FSQRT) {
      SDValue RV =
        DAGCombineFastRecipFSQRT(N->getOperand(1).getOperand(0).getOperand(0),
                                 DCI);
      if (RV.getNode()) {
        DCI.AddToWorklist(RV.getNode());
        RV = DAG.getNode(ISD::FP_ROUND, SDLoc(N->getOperand(1)),
                         N->getValueType(0), RV,
                         N->getOperand(1).getOperand(1));
        DCI.AddToWorklist(RV.getNode());
        return DAG.getNode(ISD::FMUL, dl, N->getValueType(0),
                           N->getOperand(0), RV);
      }
    }

    SDValue RV = DAGCombineFastRecip(N->getOperand(1), DCI);
    if (RV.getNode()) {
      DCI.AddToWorklist(RV.getNode());
      return DAG.getNode(ISD::FMUL, dl, N->getValueType(0),
                         N->getOperand(0), RV);
    }

    }
    break;
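  // DAGCombineFastRecip and DAGCombineFastRecipFSQRT (defined earlier in this
  // file) start from the hardware estimate instructions and refine the result
  // with Newton-Raphson iterations. Schematically, for scalars (an
  // illustration, not the DAG code):
  //
  //   double refine_recip(double b, double r) {  // r ~ 1/b
  //     return r * (2.0 - b * r);                // error is squared each step
  //   }
  //   double refine_rsqrt(double x, double r) {  // r ~ 1/sqrt(x)
  //     return (r * (3.0 - x * r * r)) * 0.5;
  //   }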
case ISD::FSQRT: {
|
|
|
|
assert(TM.Options.UnsafeFPMath &&
|
|
|
|
"Reciprocal estimates require UnsafeFPMath");
|
|
|
|
|
|
|
|
// Compute this as 1/(1/sqrt(X)), which is the reciprocal of the
|
|
|
|
// reciprocal sqrt.
|
2013-04-03 17:44:56 +00:00
|
|
|
SDValue RV = DAGCombineFastRecipFSQRT(N->getOperand(0), DCI);
|
2014-04-25 05:30:21 +00:00
|
|
|
if (RV.getNode()) {
|
2013-04-03 04:01:11 +00:00
|
|
|
DCI.AddToWorklist(RV.getNode());
|
2013-04-03 17:44:56 +00:00
|
|
|
RV = DAGCombineFastRecip(RV, DCI);
|
2014-04-25 05:30:21 +00:00
|
|
|
if (RV.getNode()) {
|
2014-05-30 22:47:48 +00:00
|
|
|
// Unfortunately, RV is now NaN if the input was exactly 0. Select out
|
|
|
|
// this case and force the answer to 0.
|
2013-09-12 19:04:12 +00:00
|
|
|
|
|
|
|
EVT VT = RV.getValueType();
|
|
|
|
|
|
|
|
SDValue Zero = DAG.getConstantFP(0.0, VT.getScalarType());
|
|
|
|
if (VT.isVector()) {
|
|
|
|
assert(VT.getVectorNumElements() == 4 && "Unknown vector type");
|
|
|
|
Zero = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Zero, Zero, Zero, Zero);
|
|
|
|
}
|
|
|
|
|
|
|
|
SDValue ZeroCmp =
|
|
|
|
DAG.getSetCC(dl, getSetCCResultType(*DAG.getContext(), VT),
|
|
|
|
N->getOperand(0), Zero, ISD::SETEQ);
|
|
|
|
DCI.AddToWorklist(ZeroCmp.getNode());
|
|
|
|
DCI.AddToWorklist(RV.getNode());
|
|
|
|
|
|
|
|
RV = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT, dl, VT,
|
|
|
|
ZeroCmp, Zero, RV);
|
2013-04-03 04:01:11 +00:00
|
|
|
return RV;
|
2013-09-12 19:04:12 +00:00
|
|
|
}
|
2013-04-03 04:01:11 +00:00
|
|
|
}
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2013-04-03 04:01:11 +00:00
|
|
|
}
|
|
|
|
break;
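
  // In other words, fsqrt x is rebuilt as recip-estimate(rsqrt-estimate(x)).
  // The final (v)select maps an x == 0 input back to 0.0, because the
  // refined estimate chain yields NaN there, while sqrt(0) must be 0.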
  case ISD::SINT_TO_FP:
    if (TM.getSubtarget<PPCSubtarget>().has64BitSupport()) {
      if (N->getOperand(0).getOpcode() == ISD::FP_TO_SINT) {
        // Turn (sint_to_fp (fp_to_sint X)) -> fctidz/fcfid without load/stores.
        // We allow the src/dst to be either f32/f64, but the intermediate
        // type must be i64.
        if (N->getOperand(0).getValueType() == MVT::i64 &&
            N->getOperand(0).getOperand(0).getValueType() != MVT::ppcf128) {
          SDValue Val = N->getOperand(0).getOperand(0);
          if (Val.getValueType() == MVT::f32) {
            Val = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Val);
            DCI.AddToWorklist(Val.getNode());
          }

          Val = DAG.getNode(PPCISD::FCTIDZ, dl, MVT::f64, Val);
          DCI.AddToWorklist(Val.getNode());
          Val = DAG.getNode(PPCISD::FCFID, dl, MVT::f64, Val);
          DCI.AddToWorklist(Val.getNode());
          if (N->getValueType(0) == MVT::f32) {
            Val = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, Val,
                              DAG.getIntPtrConstant(0));
            DCI.AddToWorklist(Val.getNode());
          }
          return Val;
        } else if (N->getOperand(0).getValueType() == MVT::i32) {
          // If the intermediate type is i32, we can avoid the load/store here
          // too.
        }
      }
    }
    break;
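
  // Illustrative effect: for "double f(double x) { return (double)(long)x; }"
  // the value now stays in an FPR, converted via fctidz and brought back with
  // fcfid, instead of taking a store/reload round trip through a stack slot.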
  case ISD::STORE:
    // Turn STORE (FP_TO_SINT F) -> STFIWX(FCTIWZ(F)).
    if (TM.getSubtarget<PPCSubtarget>().hasSTFIWX() &&
        !cast<StoreSDNode>(N)->isTruncatingStore() &&
        N->getOperand(1).getOpcode() == ISD::FP_TO_SINT &&
        N->getOperand(1).getValueType() == MVT::i32 &&
        N->getOperand(1).getOperand(0).getValueType() != MVT::ppcf128) {
      SDValue Val = N->getOperand(1).getOperand(0);
      if (Val.getValueType() == MVT::f32) {
        Val = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Val);
        DCI.AddToWorklist(Val.getNode());
      }
      Val = DAG.getNode(PPCISD::FCTIWZ, dl, MVT::f64, Val);
      DCI.AddToWorklist(Val.getNode());

      SDValue Ops[] = {
        N->getOperand(0), Val, N->getOperand(2),
        DAG.getValueType(N->getOperand(1).getValueType())
      };

      Val = DAG.getMemIntrinsicNode(PPCISD::STFIWX, dl,
              DAG.getVTList(MVT::Other), Ops,
              cast<StoreSDNode>(N)->getMemoryVT(),
              cast<StoreSDNode>(N)->getMemOperand());
      DCI.AddToWorklist(Val.getNode());
      return Val;
    }

    // Turn STORE (BSWAP) -> sthbrx/stwbrx.
    if (cast<StoreSDNode>(N)->isUnindexed() &&
        N->getOperand(1).getOpcode() == ISD::BSWAP &&
        N->getOperand(1).getNode()->hasOneUse() &&
        (N->getOperand(1).getValueType() == MVT::i32 ||
         N->getOperand(1).getValueType() == MVT::i16 ||
         (TM.getSubtarget<PPCSubtarget>().hasLDBRX() &&
          TM.getSubtarget<PPCSubtarget>().isPPC64() &&
          N->getOperand(1).getValueType() == MVT::i64))) {
      SDValue BSwapOp = N->getOperand(1).getOperand(0);
      // Do an any-extend to 32-bits if this is a half-word input.
      if (BSwapOp.getValueType() == MVT::i16)
        BSwapOp = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, BSwapOp);

      SDValue Ops[] = {
        N->getOperand(0), BSwapOp, N->getOperand(2),
        DAG.getValueType(N->getOperand(1).getValueType())
      };
      return
        DAG.getMemIntrinsicNode(PPCISD::STBRX, dl, DAG.getVTList(MVT::Other),
                                Ops, cast<StoreSDNode>(N)->getMemoryVT(),
                                cast<StoreSDNode>(N)->getMemOperand());
    }
    break;
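
  // Illustrative effect of the two combines above:
  //   store (fp_to_sint f), p  -->  stfiwx(fctiwz f), p, storing the
  //     converted integer straight from the FPR with no stack temporary;
  //   store (bswap x), p       -->  stwbrx/sthbrx (or a 64-bit byte-reversed
  //     store on subtargets with hasLDBRX()), folding the swap into the store.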
  case ISD::LOAD: {
    LoadSDNode *LD = cast<LoadSDNode>(N);
    EVT VT = LD->getValueType(0);
    Type *Ty = LD->getMemoryVT().getTypeForEVT(*DAG.getContext());
    unsigned ABIAlignment = getDataLayout()->getABITypeAlignment(Ty);
    if (ISD::isNON_EXTLoad(N) && VT.isVector() &&
        TM.getSubtarget<PPCSubtarget>().hasAltivec() &&
        (VT == MVT::v16i8 || VT == MVT::v8i16 ||
         VT == MVT::v4i32 || VT == MVT::v4f32) &&
        LD->getAlignment() < ABIAlignment) {
      // This is a type-legal unaligned Altivec load.
      SDValue Chain = LD->getChain();
      SDValue Ptr = LD->getBasePtr();
      bool isLittleEndian = Subtarget.isLittleEndian();

      // This implements the loading of unaligned vectors as described in
      // the venerable Apple Velocity Engine overview. Specifically:
      // https://developer.apple.com/hardwaredrivers/ve/alignment.html
      // https://developer.apple.com/hardwaredrivers/ve/code_optimization.html
      //
      // The general idea is to expand a sequence of one or more unaligned
      // loads into an alignment-based permutation-control instruction (lvsl
      // or lvsr), a series of regular vector loads (which always truncate
      // their input address to an aligned address), and a series of
      // permutations. The results of these permutations are the requested
      // loaded values. The trick is that the last "extra" load is not taken
      // from the address you might suspect (sizeof(vector) bytes after the
      // last requested load), but rather sizeof(vector) - 1 bytes after the
      // last requested vector. The point of this is to avoid a page fault if
      // the base address happened to be aligned. This works because if the
      // base address is aligned, then adding less than a full vector length
      // will cause the last vector in the sequence to be (re)loaded.
      // Otherwise, the next vector will be fetched as you might suspect was
      // necessary.

      // We might be able to reuse the permutation generation from
      // a different base address offset from this one by an aligned amount.
      // The INTRINSIC_WO_CHAIN DAG combine will attempt to perform this
      // optimization later.
      Intrinsic::ID Intr = (isLittleEndian ?
                            Intrinsic::ppc_altivec_lvsr :
                            Intrinsic::ppc_altivec_lvsl);
      SDValue PermCntl = BuildIntrinsicOp(Intr, Ptr, DAG, dl, MVT::v16i8);

      // Refine the alignment of the original load (a "new" load created here
      // which was identical to the first except for the alignment would be
      // merged with the existing node regardless).
      MachineFunction &MF = DAG.getMachineFunction();
      MachineMemOperand *MMO =
        MF.getMachineMemOperand(LD->getPointerInfo(),
                                LD->getMemOperand()->getFlags(),
                                LD->getMemoryVT().getStoreSize(),
                                ABIAlignment);
      LD->refineAlignment(MMO);
      SDValue BaseLoad = SDValue(LD, 0);

      // Note that the value of IncOffset (which is provided to the next
      // load's pointer info offset value, and thus used to calculate the
      // alignment), and the value of IncValue (which is actually used to
      // increment the pointer value) are different! This is because we
      // require the next load to appear to be aligned, even though it
      // is actually offset from the base pointer by a lesser amount.
      int IncOffset = VT.getSizeInBits() / 8;
      int IncValue = IncOffset;

      // Walk (both up and down) the chain looking for another load at the real
      // (aligned) offset (the alignment of the other load does not matter in
      // this case). If found, then do not use the offset reduction trick, as
      // that will prevent the loads from being later combined (as they would
      // otherwise be duplicates).
      if (!findConsecutiveLoad(LD, DAG))
        --IncValue;

      SDValue Increment = DAG.getConstant(IncValue, getPointerTy());
      Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);

      SDValue ExtraLoad =
        DAG.getLoad(VT, dl, Chain, Ptr,
                    LD->getPointerInfo().getWithOffset(IncOffset),
                    LD->isVolatile(), LD->isNonTemporal(),
                    LD->isInvariant(), ABIAlignment);

      SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
                               BaseLoad.getValue(1), ExtraLoad.getValue(1));

      if (BaseLoad.getValueType() != MVT::v4i32)
        BaseLoad = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, BaseLoad);

      if (ExtraLoad.getValueType() != MVT::v4i32)
        ExtraLoad = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, ExtraLoad);

      // Because vperm has a big-endian bias, we must reverse the order
      // of the input vectors and complement the permute control vector
      // when generating little endian code. We have already handled the
      // latter by using lvsr instead of lvsl, so just reverse BaseLoad
      // and ExtraLoad here.
      SDValue Perm;
      if (isLittleEndian)
        Perm = BuildIntrinsicOp(Intrinsic::ppc_altivec_vperm,
                                ExtraLoad, BaseLoad, PermCntl, DAG, dl);
      else
        Perm = BuildIntrinsicOp(Intrinsic::ppc_altivec_vperm,
                                BaseLoad, ExtraLoad, PermCntl, DAG, dl);

      if (VT != MVT::v4i32)
        Perm = DAG.getNode(ISD::BITCAST, dl, VT, Perm);

      // Now we need to be really careful about how we update the users of the
      // original load. We cannot just call DCI.CombineTo (or
      // DAG.ReplaceAllUsesWith for that matter), because the load still has
      // uses created here (the permutation for example) that need to stay.
      SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
      while (UI != UE) {
        SDUse &Use = UI.getUse();
        SDNode *User = *UI;
        // Note: BaseLoad is checked here because it might not be N, but a
        // bitcast of N.
        if (User == Perm.getNode() || User == BaseLoad.getNode() ||
            User == TF.getNode() || Use.getResNo() > 1) {
          ++UI;
          continue;
        }

        SDValue To = Use.getResNo() ? TF : Perm;
        ++UI;

        SmallVector<SDValue, 8> Ops;
        for (const SDUse &O : User->ops()) {
          if (O == Use)
            Ops.push_back(To);
          else
            Ops.push_back(O);
        }

        DAG.UpdateNodeOperands(User, Ops);
      }

      return SDValue(N, 0);
    }
    break;
  }
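
  // A sketch of the DAG produced for one unaligned v4i32 load on a
  // big-endian subtarget (little-endian swaps the vperm inputs and uses
  // lvsr instead):
  //   PermCntl  = lvsl Ptr
  //   BaseLoad  = lvx Ptr          ; lvx ignores the low four address bits
  //   ExtraLoad = lvx (Ptr + 15)   ; 16 - 1, so an aligned base cannot fault
  //   Result    = vperm BaseLoad, ExtraLoad, PermCntl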
  case ISD::INTRINSIC_WO_CHAIN: {
    bool isLittleEndian = Subtarget.isLittleEndian();
    Intrinsic::ID Intr = (isLittleEndian ?
                          Intrinsic::ppc_altivec_lvsr :
                          Intrinsic::ppc_altivec_lvsl);
    if (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue() == Intr &&
        N->getOperand(1)->getOpcode() == ISD::ADD) {
      SDValue Add = N->getOperand(1);

      if (DAG.MaskedValueIsZero(Add->getOperand(1),
            APInt::getAllOnesValue(4 /* 16 byte alignment */).zext(
              Add.getValueType().getScalarType().getSizeInBits()))) {
        SDNode *BasePtr = Add->getOperand(0).getNode();
        for (SDNode::use_iterator UI = BasePtr->use_begin(),
             UE = BasePtr->use_end(); UI != UE; ++UI) {
          if (UI->getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
              cast<ConstantSDNode>(UI->getOperand(0))->getZExtValue() ==
                Intr) {
            // We've found another LVSL/LVSR, and this address is an aligned
            // multiple of that one. The results will be the same, so use the
            // one we've just found instead.

            return SDValue(*UI, 0);
          }
        }
      }
    }
    break;
  }
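
  // Rationale: lvsl/lvsr depend only on the low four bits of the address,
  // so lvsl(base + 16*k) == lvsl(base); any existing LVSL/LVSR on the same
  // base pointer can therefore be reused here.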
  case ISD::BSWAP:
    // Turn BSWAP (LOAD) -> lhbrx/lwbrx.
    if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) &&
        N->getOperand(0).hasOneUse() &&
        (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i16 ||
         (TM.getSubtarget<PPCSubtarget>().hasLDBRX() &&
          TM.getSubtarget<PPCSubtarget>().isPPC64() &&
          N->getValueType(0) == MVT::i64))) {
      SDValue Load = N->getOperand(0);
      LoadSDNode *LD = cast<LoadSDNode>(Load);
      // Create the byte-swapping load.
      SDValue Ops[] = {
        LD->getChain(),    // Chain
        LD->getBasePtr(),  // Ptr
        DAG.getValueType(N->getValueType(0)) // VT
      };
      SDValue BSLoad =
        DAG.getMemIntrinsicNode(PPCISD::LBRX, dl,
                                DAG.getVTList(N->getValueType(0) == MVT::i64 ?
                                              MVT::i64 : MVT::i32, MVT::Other),
                                Ops, LD->getMemoryVT(), LD->getMemOperand());

      // If this is an i16 load, insert the truncate.
      SDValue ResVal = BSLoad;
      if (N->getValueType(0) == MVT::i16)
        ResVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, BSLoad);

      // First, combine the bswap away. This makes the value produced by the
      // load dead.
      DCI.CombineTo(N, ResVal);

      // Next, combine the load away, we give it a bogus result value but a real
      // chain result. The result value is dead because the bswap is dead.
      DCI.CombineTo(Load.getNode(), ResVal, BSLoad.getValue(1));

      // Return N so it doesn't get rechecked!
      return SDValue(N, 0);
    }
    break;
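
  // Illustrative effect: "bswap (load i32 p)" becomes a single lwbrx (lhbrx
  // for i16, and a 64-bit byte-reversed load on subtargets with hasLDBRX()),
  // leaving no separate byte-swap instructions in the stream.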
  case PPCISD::VCMP: {
    // If a VCMPo node already exists with exactly the same operands as this
    // node, use its result instead of this node (VCMPo computes both a CR6 and
    // a normal output).
    //
    if (!N->getOperand(0).hasOneUse() &&
        !N->getOperand(1).hasOneUse() &&
        !N->getOperand(2).hasOneUse()) {

      // Scan all of the users of the LHS, looking for VCMPo's that match.
      SDNode *VCMPoNode = nullptr;

      SDNode *LHSN = N->getOperand(0).getNode();
      for (SDNode::use_iterator UI = LHSN->use_begin(), E = LHSN->use_end();
           UI != E; ++UI)
        if (UI->getOpcode() == PPCISD::VCMPo &&
            UI->getOperand(1) == N->getOperand(1) &&
            UI->getOperand(2) == N->getOperand(2) &&
            UI->getOperand(0) == N->getOperand(0)) {
          VCMPoNode = *UI;
          break;
        }

      // If there is no VCMPo node, or if the flag value has a single use, don't
      // transform this.
      if (!VCMPoNode || VCMPoNode->hasNUsesOfValue(0, 1))
        break;

      // Look at the (necessarily single) use of the flag value. If it has a
      // chain, this transformation is more complex. Note that multiple things
      // could use the value result, which we should ignore.
      SDNode *FlagUser = nullptr;
      for (SDNode::use_iterator UI = VCMPoNode->use_begin();
           FlagUser == nullptr; ++UI) {
        assert(UI != VCMPoNode->use_end() && "Didn't find user!");
        SDNode *User = *UI;
        for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) {
          if (User->getOperand(i) == SDValue(VCMPoNode, 1)) {
            FlagUser = User;
            break;
          }
        }
      }

      // If the user is a MFOCRF instruction, we know this is safe.
      // Otherwise we give up for right now.
      if (FlagUser->getOpcode() == PPCISD::MFOCRF)
        return SDValue(VCMPoNode, 0);
    }
    break;
  }
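
  // Net effect: a plain vcmp* whose operands match an existing record-form
  // vcmp*. node is folded away, leaving one instruction that produces both
  // the vector result and the CR6 predicate bits.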
  case ISD::BRCOND: {
    SDValue Cond = N->getOperand(1);
    SDValue Target = N->getOperand(2);

    if (Cond.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
        cast<ConstantSDNode>(Cond.getOperand(1))->getZExtValue() ==
          Intrinsic::ppc_is_decremented_ctr_nonzero) {

      // We now need to make the intrinsic dead (it cannot be instruction
      // selected).
      DAG.ReplaceAllUsesOfValueWith(Cond.getValue(1), Cond.getOperand(0));
      assert(Cond.getNode()->hasOneUse() &&
             "Counter decrement has more than one use");

      return DAG.getNode(PPCISD::BDNZ, dl, MVT::Other,
                         N->getOperand(0), Target);
    }
    break;
  }
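
  // In short: a brcond on @llvm.ppc.is.decremented.ctr.nonzero is turned
  // into a PPCISD::BDNZ node, which selects to the bdnz
  // decrement-CTR-and-branch instruction.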
|
Implement an important entry from README_ALTIVEC:
If an altivec predicate compare is used immediately by a branch, don't
use a (serializing) MFCR instruction to read the CR6 register, which requires
a compare to get it back to CR's. Instead, just branch on CR6 directly. :)
For example, for:
void foo2(vector float *A, vector float *B) {
if (!vec_any_eq(*A, *B))
*B = (vector float){0,0,0,0};
}
We now generate:
_foo2:
mfspr r2, 256
oris r5, r2, 12288
mtspr 256, r5
lvx v2, 0, r4
lvx v3, 0, r3
vcmpeqfp. v2, v3, v2
bne cr6, LBB1_2 ; UnifiedReturnBlock
LBB1_1: ; cond_true
vxor v2, v2, v2
stvx v2, 0, r4
mtspr 256, r2
blr
LBB1_2: ; UnifiedReturnBlock
mtspr 256, r2
blr
instead of:
_foo2:
mfspr r2, 256
oris r5, r2, 12288
mtspr 256, r5
lvx v2, 0, r4
lvx v3, 0, r3
vcmpeqfp. v2, v3, v2
mfcr r3, 2
rlwinm r3, r3, 27, 31, 31
cmpwi cr0, r3, 0
beq cr0, LBB1_2 ; UnifiedReturnBlock
LBB1_1: ; cond_true
vxor v2, v2, v2
stvx v2, 0, r4
mtspr 256, r2
blr
LBB1_2: ; UnifiedReturnBlock
mtspr 256, r2
blr
This implements CodeGen/PowerPC/vec_br_cmp.ll.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@27804 91177308-0d34-0410-b5e6-96231b3b80d8
2006-04-18 17:59:36 +00:00
|
|
|
case ISD::BR_CC: {
|
|
|
|
// If this is a branch on an altivec predicate comparison, lower this so
|
[PowerPC] Always use mfocrf if available
When accessing just a single CR register, it is always preferable to
use mfocrf instead of mfcr, if the former is available on the CPU.
Current code makes that distinction in many, but not all places
where a single CR register value is retrieved. One missing
location is PPCRegisterInfo::lowerCRSpilling.
To fix this and make this simpler in the future, this patch changes
the bulk of the back-end to always assume mfocrf is available and
simply generate it when needed.
On machines that actually do not support mfocrf, the instruction
is replaced by mfcr at the very end, in EmitInstruction.
This has the additional benefit that we no longer need the
MFCRpseud hack, since before EmitInstruction we always have
a MFOCRF instruction pattern, which already models data flow
as required.
The patch also adds the MFOCRF8 version of the instruction,
which was missing so far.
Except for the PPCRegisterInfo::lowerCRSpilling case, no change
in generated code intended.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@185556 91177308-0d34-0410-b5e6-96231b3b80d8
2013-07-03 17:05:42 +00:00
|
|
|
// that we don't have to do a MFOCRF: instead, branch directly on CR6. This
|
Implement an important entry from README_ALTIVEC:
If an altivec predicate compare is used immediately by a branch, don't
use a (serializing) MFCR instruction to read the CR6 register, which requires
a compare to get it back to CR's. Instead, just branch on CR6 directly. :)
For example, for:
void foo2(vector float *A, vector float *B) {
if (!vec_any_eq(*A, *B))
*B = (vector float){0,0,0,0};
}
We now generate:
_foo2:
mfspr r2, 256
oris r5, r2, 12288
mtspr 256, r5
lvx v2, 0, r4
lvx v3, 0, r3
vcmpeqfp. v2, v3, v2
bne cr6, LBB1_2 ; UnifiedReturnBlock
LBB1_1: ; cond_true
vxor v2, v2, v2
stvx v2, 0, r4
mtspr 256, r2
blr
LBB1_2: ; UnifiedReturnBlock
mtspr 256, r2
blr
instead of:
_foo2:
mfspr r2, 256
oris r5, r2, 12288
mtspr 256, r5
lvx v2, 0, r4
lvx v3, 0, r3
vcmpeqfp. v2, v3, v2
mfcr r3, 2
rlwinm r3, r3, 27, 31, 31
cmpwi cr0, r3, 0
beq cr0, LBB1_2 ; UnifiedReturnBlock
LBB1_1: ; cond_true
vxor v2, v2, v2
stvx v2, 0, r4
mtspr 256, r2
blr
LBB1_2: ; UnifiedReturnBlock
mtspr 256, r2
blr
This implements CodeGen/PowerPC/vec_br_cmp.ll.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@27804 91177308-0d34-0410-b5e6-96231b3b80d8
2006-04-18 17:59:36 +00:00
|
|
|
// lowering is done pre-legalize, because the legalizer lowers the predicate
|
|
|
|
// compare down to code that is difficult to reassemble.
|
|
|
|
ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
|
2008-07-27 21:46:04 +00:00
|
|
|
SDValue LHS = N->getOperand(2), RHS = N->getOperand(3);
|
Implement PPC counter loops as a late IR-level pass
The old PPCCTRLoops pass, like the Hexagon pass version from which it was
derived, could only handle some simple loops in canonical form. We cannot
directly adapt the new Hexagon hardware loops pass, however, because the
Hexagon pass contains a fundamental assumption that non-constant-trip-count
loops will contain a guard, and this is not always true (the result being that
incorrect negative counts can be generated). With this commit, we replace the
pass with a late IR-level pass which makes use of SE to calculate the
backedge-taken counts and safely generate the loop-count expressions (including
any necessary max() parts). This IR level pass inserts custom intrinsics that
are lowered into the desired decrement-and-branch instructions.
The most fragile part of this new implementation is that interfering uses of
the counter register must be detected on the IR level (and, on PPC, this also
includes any indirect branches in addition to function calls). Also, to make
all of this work, we need a variant of the mtctr instruction that is marked
as having side effects. Without this, machine-code level CSE, DCE, etc.
illegally transform the resulting code. Hopefully, this can be improved
in the future.
This new pass is smaller than the original (and much smaller than the new
Hexagon hardware loops pass), and can handle many additional cases correctly.
In addition, the preheader-creation code has been copied from LoopSimplify, and
after we decide on where it belongs, this code will be refactored so that it
can be explicitly shared (making this implementation even smaller).
The new test-case files ctrloop-{le,lt,ne}.ll have been adapted from tests for
the new Hexagon pass. There are a few classes of loops that this pass does not
transform (noted by FIXMEs in the files), but these deficiencies can be
addressed within the SE infrastructure (thus helping many other passes as well).
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@181927 91177308-0d34-0410-b5e6-96231b3b80d8
2013-05-15 21:37:41 +00:00
|
|
|
|
|
|
|
// Sometimes the promoted value of the intrinsic is ANDed by some non-zero
|
|
|
|
// value. If so, pass-through the AND to get to the intrinsic.
|
|
|
|
if (LHS.getOpcode() == ISD::AND &&
|
|
|
|
LHS.getOperand(0).getOpcode() == ISD::INTRINSIC_W_CHAIN &&
|
|
|
|
cast<ConstantSDNode>(LHS.getOperand(0).getOperand(1))->getZExtValue() ==
|
|
|
|
Intrinsic::ppc_is_decremented_ctr_nonzero &&
|
|
|
|
isa<ConstantSDNode>(LHS.getOperand(1)) &&
|
|
|
|
!cast<ConstantSDNode>(LHS.getOperand(1))->getConstantIntValue()->
|
|
|
|
isZero())
|
|
|
|
LHS = LHS.getOperand(0);
|
|
|
|
|
|
|
|
if (LHS.getOpcode() == ISD::INTRINSIC_W_CHAIN &&
|
|
|
|
cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() ==
|
|
|
|
Intrinsic::ppc_is_decremented_ctr_nonzero &&
|
|
|
|
isa<ConstantSDNode>(RHS)) {
|
|
|
|
assert((CC == ISD::SETEQ || CC == ISD::SETNE) &&
|
|
|
|
"Counter decrement comparison is not EQ or NE");
|
|
|
|
|
|
|
|
unsigned Val = cast<ConstantSDNode>(RHS)->getZExtValue();
|
|
|
|
bool isBDNZ = (CC == ISD::SETEQ && Val) ||
|
|
|
|
(CC == ISD::SETNE && !Val);
|
|
|
|
|
|
|
|
// We now need to make the intrinsic dead (it cannot be instruction
|
|
|
|
// selected).
|
|
|
|
DAG.ReplaceAllUsesOfValueWith(LHS.getValue(1), LHS.getOperand(0));
|
|
|
|
assert(LHS.getNode()->hasOneUse() &&
|
|
|
|
"Counter decrement has more than one use");
|
|
|
|
|
|
|
|
return DAG.getNode(isBDNZ ? PPCISD::BDNZ : PPCISD::BDZ, dl, MVT::Other,
|
|
|
|
N->getOperand(0), N->getOperand(4));
|
|
|
|
}
|
|
|
|
|
Implement an important entry from README_ALTIVEC:
If an altivec predicate compare is used immediately by a branch, don't
use a (serializing) MFCR instruction to read the CR6 register, which requires
a compare to get it back to CR's. Instead, just branch on CR6 directly. :)
For example, for:
void foo2(vector float *A, vector float *B) {
if (!vec_any_eq(*A, *B))
*B = (vector float){0,0,0,0};
}
We now generate:
_foo2:
mfspr r2, 256
oris r5, r2, 12288
mtspr 256, r5
lvx v2, 0, r4
lvx v3, 0, r3
vcmpeqfp. v2, v3, v2
bne cr6, LBB1_2 ; UnifiedReturnBlock
LBB1_1: ; cond_true
vxor v2, v2, v2
stvx v2, 0, r4
mtspr 256, r2
blr
LBB1_2: ; UnifiedReturnBlock
mtspr 256, r2
blr
instead of:
_foo2:
mfspr r2, 256
oris r5, r2, 12288
mtspr 256, r5
lvx v2, 0, r4
lvx v3, 0, r3
vcmpeqfp. v2, v3, v2
mfcr r3, 2
rlwinm r3, r3, 27, 31, 31
cmpwi cr0, r3, 0
beq cr0, LBB1_2 ; UnifiedReturnBlock
LBB1_1: ; cond_true
vxor v2, v2, v2
stvx v2, 0, r4
mtspr 256, r2
blr
LBB1_2: ; UnifiedReturnBlock
mtspr 256, r2
blr
This implements CodeGen/PowerPC/vec_br_cmp.ll.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@27804 91177308-0d34-0410-b5e6-96231b3b80d8
2006-04-18 17:59:36 +00:00
|
|
|
int CompareOpc;
|
|
|
|
bool isDot;
|
2009-02-17 22:15:04 +00:00
|
|
|
|
Implement an important entry from README_ALTIVEC:
If an altivec predicate compare is used immediately by a branch, don't
use a (serializing) MFCR instruction to read the CR6 register, which requires
a compare to get it back to CR's. Instead, just branch on CR6 directly. :)
For example, for:
void foo2(vector float *A, vector float *B) {
if (!vec_any_eq(*A, *B))
*B = (vector float){0,0,0,0};
}
We now generate:
_foo2:
mfspr r2, 256
oris r5, r2, 12288
mtspr 256, r5
lvx v2, 0, r4
lvx v3, 0, r3
vcmpeqfp. v2, v3, v2
bne cr6, LBB1_2 ; UnifiedReturnBlock
LBB1_1: ; cond_true
vxor v2, v2, v2
stvx v2, 0, r4
mtspr 256, r2
blr
LBB1_2: ; UnifiedReturnBlock
mtspr 256, r2
blr
instead of:
_foo2:
mfspr r2, 256
oris r5, r2, 12288
mtspr 256, r5
lvx v2, 0, r4
lvx v3, 0, r3
vcmpeqfp. v2, v3, v2
mfcr r3, 2
rlwinm r3, r3, 27, 31, 31
cmpwi cr0, r3, 0
beq cr0, LBB1_2 ; UnifiedReturnBlock
LBB1_1: ; cond_true
vxor v2, v2, v2
stvx v2, 0, r4
mtspr 256, r2
blr
LBB1_2: ; UnifiedReturnBlock
mtspr 256, r2
blr
This implements CodeGen/PowerPC/vec_br_cmp.ll.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@27804 91177308-0d34-0410-b5e6-96231b3b80d8
2006-04-18 17:59:36 +00:00
|
|
|
if (LHS.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
|
|
|
|
isa<ConstantSDNode>(RHS) && (CC == ISD::SETEQ || CC == ISD::SETNE) &&
|
|
|
|
getAltivecCompareInfo(LHS, CompareOpc, isDot)) {
|
|
|
|
assert(isDot && "Can't compare against a vector result!");
|
2009-02-17 22:15:04 +00:00
|
|
|
|
Implement an important entry from README_ALTIVEC:
If an altivec predicate compare is used immediately by a branch, don't
use a (serializing) MFCR instruction to read the CR6 register, which requires
a compare to get it back to CR's. Instead, just branch on CR6 directly. :)
For example, for:
void foo2(vector float *A, vector float *B) {
if (!vec_any_eq(*A, *B))
*B = (vector float){0,0,0,0};
}
We now generate:
_foo2:
mfspr r2, 256
oris r5, r2, 12288
mtspr 256, r5
lvx v2, 0, r4
lvx v3, 0, r3
vcmpeqfp. v2, v3, v2
bne cr6, LBB1_2 ; UnifiedReturnBlock
LBB1_1: ; cond_true
vxor v2, v2, v2
stvx v2, 0, r4
mtspr 256, r2
blr
LBB1_2: ; UnifiedReturnBlock
mtspr 256, r2
blr
instead of:
_foo2:
mfspr r2, 256
oris r5, r2, 12288
mtspr 256, r5
lvx v2, 0, r4
lvx v3, 0, r3
vcmpeqfp. v2, v3, v2
mfcr r3, 2
rlwinm r3, r3, 27, 31, 31
cmpwi cr0, r3, 0
beq cr0, LBB1_2 ; UnifiedReturnBlock
LBB1_1: ; cond_true
vxor v2, v2, v2
stvx v2, 0, r4
mtspr 256, r2
blr
LBB1_2: ; UnifiedReturnBlock
mtspr 256, r2
blr
This implements CodeGen/PowerPC/vec_br_cmp.ll.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@27804 91177308-0d34-0410-b5e6-96231b3b80d8
2006-04-18 17:59:36 +00:00
|
|
|
// If this is a comparison against something other than 0/1, then we know
|
|
|
|
// that the condition is never/always true.
|
2008-09-12 16:56:44 +00:00
|
|
|
unsigned Val = cast<ConstantSDNode>(RHS)->getZExtValue();
|
Implement an important entry from README_ALTIVEC:
If an altivec predicate compare is used immediately by a branch, don't
use a (serializing) MFCR instruction to read the CR6 register, which requires
a compare to get it back to CR's. Instead, just branch on CR6 directly. :)
For example, for:
void foo2(vector float *A, vector float *B) {
if (!vec_any_eq(*A, *B))
*B = (vector float){0,0,0,0};
}
We now generate:
_foo2:
mfspr r2, 256
oris r5, r2, 12288
mtspr 256, r5
lvx v2, 0, r4
lvx v3, 0, r3
vcmpeqfp. v2, v3, v2
bne cr6, LBB1_2 ; UnifiedReturnBlock
LBB1_1: ; cond_true
vxor v2, v2, v2
stvx v2, 0, r4
mtspr 256, r2
blr
LBB1_2: ; UnifiedReturnBlock
mtspr 256, r2
blr
instead of:
_foo2:
mfspr r2, 256
oris r5, r2, 12288
mtspr 256, r5
lvx v2, 0, r4
lvx v3, 0, r3
vcmpeqfp. v2, v3, v2
mfcr r3, 2
rlwinm r3, r3, 27, 31, 31
cmpwi cr0, r3, 0
beq cr0, LBB1_2 ; UnifiedReturnBlock
LBB1_1: ; cond_true
vxor v2, v2, v2
stvx v2, 0, r4
mtspr 256, r2
blr
LBB1_2: ; UnifiedReturnBlock
mtspr 256, r2
blr
This implements CodeGen/PowerPC/vec_br_cmp.ll.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@27804 91177308-0d34-0410-b5e6-96231b3b80d8
2006-04-18 17:59:36 +00:00
|
|
|
if (Val != 0 && Val != 1) {
|
|
|
|
if (CC == ISD::SETEQ) // Cond never true, remove branch.
|
|
|
|
return N->getOperand(0);
|
|
|
|
// Always !=, turn it into an unconditional branch.
|
2009-08-11 20:47:22 +00:00
|
|
|
return DAG.getNode(ISD::BR, dl, MVT::Other,
|
Implement an important entry from README_ALTIVEC:
If an altivec predicate compare is used immediately by a branch, don't
use a (serializing) MFCR instruction to read the CR6 register, which requires
a compare to get it back to CR's. Instead, just branch on CR6 directly. :)
For example, for:
void foo2(vector float *A, vector float *B) {
if (!vec_any_eq(*A, *B))
*B = (vector float){0,0,0,0};
}
We now generate:
_foo2:
mfspr r2, 256
oris r5, r2, 12288
mtspr 256, r5
lvx v2, 0, r4
lvx v3, 0, r3
vcmpeqfp. v2, v3, v2
bne cr6, LBB1_2 ; UnifiedReturnBlock
LBB1_1: ; cond_true
vxor v2, v2, v2
stvx v2, 0, r4
mtspr 256, r2
blr
LBB1_2: ; UnifiedReturnBlock
mtspr 256, r2
blr
instead of:
_foo2:
mfspr r2, 256
oris r5, r2, 12288
mtspr 256, r5
lvx v2, 0, r4
lvx v3, 0, r3
vcmpeqfp. v2, v3, v2
mfcr r3, 2
rlwinm r3, r3, 27, 31, 31
cmpwi cr0, r3, 0
beq cr0, LBB1_2 ; UnifiedReturnBlock
LBB1_1: ; cond_true
vxor v2, v2, v2
stvx v2, 0, r4
mtspr 256, r2
blr
LBB1_2: ; UnifiedReturnBlock
mtspr 256, r2
blr
This implements CodeGen/PowerPC/vec_br_cmp.ll.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@27804 91177308-0d34-0410-b5e6-96231b3b80d8
2006-04-18 17:59:36 +00:00
|
|
|
N->getOperand(0), N->getOperand(4));
|
|
|
|
}
|
2009-02-17 22:15:04 +00:00
|
|
|
|
Implement an important entry from README_ALTIVEC:
If an altivec predicate compare is used immediately by a branch, don't
use a (serializing) MFCR instruction to read the CR6 register, which requires
a compare to get it back to CR's. Instead, just branch on CR6 directly. :)
For example, for:
void foo2(vector float *A, vector float *B) {
if (!vec_any_eq(*A, *B))
*B = (vector float){0,0,0,0};
}
We now generate:
_foo2:
mfspr r2, 256
oris r5, r2, 12288
mtspr 256, r5
lvx v2, 0, r4
lvx v3, 0, r3
vcmpeqfp. v2, v3, v2
bne cr6, LBB1_2 ; UnifiedReturnBlock
LBB1_1: ; cond_true
vxor v2, v2, v2
stvx v2, 0, r4
mtspr 256, r2
blr
LBB1_2: ; UnifiedReturnBlock
mtspr 256, r2
blr
instead of:
_foo2:
mfspr r2, 256
oris r5, r2, 12288
mtspr 256, r5
lvx v2, 0, r4
lvx v3, 0, r3
vcmpeqfp. v2, v3, v2
mfcr r3, 2
rlwinm r3, r3, 27, 31, 31
cmpwi cr0, r3, 0
beq cr0, LBB1_2 ; UnifiedReturnBlock
LBB1_1: ; cond_true
vxor v2, v2, v2
stvx v2, 0, r4
mtspr 256, r2
blr
LBB1_2: ; UnifiedReturnBlock
mtspr 256, r2
blr
This implements CodeGen/PowerPC/vec_br_cmp.ll.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@27804 91177308-0d34-0410-b5e6-96231b3b80d8
2006-04-18 17:59:36 +00:00
|
|
|
bool BranchOnWhenPredTrue = (CC == ISD::SETEQ) ^ (Val == 0);
|
2009-02-17 22:15:04 +00:00
|
|
|
|
Implement an important entry from README_ALTIVEC:
If an altivec predicate compare is used immediately by a branch, don't
use a (serializing) MFCR instruction to read the CR6 register, which requires
a compare to get it back to CR's. Instead, just branch on CR6 directly. :)
For example, for:
void foo2(vector float *A, vector float *B) {
if (!vec_any_eq(*A, *B))
*B = (vector float){0,0,0,0};
}
We now generate:
_foo2:
mfspr r2, 256
oris r5, r2, 12288
mtspr 256, r5
lvx v2, 0, r4
lvx v3, 0, r3
vcmpeqfp. v2, v3, v2
bne cr6, LBB1_2 ; UnifiedReturnBlock
LBB1_1: ; cond_true
vxor v2, v2, v2
stvx v2, 0, r4
mtspr 256, r2
blr
LBB1_2: ; UnifiedReturnBlock
mtspr 256, r2
blr
instead of:
_foo2:
mfspr r2, 256
oris r5, r2, 12288
mtspr 256, r5
lvx v2, 0, r4
lvx v3, 0, r3
vcmpeqfp. v2, v3, v2
mfcr r3, 2
rlwinm r3, r3, 27, 31, 31
cmpwi cr0, r3, 0
beq cr0, LBB1_2 ; UnifiedReturnBlock
LBB1_1: ; cond_true
vxor v2, v2, v2
stvx v2, 0, r4
mtspr 256, r2
blr
LBB1_2: ; UnifiedReturnBlock
mtspr 256, r2
blr
This implements CodeGen/PowerPC/vec_br_cmp.ll.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@27804 91177308-0d34-0410-b5e6-96231b3b80d8
2006-04-18 17:59:36 +00:00
|
|
|
// Create the PPCISD altivec 'dot' comparison node.
|
2008-07-27 21:46:04 +00:00
|
|
|
SDValue Ops[] = {
|
2006-08-11 17:18:05 +00:00
|
|
|
LHS.getOperand(2), // LHS of compare
|
|
|
|
LHS.getOperand(3), // RHS of compare
|
2009-08-11 20:47:22 +00:00
|
|
|
DAG.getConstant(CompareOpc, MVT::i32)
|
2006-08-11 17:18:05 +00:00
|
|
|
};
|
2013-03-07 20:33:29 +00:00
|
|
|
EVT VTs[] = { LHS.getOperand(2).getValueType(), MVT::Glue };
|
2014-04-26 18:35:24 +00:00
|
|
|
SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops);
|
2009-02-17 22:15:04 +00:00
|
|
|
|
Implement an important entry from README_ALTIVEC:
If an altivec predicate compare is used immediately by a branch, don't
use a (serializing) MFCR instruction to read the CR6 register, which requires
a compare to get it back to CR's. Instead, just branch on CR6 directly. :)
For example, for:
void foo2(vector float *A, vector float *B) {
if (!vec_any_eq(*A, *B))
*B = (vector float){0,0,0,0};
}
We now generate:
_foo2:
mfspr r2, 256
oris r5, r2, 12288
mtspr 256, r5
lvx v2, 0, r4
lvx v3, 0, r3
vcmpeqfp. v2, v3, v2
bne cr6, LBB1_2 ; UnifiedReturnBlock
LBB1_1: ; cond_true
vxor v2, v2, v2
stvx v2, 0, r4
mtspr 256, r2
blr
LBB1_2: ; UnifiedReturnBlock
mtspr 256, r2
blr
instead of:
_foo2:
mfspr r2, 256
oris r5, r2, 12288
mtspr 256, r5
lvx v2, 0, r4
lvx v3, 0, r3
vcmpeqfp. v2, v3, v2
mfcr r3, 2
rlwinm r3, r3, 27, 31, 31
cmpwi cr0, r3, 0
beq cr0, LBB1_2 ; UnifiedReturnBlock
LBB1_1: ; cond_true
vxor v2, v2, v2
stvx v2, 0, r4
mtspr 256, r2
blr
LBB1_2: ; UnifiedReturnBlock
mtspr 256, r2
blr
This implements CodeGen/PowerPC/vec_br_cmp.ll.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@27804 91177308-0d34-0410-b5e6-96231b3b80d8
2006-04-18 17:59:36 +00:00
|
|
|
// Unpack the result based on how the target uses it.
|
2006-11-17 22:10:59 +00:00
|
|
|
PPC::Predicate CompOpc;
|
2008-09-12 16:56:44 +00:00
|
|
|
switch (cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue()) {
|
Implement an important entry from README_ALTIVEC:
If an altivec predicate compare is used immediately by a branch, don't
use a (serializing) MFCR instruction to read the CR6 register, which requires
a compare to get it back to CR's. Instead, just branch on CR6 directly. :)
For example, for:
void foo2(vector float *A, vector float *B) {
if (!vec_any_eq(*A, *B))
*B = (vector float){0,0,0,0};
}
We now generate:
_foo2:
mfspr r2, 256
oris r5, r2, 12288
mtspr 256, r5
lvx v2, 0, r4
lvx v3, 0, r3
vcmpeqfp. v2, v3, v2
bne cr6, LBB1_2 ; UnifiedReturnBlock
LBB1_1: ; cond_true
vxor v2, v2, v2
stvx v2, 0, r4
mtspr 256, r2
blr
LBB1_2: ; UnifiedReturnBlock
mtspr 256, r2
blr
instead of:
_foo2:
mfspr r2, 256
oris r5, r2, 12288
mtspr 256, r5
lvx v2, 0, r4
lvx v3, 0, r3
vcmpeqfp. v2, v3, v2
mfcr r3, 2
rlwinm r3, r3, 27, 31, 31
cmpwi cr0, r3, 0
beq cr0, LBB1_2 ; UnifiedReturnBlock
LBB1_1: ; cond_true
vxor v2, v2, v2
stvx v2, 0, r4
mtspr 256, r2
blr
LBB1_2: ; UnifiedReturnBlock
mtspr 256, r2
blr
This implements CodeGen/PowerPC/vec_br_cmp.ll.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@27804 91177308-0d34-0410-b5e6-96231b3b80d8
2006-04-18 17:59:36 +00:00
|
|
|
default: // Can't happen, don't crash on invalid number though.
|
|
|
|
case 0: // Branch on the value of the EQ bit of CR6.
|
2006-11-17 22:10:59 +00:00
|
|
|
CompOpc = BranchOnWhenPredTrue ? PPC::PRED_EQ : PPC::PRED_NE;
|
Implement an important entry from README_ALTIVEC:
If an altivec predicate compare is used immediately by a branch, don't
use a (serializing) MFCR instruction to read the CR6 register, which requires
a compare to get it back to CR's. Instead, just branch on CR6 directly. :)
For example, for:
void foo2(vector float *A, vector float *B) {
if (!vec_any_eq(*A, *B))
*B = (vector float){0,0,0,0};
}
We now generate:
_foo2:
mfspr r2, 256
oris r5, r2, 12288
mtspr 256, r5
lvx v2, 0, r4
lvx v3, 0, r3
vcmpeqfp. v2, v3, v2
bne cr6, LBB1_2 ; UnifiedReturnBlock
LBB1_1: ; cond_true
vxor v2, v2, v2
stvx v2, 0, r4
mtspr 256, r2
blr
LBB1_2: ; UnifiedReturnBlock
mtspr 256, r2
blr
instead of:
_foo2:
mfspr r2, 256
oris r5, r2, 12288
mtspr 256, r5
lvx v2, 0, r4
lvx v3, 0, r3
vcmpeqfp. v2, v3, v2
mfcr r3, 2
rlwinm r3, r3, 27, 31, 31
cmpwi cr0, r3, 0
beq cr0, LBB1_2 ; UnifiedReturnBlock
LBB1_1: ; cond_true
vxor v2, v2, v2
stvx v2, 0, r4
mtspr 256, r2
blr
LBB1_2: ; UnifiedReturnBlock
mtspr 256, r2
blr
This implements CodeGen/PowerPC/vec_br_cmp.ll.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@27804 91177308-0d34-0410-b5e6-96231b3b80d8
2006-04-18 17:59:36 +00:00
|
|
|
break;
|
|
|
|
case 1: // Branch on the inverted value of the EQ bit of CR6.
|
2006-11-17 22:10:59 +00:00
|
|
|
CompOpc = BranchOnWhenPredTrue ? PPC::PRED_NE : PPC::PRED_EQ;
|
Implement an important entry from README_ALTIVEC:
If an altivec predicate compare is used immediately by a branch, don't
use a (serializing) MFCR instruction to read the CR6 register, which requires
a compare to get it back to CR's. Instead, just branch on CR6 directly. :)
For example, for:
void foo2(vector float *A, vector float *B) {
if (!vec_any_eq(*A, *B))
*B = (vector float){0,0,0,0};
}
We now generate:
_foo2:
mfspr r2, 256
oris r5, r2, 12288
mtspr 256, r5
lvx v2, 0, r4
lvx v3, 0, r3
vcmpeqfp. v2, v3, v2
bne cr6, LBB1_2 ; UnifiedReturnBlock
LBB1_1: ; cond_true
vxor v2, v2, v2
stvx v2, 0, r4
mtspr 256, r2
blr
LBB1_2: ; UnifiedReturnBlock
mtspr 256, r2
blr
instead of:
_foo2:
mfspr r2, 256
oris r5, r2, 12288
mtspr 256, r5
lvx v2, 0, r4
lvx v3, 0, r3
vcmpeqfp. v2, v3, v2
mfcr r3, 2
rlwinm r3, r3, 27, 31, 31
cmpwi cr0, r3, 0
beq cr0, LBB1_2 ; UnifiedReturnBlock
LBB1_1: ; cond_true
vxor v2, v2, v2
stvx v2, 0, r4
mtspr 256, r2
blr
LBB1_2: ; UnifiedReturnBlock
mtspr 256, r2
blr
This implements CodeGen/PowerPC/vec_br_cmp.ll.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@27804 91177308-0d34-0410-b5e6-96231b3b80d8
2006-04-18 17:59:36 +00:00
|
|
|
break;
|
|
|
|
case 2: // Branch on the value of the LT bit of CR6.
|
2006-11-17 22:10:59 +00:00
|
|
|
CompOpc = BranchOnWhenPredTrue ? PPC::PRED_LT : PPC::PRED_GE;
|
Implement an important entry from README_ALTIVEC:
If an altivec predicate compare is used immediately by a branch, don't
use a (serializing) MFCR instruction to read the CR6 register, which requires
a compare to get it back to CR's. Instead, just branch on CR6 directly. :)
For example, for:
void foo2(vector float *A, vector float *B) {
if (!vec_any_eq(*A, *B))
*B = (vector float){0,0,0,0};
}
We now generate:
_foo2:
mfspr r2, 256
oris r5, r2, 12288
mtspr 256, r5
lvx v2, 0, r4
lvx v3, 0, r3
vcmpeqfp. v2, v3, v2
bne cr6, LBB1_2 ; UnifiedReturnBlock
LBB1_1: ; cond_true
vxor v2, v2, v2
stvx v2, 0, r4
mtspr 256, r2
blr
LBB1_2: ; UnifiedReturnBlock
mtspr 256, r2
blr
instead of:
_foo2:
mfspr r2, 256
oris r5, r2, 12288
mtspr 256, r5
lvx v2, 0, r4
lvx v3, 0, r3
vcmpeqfp. v2, v3, v2
mfcr r3, 2
rlwinm r3, r3, 27, 31, 31
cmpwi cr0, r3, 0
beq cr0, LBB1_2 ; UnifiedReturnBlock
LBB1_1: ; cond_true
vxor v2, v2, v2
stvx v2, 0, r4
mtspr 256, r2
blr
LBB1_2: ; UnifiedReturnBlock
mtspr 256, r2
blr
This implements CodeGen/PowerPC/vec_br_cmp.ll.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@27804 91177308-0d34-0410-b5e6-96231b3b80d8
2006-04-18 17:59:36 +00:00
|
|
|
break;
|
|
|
|
case 3: // Branch on the inverted value of the LT bit of CR6.
|
2006-11-17 22:10:59 +00:00
|
|
|
CompOpc = BranchOnWhenPredTrue ? PPC::PRED_GE : PPC::PRED_LT;
|
      break;
    }

    return DAG.getNode(PPCISD::COND_BRANCH, dl, MVT::Other, N->getOperand(0),
                       DAG.getConstant(CompOpc, MVT::i32),
                       DAG.getRegister(PPC::CR6, MVT::i32),
                       N->getOperand(4), CompNode.getValue(1));
    }
    break;
  }
  }

  return SDValue();
}

//===----------------------------------------------------------------------===//
// Inline Assembly Support
//===----------------------------------------------------------------------===//

void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
                                                      APInt &KnownZero,
                                                      APInt &KnownOne,
                                                      const SelectionDAG &DAG,
                                                      unsigned Depth) const {
  KnownZero = KnownOne = APInt(KnownZero.getBitWidth(), 0);
  switch (Op.getOpcode()) {
  default: break;
  case PPCISD::LBRX: {
    // lhbrx is known to have the top bits cleared out.
    if (cast<VTSDNode>(Op.getOperand(2))->getVT() == MVT::i16)
      KnownZero = 0xFFFF0000;
    break;
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    switch (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()) {
    default: break;
    case Intrinsic::ppc_altivec_vcmpbfp_p:
    case Intrinsic::ppc_altivec_vcmpeqfp_p:
    case Intrinsic::ppc_altivec_vcmpequb_p:
    case Intrinsic::ppc_altivec_vcmpequh_p:
    case Intrinsic::ppc_altivec_vcmpequw_p:
    case Intrinsic::ppc_altivec_vcmpgefp_p:
    case Intrinsic::ppc_altivec_vcmpgtfp_p:
    case Intrinsic::ppc_altivec_vcmpgtsb_p:
    case Intrinsic::ppc_altivec_vcmpgtsh_p:
    case Intrinsic::ppc_altivec_vcmpgtsw_p:
    case Intrinsic::ppc_altivec_vcmpgtub_p:
    case Intrinsic::ppc_altivec_vcmpgtuh_p:
    case Intrinsic::ppc_altivec_vcmpgtuw_p:
      KnownZero = ~1U;  // All bits but the low one are known to be zero.
      break;
    }
  }
  }
}
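The known-bits facts above let the DAG combiner drop redundant masking of the predicate-compare intrinsics' results. A minimal user-side sketch, assuming an Altivec-enabled PPC target (this code is illustrative and not part of this file):

#include <altivec.h>

// Because vcmpeqfp_p (reached via vec_any_eq) is reported above as having
// all bits but the low one known to be zero, the '& 1' mask below can be
// folded away during DAG combining.
int any_equal(vector float a, vector float b) {
  return vec_any_eq(a, b) & 1;  // mask is provably redundant
}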

[PowerPC] Initial support for the VSX instruction set
VSX is an ISA extension supported on the POWER7 and later cores that enhances
floating-point vector and scalar capabilities. Among other things, this adds
<2 x double> support and generally helps to reduce register pressure.
The interesting part of this ISA feature is the register configuration: there
are 64 new 128-bit vector registers, the first 32 of which are super-registers
of the existing 32 scalar floating-point registers, and the second 32 of which
overlap with the 32 Altivec vector registers. This makes things like vector
insertion and extraction tricky: this can be free but only if we force a
restriction to the right register subclass when needed. A new "minipass"
PPCVSXCopy takes care of this (although it could do a more-optimal job of it;
see the comment about unnecessary copies below).
Please note that, currently, VSX is not enabled by default when targeting
anything because it is not yet ready for that. The assembler and disassembler
are fully implemented and tested. However:
- CodeGen support causes miscompiles; test-suite runtime failures:
  MultiSource/Benchmarks/FreeBench/distray/distray
  MultiSource/Benchmarks/McCat/08-main/main
  MultiSource/Benchmarks/Olden/voronoi/voronoi
  MultiSource/Benchmarks/mafft/pairlocalalign
  MultiSource/Benchmarks/tramp3d-v4/tramp3d-v4
  SingleSource/Benchmarks/CoyoteBench/almabench
  SingleSource/Benchmarks/Misc/matmul_f64_4x4
- The lowering currently falls back to using Altivec instructions far more
  than it should. Worse, there are some things that are scalarized through the
  stack that shouldn't be.
- A lot of unnecessary copies make it past the optimizers, and this needs to
  be fixed.
- Many more regression tests are needed.
Normally, I'd fix these things prior to committing, but there are some
students and other contributors who would like to work on this, and so it
makes sense to move this development process upstream where it can be subject
to the regular code-review procedures.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@203768 91177308-0d34-0410-b5e6-96231b3b80d8

/// getConstraintType - Given a constraint, return the type of
/// constraint it is for this target.
PPCTargetLowering::ConstraintType
PPCTargetLowering::getConstraintType(const std::string &Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    default: break;
    case 'b':
    case 'r':
    case 'f':
    case 'v':
    case 'y':
      return C_RegisterClass;
    case 'Z':
      // FIXME: While Z does indicate a memory constraint, it specifically
      // indicates an r+r address (used in conjunction with the 'y' modifier
      // in the replacement string). Currently, we're forcing the base
      // register to be r0 in the asm printer (which is interpreted as zero)
      // and forming the complete address in the second register. This is
      // suboptimal.
      return C_Memory;
    }
  } else if (Constraint == "wc") { // individual CR bits.
    return C_RegisterClass;
  } else if (Constraint == "wa" || Constraint == "wd" ||
             Constraint == "wf" || Constraint == "ws") {
    return C_RegisterClass; // VSX registers.
  }
  return TargetLowering::getConstraintType(Constraint);
}
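For reference, a hedged user-level sketch of how these constraint letters surface in GNU inline asm (hypothetical code, not from this file): 'r' requests any GPR, 'b' a base GPR excluding r0, and 'f' a floating-point register.

// Hypothetical examples exercising the single-letter constraints above.
unsigned add_example(unsigned a, unsigned b) {
  unsigned r;
  __asm__("add %0, %1, %2" : "=r"(r) : "b"(a), "r"(b));  // 'b': no r0
  return r;
}

double fadd_example(double a, double b) {
  double r;
  __asm__("fadd %0, %1, %2" : "=f"(r) : "f"(a), "f"(b)); // 'f': FPR operands
  return r;
}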

/// Examine constraint type and operand type and determine a weight value.
/// This object must already have been set up with the operand type
/// and the current alternative constraint selected.
TargetLowering::ConstraintWeight
PPCTargetLowering::getSingleConstraintMatchWeight(
    AsmOperandInfo &info, const char *constraint) const {
  ConstraintWeight weight = CW_Invalid;
  Value *CallOperandVal = info.CallOperandVal;
  // If we don't have a value, we can't do a match,
  // but allow it at the lowest weight.
  if (!CallOperandVal)
    return CW_Default;
  Type *type = CallOperandVal->getType();

  // Look at the constraint type.
  if (StringRef(constraint) == "wc" && type->isIntegerTy(1))
    return CW_Register; // an individual CR bit.
  else if ((StringRef(constraint) == "wa" ||
            StringRef(constraint) == "wd" ||
            StringRef(constraint) == "wf") &&
           type->isVectorTy())
    return CW_Register;
  else if (StringRef(constraint) == "ws" && type->isDoubleTy())
    return CW_Register;

  switch (*constraint) {
  default:
    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
    break;
  case 'b':
    if (type->isIntegerTy())
      weight = CW_Register;
    break;
  case 'f':
    if (type->isFloatTy())
      weight = CW_Register;
    break;
  case 'd':
    if (type->isDoubleTy())
      weight = CW_Register;
    break;
  case 'v':
    if (type->isVectorTy())
      weight = CW_Register;
    break;
  case 'y':
    weight = CW_Register;
    break;
  case 'Z':
    weight = CW_Memory;
    break;
  }
  return weight;
}

Prepare to make r0 an allocatable register on PPC
Currently the PPC r0 register is unconditionally reserved. There are two reasons
for this:
1. r0 is treated specially (as the constant 0) by certain instructions, and so
cannot be used with those instructions as a regular register.
2. r0 is used as a temporary register in the CR-register spilling process
(where, under some circumstances, we require two GPRs).
This change addresses the first reason by introducing a restricted register
class (without r0) for use by those instructions that treat r0 specially. These
register classes have a new pseudo-register, ZERO, which represents the r0-as-0
use. This has the side benefit of making the existing target code simpler (and
easier to understand), and will make it clear to the register allocator that
uses of r0 as 0 don't conflict with real uses of the r0 register.
Once the CR spilling code is improved, we'll be able to allocate r0.
Adding these extra register classes, for some reason unclear to me, causes
requests to the target to copy 32-bit registers to 64-bit registers. The
resulting code seems correct (and causes no test-suite failures), and the new
test case covers this new kind of asymmetric copy.
As r0 is still reserved, no functionality change intended.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@177423 91177308-0d34-0410-b5e6-96231b3b80d8

std::pair<unsigned, const TargetRegisterClass*>
PPCTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
                                                MVT VT) const {
  if (Constraint.size() == 1) {
    // GCC RS6000 Constraint Letters
    switch (Constraint[0]) {
    case 'b':   // R1-R31
      if (VT == MVT::i64 && Subtarget.isPPC64())
        return std::make_pair(0U, &PPC::G8RC_NOX0RegClass);
      return std::make_pair(0U, &PPC::GPRC_NOR0RegClass);
    case 'r':   // R0-R31
      if (VT == MVT::i64 && Subtarget.isPPC64())
        return std::make_pair(0U, &PPC::G8RCRegClass);
      return std::make_pair(0U, &PPC::GPRCRegClass);
    case 'f':
      if (VT == MVT::f32 || VT == MVT::i32)
        return std::make_pair(0U, &PPC::F4RCRegClass);
      if (VT == MVT::f64 || VT == MVT::i64)
        return std::make_pair(0U, &PPC::F8RCRegClass);
      break;
    case 'v':
      return std::make_pair(0U, &PPC::VRRCRegClass);
    case 'y':   // crrc
      return std::make_pair(0U, &PPC::CRRCRegClass);
    }
  } else if (Constraint == "wc") { // an individual CR bit.
    return std::make_pair(0U, &PPC::CRBITRCRegClass);
} else if (Constraint == "wa" || Constraint == "wd" ||
|
2014-03-29 05:29:01 +00:00
|
|
|
Constraint == "wf") {
|
return std::make_pair(0U, &PPC::VSRCRegClass);
|
2014-03-29 05:29:01 +00:00
|
|
|
} else if (Constraint == "ws") {
|
|
|
|
return std::make_pair(0U, &PPC::VSFRCRegClass);
|
2006-01-31 19:20:21 +00:00
|
|
|
}
|
2009-02-17 22:15:04 +00:00
|
|
|
|
2013-08-03 12:25:10 +00:00
|
|
|
std::pair<unsigned, const TargetRegisterClass*> R =
|
|
|
|
TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
|
|
|
|
|
|
|
|
// r[0-9]+ are used, on PPC64, to refer to the corresponding 64-bit registers
|
|
|
|
// (which we call X[0-9]+). If a 64-bit value has been requested, and a
|
|
|
|
// 32-bit GPR has been selected, then 'upgrade' it to the 64-bit parent
|
|
|
|
// register.
|
|
|
|
// FIXME: If TargetLowering::getRegForInlineAsmConstraint could somehow use
|
|
|
|
// the AsmName field from *RegisterInfo.td, then this would not be necessary.
|
2014-06-12 22:38:18 +00:00
|
|
|
if (R.first && VT == MVT::i64 && Subtarget.isPPC64() &&
|
2013-08-03 12:25:10 +00:00
|
|
|
PPC::GPRCRegClass.contains(R.first)) {
|
|
|
|
const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
|
|
|
|
return std::make_pair(TRI->getMatchingSuperReg(R.first,
|
2013-08-14 20:05:04 +00:00
|
|
|
PPC::sub_32, &PPC::G8RCRegClass),
|
2013-08-03 12:25:10 +00:00
|
|
|
&PPC::G8RCRegClass);
|
|
|
|
}
|
|
|
|
|
|
|
|
return R;
|
2006-01-31 19:20:21 +00:00
|
|
|
}
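A hedged sketch of the r-to-X 'upgrade' described in the comment above (hypothetical user code): on a 64-bit target, a 64-bit operand constrained with "r" must land in a 64-bit X super-register rather than the 32-bit GPR the generic lookup returns.

// Hypothetical PPC64 user code: 'x' is 64 bits wide, so the "r" operands
// below must resolve to X registers; the getMatchingSuperReg() call above
// performs exactly that widening of the generic lookup's answer.
long long shift_left_one(long long x) {
  long long r;
  __asm__("sldi %0, %1, 1" : "=r"(r) : "r"(x));  // 64-bit shift-left
  return r;
}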

/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector. If it is invalid, don't add anything to Ops.
void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                     std::string &Constraint,
                                                     std::vector<SDValue>&Ops,
                                                     SelectionDAG &DAG) const {
  SDValue Result;

  // Only support length 1 constraints.
  if (Constraint.length() > 1) return;

  char Letter = Constraint[0];
  switch (Letter) {
  default: break;
  case 'I':
  case 'J':
  case 'K':
  case 'L':
  case 'M':
  case 'N':
  case 'O':
  case 'P': {
    ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op);
    if (!CST) return; // Must be an immediate to match.
    unsigned Value = CST->getZExtValue();
    switch (Letter) {
    default: llvm_unreachable("Unknown constraint letter!");
    case 'I':  // "I" is a signed 16-bit constant.
      if ((short)Value == (int)Value)
        Result = DAG.getTargetConstant(Value, Op.getValueType());
      break;
    case 'J':  // "J" is a constant with only the high-order 16 bits nonzero.
    case 'L':  // "L" is a signed 16-bit constant shifted left 16 bits.
      if ((short)Value == 0)
        Result = DAG.getTargetConstant(Value, Op.getValueType());
      break;
    case 'K':  // "K" is a constant with only the low-order 16 bits nonzero.
      if ((Value >> 16) == 0)
        Result = DAG.getTargetConstant(Value, Op.getValueType());
      break;
    case 'M':  // "M" is a constant that is greater than 31.
      if (Value > 31)
        Result = DAG.getTargetConstant(Value, Op.getValueType());
      break;
    case 'N':  // "N" is a positive constant that is an exact power of two.
      if ((int)Value > 0 && isPowerOf2_32(Value))
        Result = DAG.getTargetConstant(Value, Op.getValueType());
      break;
    case 'O':  // "O" is the constant zero.
      if (Value == 0)
        Result = DAG.getTargetConstant(Value, Op.getValueType());
      break;
    case 'P':  // "P" is a constant whose negation is a signed 16-bit constant.
      if ((short)-Value == (int)-Value)
        Result = DAG.getTargetConstant(Value, Op.getValueType());
      break;
    }
    break;
  }
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }

  // Handle standard constraint letters.
  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
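A hedged user-level sketch of the immediate constraints handled above (hypothetical code): 'I' accepts a signed 16-bit literal and 'K' an unsigned 16-bit literal, matching the addi and ori immediate fields.

// Hypothetical examples: the literals satisfy the 'I' (signed 16-bit) and
// 'K' (low-order 16 bits only) checks above, so they are emitted as
// immediate operands instead of being materialized in registers.
int add_sixteen(int a) {
  int r;
  __asm__("addi %0, %1, %2" : "=r"(r) : "b"(a), "I"(16));
  return r;
}

unsigned or_mask(unsigned a) {
  unsigned r;
  __asm__("ori %0, %1, %2" : "=r"(r) : "r"(a), "K"(0xFF));
  return r;
}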

// isLegalAddressingMode - Return true if the addressing mode represented
// by AM is legal for this target, for a load/store of the specified type.
bool PPCTargetLowering::isLegalAddressingMode(const AddrMode &AM,
                                              Type *Ty) const {
  // FIXME: PPC does not allow r+i addressing modes for vectors!

  // PPC allows a sign-extended 16-bit immediate field.
  if (AM.BaseOffs <= -(1LL << 16) || AM.BaseOffs >= (1LL << 16)-1)
    return false;

  // No global is ever allowed as a base.
  if (AM.BaseGV)
    return false;

  // PPC only supports r+r,
  switch (AM.Scale) {
  case 0: // "r+i" or just "i", depending on HasBaseReg.
    break;
  case 1:
    if (AM.HasBaseReg && AM.BaseOffs) // "r+r+i" is not allowed.
      return false;
    // Otherwise we have r+r or r+i.
    break;
  case 2:
    if (AM.HasBaseReg || AM.BaseOffs) // 2*r+r or 2*r+i is not allowed.
      return false;
    // Allow 2*r as r+r.
    break;
  default:
    // No other scales are supported.
    return false;
  }

  return true;
}
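To make the accepted modes concrete, a hedged sketch (hypothetical C; the commented asm is what a PPC compiler would typically pick, not guaranteed output): a signed 16-bit displacement maps to a D-form load, a register index to an X-form load, while larger offsets must be materialized.

// Hypothetical illustrations of the addressing modes accepted above:
int load_small_offset(int *p) { return p[1000]; }   // r+i: lwz  r, 4000(p)
int load_indexed(int *p, long i) { return p[i]; }   // r+r: lwzx r, p, i*4
int load_huge_offset(int *p) { return p[100000]; }  // offset > 16 bits:
                                                    // addis + lwz sequence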

SDValue PPCTargetLowering::LowerRETURNADDR(SDValue Op,
                                           SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo *MFI = MF.getFrameInfo();
  MFI->setReturnAddressIsTaken(true);

  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
    return SDValue();

  SDLoc dl(Op);
  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();

  // Make sure the function does not optimize away the store of the RA to
  // the stack.
  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
  FuncInfo->setLRStoreRequired();
  bool isPPC64 = Subtarget.isPPC64();
  bool isDarwinABI = Subtarget.isDarwinABI();

  if (Depth > 0) {
    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
    SDValue Offset =
      DAG.getConstant(PPCFrameLowering::getReturnSaveOffset(isPPC64, isDarwinABI),
                      isPPC64 ? MVT::i64 : MVT::i32);
    return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
                       DAG.getNode(ISD::ADD, dl, getPointerTy(),
                                   FrameAddr, Offset),
                       MachinePointerInfo(), false, false, false, 0);
  }

  // Just load the return address off the stack.
  SDValue RetAddrFI = getReturnAddrFrameIndex(DAG);
  return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
                     RetAddrFI, MachinePointerInfo(), false, false, false, 0);
}

SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op,
                                          SelectionDAG &DAG) const {
  SDLoc dl(Op);
  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();

  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
  bool isPPC64 = PtrVT == MVT::i64;

  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo *MFI = MF.getFrameInfo();
  MFI->setFrameAddressIsTaken(true);

  // Naked functions never have a frame pointer, and so we use r1. For all
  // other functions, this decision must be delayed until during PEI.
  unsigned FrameReg;
  if (MF.getFunction()->getAttributes().hasAttribute(
        AttributeSet::FunctionIndex, Attribute::Naked))
    FrameReg = isPPC64 ? PPC::X1 : PPC::R1;
  else
    FrameReg = isPPC64 ? PPC::FP8 : PPC::FP;

  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg,
                                         PtrVT);
  while (Depth--)
    FrameAddr = DAG.getLoad(Op.getValueType(), dl, DAG.getEntryNode(),
                            FrameAddr, MachinePointerInfo(), false, false,
                            false, 0);
  return FrameAddr;
}
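These two lowerings back the GCC-style builtins; a hedged usage sketch (hypothetical user code):

// LowerRETURNADDR and LowerFRAMEADDR above lower these builtins; depth 0
// reads the current frame, and nonzero depths walk the back-chain via the
// chained loads generated above.
void *who_called_me(void) { return __builtin_return_address(0); }
void *callers_frame(void)  { return __builtin_frame_address(1); }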

// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
unsigned PPCTargetLowering::getRegisterByName(const char* RegName,
                                              EVT VT) const {
  bool isPPC64 = Subtarget.isPPC64();
  bool isDarwinABI = Subtarget.isDarwinABI();

  if ((isPPC64 && VT != MVT::i64 && VT != MVT::i32) ||
      (!isPPC64 && VT != MVT::i32))
    report_fatal_error("Invalid register global variable type");

  bool is64Bit = isPPC64 && VT == MVT::i64;
  unsigned Reg = StringSwitch<unsigned>(RegName)
    .Case("r1", is64Bit ? PPC::X1 : PPC::R1)
    .Case("r2", isDarwinABI ? 0 : (is64Bit ? PPC::X2 : PPC::R2))
    .Case("r13", (!isPPC64 && isDarwinABI) ? 0 :
                   (is64Bit ? PPC::X13 : PPC::R13))
    .Default(0);

  if (Reg)
    return Reg;
  report_fatal_error("Invalid register name global variable");
}
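getRegisterByName serves reads of global named-register variables; a hedged sketch of the source construct it supports (hypothetical code; only r1, r2, and r13 are accepted, per the table above, and compiler support for global register variables varies):

// A global register variable bound to r1 (the stack pointer). Reading it
// becomes an llvm.read_register intrinsic, which is resolved to PPC::R1 or
// PPC::X1 by getRegisterByName above.
register unsigned long sp asm("r1");

unsigned long read_stack_pointer(void) { return sp; }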

bool
PPCTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
  // The PowerPC target isn't yet aware of offsets.
  return false;
}

/// getOptimalMemOpType - Returns the target specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove
/// lowering. If DstAlign is zero, the destination alignment can satisfy any
/// constraint. Similarly, if SrcAlign is zero there is no need to check it
/// against the alignment requirement, probably because the source does not
/// need to be loaded. If 'IsMemset' is true, that means it's expanding a
/// memset. If 'ZeroMemset' is true, that means it's a memset of zero.
/// 'MemcpyStrSrc' indicates whether the memcpy source is constant so it
/// does not need to be loaded.
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size,
                                           unsigned DstAlign, unsigned SrcAlign,
                                           bool IsMemset, bool ZeroMemset,
                                           bool MemcpyStrSrc,
                                           MachineFunction &MF) const {
  if (Subtarget.isPPC64()) {
    return MVT::i64;
  } else {
    return MVT::i32;
  }
}

/// \brief Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
bool PPCTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                                          Type *Ty) const {
  assert(Ty->isIntegerTy());

  unsigned BitSize = Ty->getPrimitiveSizeInBits();
  if (BitSize == 0 || BitSize > 64)
    return false;
  return true;
}

bool PPCTargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
    return false;
  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
  return NumBits1 == 64 && NumBits2 == 32;
}

bool PPCTargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
  if (!VT1.isInteger() || !VT2.isInteger())
    return false;
  unsigned NumBits1 = VT1.getSizeInBits();
  unsigned NumBits2 = VT2.getSizeInBits();
  return NumBits1 == 64 && NumBits2 == 32;
}

bool PPCTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
  return isInt<16>(Imm) || isUInt<16>(Imm);
}

bool PPCTargetLowering::isLegalAddImmediate(int64_t Imm) const {
  return isInt<16>(Imm) || isUInt<16>(Imm);
}
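The 16-bit bounds mirror the immediate fields of cmpwi/cmplwi and addi; a hedged sketch of the distinction (hypothetical code, commented asm is typical rather than guaranteed):

// 12345 passes isInt<16>/isUInt<16>, so the compare and add below can use
// cmpwi/addi directly; 123456 does not, so it must first be materialized
// into a register (lis/ori) before the compare.
int cmp_small(int x) { return x == 12345; }   // cmpwi  x, 12345
int add_small(int x) { return x + 12345; }    // addi   x, x, 12345
int cmp_large(int x) { return x == 123456; }  // lis/ori, then cmpw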

bool PPCTargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
                                                      unsigned,
                                                      bool *Fast) const {
  if (DisablePPCUnaligned)
    return false;

  // PowerPC supports unaligned memory access for simple non-vector types.
  // Although accessing unaligned addresses is not as efficient as accessing
  // aligned addresses, it is generally more efficient than manual expansion,
  // and generally only traps for software emulation when crossing page
  // boundaries.

  if (!VT.isSimple())
    return false;

  if (VT.getSimpleVT().isVector()) {
    if (Subtarget.hasVSX()) {
      if (VT != MVT::v2f64 && VT != MVT::v2i64)
        return false;
    } else {
      return false;
    }
  }

  if (VT == MVT::ppcf128)
    return false;

  if (Fast)
    *Fast = true;

  return true;
}
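A hedged illustration of what this hook permits (hypothetical user code): a misaligned scalar load can be emitted as a single load instruction rather than a byte-wise sequence.

#include <string.h>

// With unaligned access allowed above, this typically becomes a single
// 32-bit load on PPC (possibly slow across a page boundary) instead of
// four byte loads plus shifts.
int load_misaligned(const char *p) {
  int v;
  memcpy(&v, p + 1, sizeof(v));  // deliberately misaligned source
  return v;
}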
AArch64/PowerPC/SystemZ/X86: This patch fixes the interface, usage, and all
in-tree implementations of TargetLoweringBase::isFMAFasterThanMulAndAdd in
order to resolve the following issues with fmuladd (i.e. optional FMA)
intrinsics:
1. On X86(-64) targets, ISD::FMA nodes are formed when lowering fmuladd
intrinsics even if the subtarget does not support FMA instructions, leading
to laughably bad code generation in some situations.
2. On AArch64 targets, ISD::FMA nodes are formed for operations on fp128,
resulting in a call to a software fp128 FMA implementation.
3. On PowerPC targets, FMAs are not generated from fmuladd intrinsics on types
like v2f32, v8f32, v4f64, etc., even though they promote, split, scalarize,
etc. to types that support hardware FMAs.
The function has also been slightly renamed for consistency and to force a
merge/build conflict for any out-of-tree target implementing it. To resolve,
see comments and fixed in-tree examples.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@185956 91177308-0d34-0410-b5e6-96231b3b80d8

bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
  VT = VT.getScalarType();

  if (!VT.isSimple())
    return false;

  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::f32:
  case MVT::f64:
    return true;
  default:
    break;
  }

  return false;
}
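Because f32 and f64 report true here, llvm.fmuladd intrinsics (emitted, for example, under FP-contraction rules) can lower to fused multiply-add instructions; a hedged sketch:

// Hypothetical example: when the front end emits llvm.fmuladd for the
// expression below, the 'true' returned above lets it lower to a single
// fmadd instead of separate fmul and fadd instructions.
double muladd(double a, double b, double c) {
  return a * b + c;  // candidate for fused multiply-add
}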

bool
PPCTargetLowering::shouldExpandBuildVectorWithShuffles(
  EVT VT, unsigned DefinedValues) const {
  if (VT == MVT::v2i64)
    return false;

  return TargetLowering::shouldExpandBuildVectorWithShuffles(VT, DefinedValues);
}

Sched::Preference PPCTargetLowering::getSchedulingPreference(SDNode *N) const {
  if (DisableILPPref || Subtarget.enableMachineScheduler())
    return TargetLowering::getSchedulingPreference(N);

  return Sched::ILP;
}

// Create a fast isel object.
FastISel *
PPCTargetLowering::createFastISel(FunctionLoweringInfo &FuncInfo,
                                  const TargetLibraryInfo *LibInfo) const {
  return PPC::createFastISel(FuncInfo, LibInfo);
}