[AVX] Fix mask predicates for 256-bit UNPCKLPS/D and implement
missing patterns for them.

Add a SIMD test subdirectory to hold tests for SIMD instruction
selection correctness and quality.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@126845 91177308-0d34-0410-b5e6-96231b3b80d8
David Greene 2011-03-02 17:23:43 +00:00
parent 31c488c8bd
commit a20244d1ba
9 changed files with 161 additions and 22 deletions
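The semantics being fixed: AVX defines the 256-bit VUNPCKLPS/VUNPCKLPD to unpack each 128-bit lane independently rather than interleaving the whole vector, and the old predicates assumed the latter. A standalone C++ sketch (not part of the commit; all names are illustrative) that models the lane-wise behavior and prints the masks the fixed predicates expect:

#include <cstdio>
#include <vector>

// Model of AVX VUNPCKLP* on a multi-lane vector (illustrative, not LLVM
// code).  Each 128-bit lane is unpacked independently: lane L of the result
// interleaves the low half of lane L of src1 with the low half of lane L of
// src2.  src2 elements are numbered NumElts..2*NumElts-1, as in a
// shufflevector mask.
std::vector<unsigned> UnpackLoMask(unsigned NumElts, unsigned NumLanes) {
  std::vector<unsigned> Mask;
  unsigned LaneElts = NumElts / NumLanes;
  for (unsigned L = 0; L != NumLanes; ++L) {
    unsigned Base = L * LaneElts;
    for (unsigned i = 0; i != LaneElts / 2; ++i) {
      Mask.push_back(Base + i);           // from src1
      Mask.push_back(Base + i + NumElts); // from src2
    }
  }
  return Mask;
}

int main() {
  // v8f32 vunpcklps: prints 0 8 1 9 4 12 5 13 (not 0 8 1 9 2 10 3 11).
  for (unsigned M : UnpackLoMask(8, 2)) printf("%u ", M);
  printf("\n");
  // v4f64 vunpcklpd: prints 0 4 2 6.
  for (unsigned M : UnpackLoMask(4, 2)) printf("%u ", M);
  printf("\n");
}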


@@ -165,12 +165,25 @@ void DecodeUNPCKLPDMask(unsigned NElts,
 /// datatypes and vector widths.
 void DecodeUNPCKLPMask(EVT VT,
                        SmallVectorImpl<unsigned> &ShuffleMask) {
-  unsigned NElts = VT.getVectorNumElements();
-  for (unsigned i = 0; i != NElts/2; ++i) {
-    ShuffleMask.push_back(i);        // Reads from dest
-    ShuffleMask.push_back(i+NElts);  // Reads from src
+  unsigned NumElts = VT.getVectorNumElements();
+  // Handle vector lengths > 128 bits.  Define a "section" as a set of
+  // 128 bits.  AVX defines UNPCK* to operate independently on 128-bit
+  // sections.
+  unsigned NumSections = VT.getSizeInBits() / 128;
+  if (NumSections == 0) NumSections = 1;  // Handle MMX
+  unsigned NumSectionElts = NumElts / NumSections;
+  unsigned Start = 0;
+  unsigned End = NumSectionElts / 2;
+  for (unsigned s = 0; s < NumSections; ++s) {
+    for (unsigned i = Start; i != End; ++i) {
+      ShuffleMask.push_back(i);                 // Reads from dest/src1
+      ShuffleMask.push_back(i+NumSectionElts);  // Reads from src/src2
+    }
+    // Process the next 128 bits.
+    Start += NumSectionElts;
+    End += NumSectionElts;
   }
 }
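The section bookkeeping above generalizes over MMX, SSE, and AVX widths; a 64-bit MMX vector rounds down to zero sections, hence the clamp to one. A minimal standalone sketch (illustrative, not LLVM code) of the same computation:

#include <cstdio>

// Mirror of the section bookkeeping in DecodeUNPCKLPMask (standalone,
// illustrative).  A "section" is one 128-bit lane; MMX vectors (64 bits)
// would compute zero sections, so they are clamped to a single section.
void PrintSections(const char *Name, unsigned SizeInBits, unsigned NumElts) {
  unsigned NumSections = SizeInBits / 128;
  if (NumSections == 0) NumSections = 1; // Handle MMX
  unsigned NumSectionElts = NumElts / NumSections;
  printf("%s: %u section(s) of %u element(s)\n", Name, NumSections,
         NumSectionElts);
}

int main() {
  PrintSections("v2i32 (MMX)", 64, 2);  // 1 section of 2
  PrintSections("v4f32 (SSE)", 128, 4); // 1 section of 4
  PrintSections("v8f32 (AVX)", 256, 8); // 2 sections of 4
  PrintSections("v4f64 (AVX)", 256, 4); // 2 sections of 2
}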


@@ -3173,7 +3173,8 @@ bool X86::isMOVLPMask(ShuffleVectorSDNode *N) {
 bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) {
   unsigned NumElems = N->getValueType(0).getVectorNumElements();
 
-  if (NumElems != 2 && NumElems != 4)
+  if ((NumElems != 2 && NumElems != 4)
+      || N->getValueType(0).getSizeInBits() > 128)
     return false;
 
   for (unsigned i = 0; i < NumElems/2; ++i)
@@ -3195,19 +3196,36 @@ static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT,
   if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
     return false;
 
-  for (int i = 0, j = 0; i != NumElts; i += 2, ++j) {
-    int BitI  = Mask[i];
-    int BitI1 = Mask[i+1];
-    if (!isUndefOrEqual(BitI, j))
-      return false;
-    if (V2IsSplat) {
-      if (!isUndefOrEqual(BitI1, NumElts))
-        return false;
-    } else {
-      if (!isUndefOrEqual(BitI1, j + NumElts))
-        return false;
-    }
-  }
+  // Handle vector lengths > 128 bits.  Define a "section" as a set of
+  // 128 bits.  AVX defines UNPCK* to operate independently on 128-bit
+  // sections.
+  unsigned NumSections = VT.getSizeInBits() / 128;
+  if (NumSections == 0) NumSections = 1;  // Handle MMX
+  unsigned NumSectionElts = NumElts / NumSections;
+  unsigned Start = 0;
+  unsigned End = NumSectionElts;
+  for (unsigned s = 0; s < NumSections; ++s) {
+    for (unsigned i = Start, j = s * NumSectionElts;
+         i != End;
+         i += 2, ++j) {
+      int BitI  = Mask[i];
+      int BitI1 = Mask[i+1];
+      if (!isUndefOrEqual(BitI, j))
+        return false;
+      if (V2IsSplat) {
+        if (!isUndefOrEqual(BitI1, NumElts))
+          return false;
+      } else {
+        if (!isUndefOrEqual(BitI1, j + NumElts))
+          return false;
+      }
+    }
+    // Process the next 128 bits.
+    Start += NumSectionElts;
+    End += NumSectionElts;
+  }
   return true;
 }
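A standalone sketch of the lane-aware predicate (illustrative names, not LLVM code), showing that it accepts the true 256-bit vunpcklps mask {0, 8, 1, 9, 4, 12, 5, 13} and rejects the whole-vector interleave {0, 8, 1, 9, 2, 10, 3, 11} that the previous loop would have matched:

#include <vector>

// -1 models an undef mask element, as in the LLVM code above.
static bool isUndefOrEqual(int Elt, int Val) { return Elt < 0 || Elt == Val; }

// Lane-aware unpack-low check (illustrative, not LLVM code): within each
// 128-bit section, pairs must be (j, j + NumElts) for consecutive
// lane-local j.
bool IsUnpackLoMask(const std::vector<int> &Mask, unsigned SizeInBits) {
  unsigned NumElts = Mask.size();
  unsigned NumSections = SizeInBits / 128;
  if (NumSections == 0) NumSections = 1;              // handle 64-bit MMX types
  unsigned SectionElts = NumElts / NumSections;
  for (unsigned s = 0; s != NumSections; ++s)
    for (unsigned i = s * SectionElts, j = s * SectionElts;
         i != (s + 1) * SectionElts; i += 2, ++j) {
      if (!isUndefOrEqual(Mask[i], int(j)))           // even element: src1
        return false;
      if (!isUndefOrEqual(Mask[i + 1], int(j + NumElts))) // odd element: src2
        return false;
    }
  return true;
}

int main() {
  bool LaneCorrect = IsUnpackLoMask({0, 8, 1, 9, 4, 12, 5, 13}, 256); // true
  bool WholeVector = IsUnpackLoMask({0, 8, 1, 9, 2, 10, 3, 11}, 256); // false
  return (LaneCorrect && !WholeVector) ? 0 : 1;
}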
@@ -3255,14 +3273,27 @@ static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
   if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
     return false;
 
-  for (int i = 0, j = 0; i != NumElems; i += 2, ++j) {
-    int BitI  = Mask[i];
-    int BitI1 = Mask[i+1];
-    if (!isUndefOrEqual(BitI, j))
-      return false;
-    if (!isUndefOrEqual(BitI1, j))
-      return false;
+  // Handle vector lengths > 128 bits.  Define a "section" as a set of
+  // 128 bits.  AVX defines UNPCK* to operate independently on 128-bit
+  // sections.
+  unsigned NumSections = VT.getSizeInBits() / 128;
+  if (NumSections == 0) NumSections = 1;  // Handle MMX
+  unsigned NumSectionElts = NumElems / NumSections;
+  for (unsigned s = 0; s < NumSections; ++s) {
+    for (unsigned i = s * NumSectionElts, j = s * NumSectionElts;
+         i != NumSectionElts * (s + 1);
+         i += 2, ++j) {
+      int BitI  = Mask[i];
+      int BitI1 = Mask[i+1];
+      if (!isUndefOrEqual(BitI, j))
+        return false;
+      if (!isUndefOrEqual(BitI1, j))
+        return false;
+    }
   }
   return true;
 }
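The _v_undef form matches an unpack whose second operand is undef, so both elements of each pair must come from lane-local element j of the first source. A standalone sketch (illustrative, not LLVM code): for v8i32 it accepts {0, 0, 1, 1, 4, 4, 5, 5} but rejects the whole-vector form {0, 0, 1, 1, 2, 2, 3, 3}.

#include <vector>

// Sketch of the _v_undef check (illustrative, not LLVM code): both elements
// of each pair must read lane-local element j of src1, or be undef (-1).
bool IsUnpackLoUndefMask(const std::vector<int> &Mask, unsigned SizeInBits) {
  unsigned NumElems = Mask.size();
  unsigned NumSections = SizeInBits / 128;
  if (NumSections == 0) NumSections = 1;      // handle 64-bit MMX types
  unsigned SectionElts = NumElems / NumSections;
  for (unsigned s = 0; s != NumSections; ++s)
    for (unsigned i = s * SectionElts, j = s * SectionElts;
         i != (s + 1) * SectionElts; i += 2, ++j)
      if ((Mask[i] >= 0 && Mask[i] != int(j)) ||
          (Mask[i + 1] >= 0 && Mask[i + 1] != int(j)))
        return false;
  return true;
}

int main() {
  // v8i32 over two lanes: lane-local duplication matches, whole-vector
  // duplication does not.
  return (IsUnpackLoUndefMask({0, 0, 1, 1, 4, 4, 5, 5}, 256) &&
          !IsUnpackLoUndefMask({0, 0, 1, 1, 2, 2, 3, 3}, 256)) ? 0 : 1;
}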


@@ -132,6 +132,8 @@ def X86Movlpd : SDNode<"X86ISD::MOVLPD", SDTShuff2Op>;
 def X86Unpcklps : SDNode<"X86ISD::UNPCKLPS", SDTShuff2Op>;
 def X86Unpcklpd : SDNode<"X86ISD::UNPCKLPD", SDTShuff2Op>;
+def X86Unpcklpsy : SDNode<"X86ISD::VUNPCKLPSY", SDTShuff2Op>;
+def X86Unpcklpdy : SDNode<"X86ISD::VUNPCKLPDY", SDTShuff2Op>;
 def X86Unpckhps : SDNode<"X86ISD::UNPCKHPS", SDTShuff2Op>;
 def X86Unpckhpd : SDNode<"X86ISD::UNPCKHPD", SDTShuff2Op>;


@@ -5622,11 +5622,15 @@ def : Pat<(X86Movddup (bc_v2f64
 // Shuffle with UNPCKLPS
 def : Pat<(v4f32 (X86Unpcklps VR128:$src1, (memopv4f32 addr:$src2))),
           (VUNPCKLPSrm VR128:$src1, addr:$src2)>, Requires<[HasAVX]>;
+def : Pat<(v8f32 (X86Unpcklpsy VR256:$src1, (memopv8f32 addr:$src2))),
+          (VUNPCKLPSYrm VR256:$src1, addr:$src2)>, Requires<[HasAVX]>;
 def : Pat<(v4f32 (X86Unpcklps VR128:$src1, (memopv4f32 addr:$src2))),
           (UNPCKLPSrm VR128:$src1, addr:$src2)>;
 def : Pat<(v4f32 (X86Unpcklps VR128:$src1, VR128:$src2)),
           (VUNPCKLPSrr VR128:$src1, VR128:$src2)>, Requires<[HasAVX]>;
+def : Pat<(v8f32 (X86Unpcklpsy VR256:$src1, VR256:$src2)),
+          (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>, Requires<[HasAVX]>;
 def : Pat<(v4f32 (X86Unpcklps VR128:$src1, VR128:$src2)),
           (UNPCKLPSrr VR128:$src1, VR128:$src2)>;

@@ -5644,11 +5648,15 @@ def : Pat<(v4f32 (X86Unpckhps VR128:$src1, VR128:$src2)),
 // Shuffle with UNPCKLPD
 def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, (memopv2f64 addr:$src2))),
           (VUNPCKLPDrm VR128:$src1, addr:$src2)>, Requires<[HasAVX]>;
+def : Pat<(v4f64 (X86Unpcklpdy VR256:$src1, (memopv4f64 addr:$src2))),
+          (VUNPCKLPDYrm VR256:$src1, addr:$src2)>, Requires<[HasAVX]>;
 def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, (memopv2f64 addr:$src2))),
           (UNPCKLPDrm VR128:$src1, addr:$src2)>;
 def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, VR128:$src2)),
           (VUNPCKLPDrr VR128:$src1, VR128:$src2)>, Requires<[HasAVX]>;
+def : Pat<(v4f64 (X86Unpcklpdy VR256:$src1, VR256:$src2)),
+          (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>, Requires<[HasAVX]>;
 def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, VR128:$src2)),
           (UNPCKLPDrr VR128:$src1, VR128:$src2)>;


@@ -0,0 +1,5 @@
load_lib llvm.exp

if { [llvm_supports_target X86] } {
  RunLLVMTests [lsort [glob -nocomplain $srcdir/$subdir/*.{ll,c,cpp}]]
}


@@ -0,0 +1,20 @@
; RUN: llc < %s -mattr=+avx | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
target triple = "x86_64-unknown-linux-gnu"
define void @try_([2 x <4 x double>]* noalias %incarray, [2 x <4 x double>]* noalias %incarrayb ) {
entry:
%incarray1 = alloca [2 x <4 x double>]*, align 8
%incarrayb1 = alloca [2 x <4 x double>]*, align 8
%carray = alloca [2 x <4 x double>], align 16
%r = getelementptr [2 x <4 x double>]* %incarray, i32 0, i32 0
%rb = getelementptr [2 x <4 x double>]* %incarrayb, i32 0, i32 0
%r3 = load <4 x double>* %r, align 8
%r4 = load <4 x double>* %rb, align 8
%r11 = shufflevector <4 x double> %r3, <4 x double> %r4, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x double>> [#uses=1]
; CHECK-NOT: vunpcklpd
%r12 = getelementptr [2 x <4 x double>]* %carray, i32 0, i32 1
store <4 x double> %r11, <4 x double>* %r12, align 4
ret void
}
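This mask cannot be a single vunpcklpd because result elements 2 and 3 read source elements 1 and 5, which live in the low 128-bit lane of their sources, and AVX unpacks never move data across lanes. A hypothetical helper (not part of the commit) that detects such lane crossings:

#include <vector>

// Hypothetical helper (not part of the commit): does any result element pull
// data from a different 128-bit lane of its source?  -1 models undef.
static bool CrossesLanes(const std::vector<int> &Mask, unsigned Lanes) {
  unsigned NumElts = Mask.size(), LaneElts = NumElts / Lanes;
  for (unsigned i = 0; i != NumElts; ++i) {
    if (Mask[i] < 0) continue;                     // undef element
    unsigned SrcElt = unsigned(Mask[i]) % NumElts; // index within its source
    if (SrcElt / LaneElts != i / LaneElts)         // source lane vs. result lane
      return true;
  }
  return false;
}

int main() {
  return CrossesLanes({0, 4, 1, 5}, 2) &&   // crosses: no single vunpcklpd
         !CrossesLanes({0, 4, 2, 6}, 2) ? 0 : 1;
}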


@@ -0,0 +1,20 @@
; RUN: llc < %s -mattr=+avx | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
target triple = "x86_64-unknown-linux-gnu"
define void @try_([2 x <8 x float>]* noalias %incarray, [2 x <8 x float>]* noalias %incarrayb ) {
entry:
%incarray1 = alloca [2 x <8 x float>]*, align 8
%incarrayb1 = alloca [2 x <8 x float>]*, align 8
%carray = alloca [2 x <8 x float>], align 16
%r = getelementptr [2 x <8 x float>]* %incarray, i32 0, i32 0
%rb = getelementptr [2 x <8 x float>]* %incarrayb, i32 0, i32 0
%r3 = load <8 x float>* %r, align 8
%r4 = load <8 x float>* %rb, align 8
%r8 = shufflevector <8 x float> %r3, <8 x float> %r4, <8 x i32> < i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11 > ; <<8 x float>> [#uses=1]
; CHECK-NOT: vunpcklps
%r9 = getelementptr [2 x <8 x float>]* %carray, i32 0, i32 0
store <8 x float> %r8, <8 x float>* %r9, align 4
ret void
}


@@ -0,0 +1,20 @@
; RUN: llc < %s -mattr=+avx | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
target triple = "x86_64-unknown-linux-gnu"
define void @try_([2 x <4 x double>]* noalias %incarray, [2 x <4 x double>]* noalias %incarrayb ) {
entry:
%incarray1 = alloca [2 x <4 x double>]*, align 8
%incarrayb1 = alloca [2 x <4 x double>]*, align 8
%carray = alloca [2 x <4 x double>], align 16
%r = getelementptr [2 x <4 x double>]* %incarray, i32 0, i32 0
%rb = getelementptr [2 x <4 x double>]* %incarrayb, i32 0, i32 0
%r3 = load <4 x double>* %r, align 8
%r4 = load <4 x double>* %rb, align 8
%r11 = shufflevector <4 x double> %r3, <4 x double> %r4, <4 x i32> < i32 0, i32 4, i32 2, i32 6 > ; <<4 x double>> [#uses=1]
; CHECK: vunpcklpd
%r12 = getelementptr [2 x <4 x double>]* %carray, i32 0, i32 1
store <4 x double> %r11, <4 x double>* %r12, align 4
ret void
}


@@ -0,0 +1,20 @@
; RUN: llc < %s -mattr=+avx | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
target triple = "x86_64-unknown-linux-gnu"
define void @try_([2 x <8 x float>]* noalias %incarray, [2 x <8 x float>]* noalias %incarrayb ) {
entry:
%incarray1 = alloca [2 x <8 x float>]*, align 8
%incarrayb1 = alloca [2 x <8 x float>]*, align 8
%carray = alloca [2 x <8 x float>], align 16
%r = getelementptr [2 x <8 x float>]* %incarray, i32 0, i32 0
%rb = getelementptr [2 x <8 x float>]* %incarrayb, i32 0, i32 0
%r3 = load <8 x float>* %r, align 8
%r4 = load <8 x float>* %rb, align 8
%r11 = shufflevector <8 x float> %r3, <8 x float> %r4, <8 x i32> < i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13 > ; <<8 x float>> [#uses=1]
; CHECK: vunpcklps
%r12 = getelementptr [2 x <8 x float>]* %carray, i32 0, i32 1
store <8 x float> %r11, <8 x float>* %r12, align 4
ret void
}
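Taken together, the four tests sit exactly on the predicate boundary. Run through the IsUnpackLoMask sketch from earlier on this page (illustrative, not LLVM code), the two CHECK-NOT masks are rejected and the two CHECK masks accepted:

// Assumes the IsUnpackLoMask sketch defined earlier on this page.
IsUnpackLoMask({0, 4, 1, 5}, 256);               // false: no vunpcklpd emitted
IsUnpackLoMask({0, 8, 1, 9, 2, 10, 3, 11}, 256); // false: no vunpcklps emitted
IsUnpackLoMask({0, 4, 2, 6}, 256);               // true:  vunpcklpd expected
IsUnpackLoMask({0, 8, 1, 9, 4, 12, 5, 13}, 256); // true:  vunpcklps expected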