[X86] Loosen memory folding requirements for cvtdq2pd and cvtps2pd instructions.

According to the spec, the cvtdq2pd and cvtps2pd instructions do not require their memory operand to be aligned to 16 bytes (their memory form reads only 64 bits). This patch removes the TB_ALIGN_16 requirement for these two instructions from the memory folding table. Differential Revision: https://reviews.llvm.org/D23919 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@280402 91177308-0d34-0410-b5e6-96231b3b80d8
2024-12-28 07:05:11 +00:00 · 2016-09-01 18:50:02 +00:00 · 2016-09-01 18:50:02 +00:00 · fa2569c73b
commit fa2569c73b
parent 7f885e7bc0
2 changed files with 41 additions and 2 deletions
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@ -477,12 +477,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
    { X86::CVTSD2SIrr,      X86::CVTSD2SIrm,          0 },
    { X86::CVTSS2SI64rr,    X86::CVTSS2SI64rm,        0 },
    { X86::CVTSS2SIrr,      X86::CVTSS2SIrm,          0 },
-    { X86::CVTDQ2PDrr,      X86::CVTDQ2PDrm,          TB_ALIGN_16 },
+    { X86::CVTDQ2PDrr,      X86::CVTDQ2PDrm,          0 },
    { X86::CVTDQ2PSrr,      X86::CVTDQ2PSrm,          TB_ALIGN_16 },
    { X86::CVTPD2DQrr,      X86::CVTPD2DQrm,          TB_ALIGN_16 },
    { X86::CVTPD2PSrr,      X86::CVTPD2PSrm,          TB_ALIGN_16 },
    { X86::CVTPS2DQrr,      X86::CVTPS2DQrm,          TB_ALIGN_16 },
-    { X86::CVTPS2PDrr,      X86::CVTPS2PDrm,          TB_ALIGN_16 },
+    { X86::CVTPS2PDrr,      X86::CVTPS2PDrm,          0 },
    { X86::CVTTPD2DQrr,     X86::CVTTPD2DQrm,         TB_ALIGN_16 },
    { X86::CVTTPS2DQrr,     X86::CVTTPS2DQrm,         TB_ALIGN_16 },
    { X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm,  0 },
--- a/test/CodeGen/X86/peephole-cvt-sse.ll
+++ b/test/CodeGen/X86/peephole-cvt-sse.ll
@ -0,0 +1,39 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-pc-linux -mattr=+sse4.2 < %s | FileCheck %s --check-prefix=X86-64
+; RUN: llc -mtriple=i386-pc-linux -mattr=+sse4.2 < %s | FileCheck %s --check-prefix=I386
+
+; Check that unaligned loads are folded into the memory operand of cvtps2pd and cvtdq2pd.
+
+define <2 x double> @peephole_cvtps2pd(<4 x float>* %a0) { ; expects the align-1 vector load below to become cvtps2pd's memory operand
+; X86-64-LABEL: peephole_cvtps2pd:
+; X86-64:       # BB#0:
+; X86-64-NEXT:    cvtps2pd (%rdi), %xmm0
+; X86-64-NEXT:    retq
+;
+; I386-LABEL: peephole_cvtps2pd:
+; I386:       # BB#0:
+; I386-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; I386-NEXT:    cvtps2pd (%eax), %xmm0
+; I386-NEXT:    retl
+  %1 = load <4 x float>, <4 x float>* %a0, align 1 ; full <4 x float> load, but only 1-byte aligned
+  %2 = shufflevector <4 x float> %1, <4 x float> undef, <2 x i32> <i32 0, i32 1> ; keep only lanes 0 and 1
+  %3 = fpext <2 x float> %2 to <2 x double> ; extend the two floats to doubles
+  ret <2 x double> %3
+}
+
+define <2 x double> @peephole_cvtdq2pd(<4 x i32>* %a0) { ; expects the align-1 vector load below to become cvtdq2pd's memory operand
+; X86-64-LABEL: peephole_cvtdq2pd:
+; X86-64:       # BB#0:
+; X86-64-NEXT:    cvtdq2pd (%rdi), %xmm0
+; X86-64-NEXT:    retq
+;
+; I386-LABEL: peephole_cvtdq2pd:
+; I386:       # BB#0:
+; I386-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; I386-NEXT:    cvtdq2pd (%eax), %xmm0
+; I386-NEXT:    retl
+  %1 = load <4 x i32>, <4 x i32>* %a0, align 1 ; full <4 x i32> load, but only 1-byte aligned
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <2 x i32> <i32 0, i32 1> ; keep only lanes 0 and 1
+  %3 = sitofp <2 x i32> %2 to <2 x double> ; signed i32 -> double conversion of both lanes
+  ret <2 x double> %3
+}