Add support for emitting non-temporal stores for DAGs marked

non-temporal.  Fix from r96241 for botched encoding of MOVNTDQ.

Add documentation for !nontemporal metadata.

Add a simpler movnt testcase.

llvm-svn: 96386
This commit is contained in:
David Greene 2010-02-16 20:50:18 +00:00
parent 7bb549dc8e
commit c10133139e
3 changed files with 115 additions and 13 deletions

View File

@ -4074,8 +4074,9 @@ Instruction</a> </div>
<h5>Syntax:</h5>
<pre>
&lt;result&gt; = load &lt;ty&gt;* &lt;pointer&gt;[, align &lt;alignment&gt;]
&lt;result&gt; = volatile load &lt;ty&gt;* &lt;pointer&gt;[, align &lt;alignment&gt;]
&lt;result&gt; = load &lt;ty&gt;* &lt;pointer&gt;[, align &lt;alignment&gt;][, !nontemporal !&lt;index&gt;]
&lt;result&gt; = volatile load &lt;ty&gt;* &lt;pointer&gt;[, align &lt;alignment&gt;][, !nontemporal !&lt;index&gt;]
!&lt;index&gt; = !{ i32 1 }
</pre>
<h5>Overview:</h5>
@ -4088,7 +4089,7 @@ Instruction</a> </div>
marked as <tt>volatile</tt>, then the optimizer is not allowed to modify the
number or order of execution of this <tt>load</tt> with other
volatile <tt>load</tt> and <tt><a href="#i_store">store</a></tt>
instructions. </p>
instructions.</p>
<p>The optional constant "align" argument specifies the alignment of the
operation (that is, the alignment of the memory address). A value of 0 or an
@ -4098,6 +4099,14 @@ Instruction</a> </div>
alignment results in an undefined behavior. Underestimating the alignment may
produce less efficient code. An alignment of 1 is always safe.</p>
<p>The optional <tt>!nontemporal</tt> metadata must reference a single metadata
name &lt;index&gt; corresponding to a metadata node with one i32 entry of
value 1. The existence of the <tt>!nontemporal</tt> metadata on the
instruction tells the optimizer and code generator that this load is
not expected to be reused in the cache. The code generator may
select special instructions to save cache bandwidth, such as the
MOVNT instruction on x86.</p>
<h5>Semantics:</h5>
<p>The location of memory pointed to is loaded. If the value being loaded is of
scalar type then the number of bytes read does not exceed the minimum number
@ -4124,8 +4133,8 @@ Instruction</a> </div>
<h5>Syntax:</h5>
<pre>
store &lt;ty&gt; &lt;value&gt;, &lt;ty&gt;* &lt;pointer&gt;[, align &lt;alignment&gt;] <i>; yields {void}</i>
volatile store &lt;ty&gt; &lt;value&gt;, &lt;ty&gt;* &lt;pointer&gt;[, align &lt;alignment&gt;] <i>; yields {void}</i>
store &lt;ty&gt; &lt;value&gt;, &lt;ty&gt;* &lt;pointer&gt;[, align &lt;alignment&gt;][, !nontemporal !&lt;index&gt;] <i>; yields {void}</i>
volatile store &lt;ty&gt; &lt;value&gt;, &lt;ty&gt;* &lt;pointer&gt;[, align &lt;alignment&gt;][, !nontemporal !&lt;index&gt;] <i>; yields {void}</i>
</pre>
<h5>Overview:</h5>
@ -4150,6 +4159,15 @@ Instruction</a> </div>
alignment results in an undefined behavior. Underestimating the alignment may
produce less efficient code. An alignment of 1 is always safe.</p>
<p>The optional <tt>!nontemporal</tt> metadata must reference a single metadata
name &lt;index&gt; corresponding to a metadata node with one i32 entry of
value 1. The existence of the <tt>!nontemporal</tt> metadata on the
instruction tells the optimizer and code generator that this store is
not expected to be reused in the cache. The code generator may
select special instructions to save cache bandwidth, such as the
MOVNT instruction on x86.</p>
<h5>Semantics:</h5>
<p>The contents of memory are updated to contain '<tt>&lt;value&gt;</tt>' at the
location specified by the '<tt>&lt;pointer&gt;</tt>' operand. If

View File

@ -160,6 +160,32 @@ def memopv4i16 : PatFrag<(ops node:$ptr), (v4i16 (memop64 node:$ptr))>;
// Typed wrappers around the memop64 fragment: each one yields the result of
// memop64 on $ptr as the named vector type (v8i16 and v2i32 respectively).
def memopv8i16 : PatFrag<(ops node:$ptr), (v8i16 (memop64 node:$ptr))>;
def memopv2i32 : PatFrag<(ops node:$ptr), (v2i32 (memop64 node:$ptr))>;
// MOVNT Support
// Like 'store', but requires the non-temporal bit to be set.
//
// The fragment is built on the raw 'st' node, so the properties implied by
// 'store' have to be checked by hand: the store must not truncate its value
// and must use the plain (unindexed) addressing mode. Without these checks a
// truncating non-temporal store could be selected as a full-width MOVNTI and
// write more bytes than the original store asked for.
def nontemporalstore : PatFrag<(ops node:$val, node:$ptr),
                               (st node:$val, node:$ptr), [{
  if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N))
    return ST->isNonTemporal() && !ST->isTruncatingStore() &&
           ST->getAddressingMode() == ISD::UNINDEXED;
  return false;
}]>;
// Matches a store node only when all of the following hold:
//   - the non-temporal flag is set,
//   - the store does not truncate its value,
//   - the addressing mode is ISD::UNINDEXED,
//   - the recorded alignment is at least 16 bytes.
// Anything that is not a StoreSDNode is rejected.
def alignednontemporalstore : PatFrag<(ops node:$val, node:$ptr),
                                      (st node:$val, node:$ptr), [{
  StoreSDNode *Store = dyn_cast<StoreSDNode>(N);
  if (!Store)
    return false;
  if (!Store->isNonTemporal())
    return false;
  if (Store->isTruncatingStore())
    return false;
  if (Store->getAddressingMode() != ISD::UNINDEXED)
    return false;
  return Store->getAlignment() >= 16;
}]>;
// Counterpart of alignednontemporalstore for stores whose recorded alignment
// is below 16 bytes. It applies the same non-truncating / unindexed
// restrictions as the aligned fragment, so the two fragments partition the
// same set of non-temporal stores purely by alignment.
def unalignednontemporalstore : PatFrag<(ops node:$val, node:$ptr),
                                        (st node:$val, node:$ptr), [{
  if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N))
    return ST->isNonTemporal() && !ST->isTruncatingStore() &&
           ST->getAddressingMode() == ISD::UNINDEXED &&
           ST->getAlignment() < 16;
  return false;
}]>;
// Bitconvert helpers: each fragment wraps (bitconvert $in) and fixes the
// result type to the vector type named in the fragment (v4f32, v2f64, v16i8).
def bc_v4f32 : PatFrag<(ops node:$in), (v4f32 (bitconvert node:$in))>;
def bc_v2f64 : PatFrag<(ops node:$in), (v2f64 (bitconvert node:$in))>;
def bc_v16i8 : PatFrag<(ops node:$in), (v16i8 (bitconvert node:$in))>;
@ -1013,10 +1039,33 @@ def PREFETCHNTA : PSI<0x18, MRM0m, (outs), (ins i8mem:$src),
"prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0))]>;
// Non-temporal stores
def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
// Intrinsic form of movntps: its only selection pattern is the
// int_x86_sse_movnt_ps intrinsic (address first, then the VR128 source).
def MOVNTPSmr_Int : PSI<0x2B, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
"movntps\t{$src, $dst|$dst, $src}",
[(int_x86_sse_movnt_ps addr:$dst, VR128:$src)]>;
// Patterns that select MOVNT* instructions directly from store nodes carrying
// the non-temporal flag (via the alignednontemporalstore / nontemporalstore
// fragments). AddedComplexity is raised for every definition in this scope so
// that these patterns are preferred, per the trailing comment on the 'let'.
let AddedComplexity = 400 in { // Prefer non-temporal versions
// v4f32 value, 16-byte-aligned non-temporal store -> movntps.
def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movntps\t{$src, $dst|$dst, $src}",
[(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>;
// v2f64 value, 16-byte-aligned non-temporal store -> movntdq.
// NOTE(review): MOVNTPDmr elsewhere in this file also matches a v2f64
// alignednontemporalstore; confirm which of the two is meant to win.
def MOVNTDQ_64mr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movntdq\t{$src, $dst|$dst, $src}",
[(alignednontemporalstore (v2f64 VR128:$src), addr:$dst)]>;
// The same instruction is also used for v2i64 values.
def : Pat<(alignednontemporalstore (v2i64 VR128:$src), addr:$dst),
(MOVNTDQ_64mr VR128:$src, addr:$dst)>;
// i32 value from a GR32 register, any non-temporal store -> movnti.
// Unlike the vector forms above, this uses nontemporalstore, which places no
// requirement on alignment.
def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
"movnti\t{$src, $dst|$dst, $src}",
[(nontemporalstore (i32 GR32:$src), addr:$dst)]>,
TB, Requires<[HasSSE2]>;
// i64 value from a GR64 register -> movnti.
// NOTE(review): the only predicate listed is HasSSE2; confirm whether a
// 64-bit-mode predicate is also needed for the GR64 form.
def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
"movnti\t{$src, $dst|$dst, $src}",
[(nontemporalstore (i64 GR64:$src), addr:$dst)]>,
TB, Requires<[HasSSE2]>;
}
// Load, store, and memory fence
// sfence takes no operands and is selected for the int_x86_sse_sfence intrinsic.
def SFENCE : PSI<0xAE, MRM7r, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>;
@ -2298,17 +2347,30 @@ def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
[(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>;
// Non-temporal stores
def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
"movntpd\t{$src, $dst|$dst, $src}",
[(int_x86_sse2_movnt_pd addr:$dst, VR128:$src)]>;
def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movntdq\t{$src, $dst|$dst, $src}",
[(int_x86_sse2_movnt_dq addr:$dst, VR128:$src)]>;
def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
// Intrinsic forms of the SSE2 non-temporal stores. Each definition's only
// selection pattern is the corresponding int_x86_sse2_movnt_* intrinsic; the
// "_Int" suffix distinguishes them from the store-node forms of the same name.

// movntpd, selected for int_x86_sse2_movnt_pd.
def MOVNTPDmr_Int : PDI<0x2B, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
"movntpd\t{$src, $dst|$dst, $src}",
[(int_x86_sse2_movnt_pd addr:$dst, VR128:$src)]>;
// movntdq, selected for int_x86_sse2_movnt_dq.
def MOVNTDQmr_Int : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movntdq\t{$src, $dst|$dst, $src}",
[(int_x86_sse2_movnt_dq addr:$dst, VR128:$src)]>;
// movnti with a GR32 source, selected for int_x86_sse2_movnt_i; gated on HasSSE2.
def MOVNTImr_Int : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
"movnti\t{$src, $dst|$dst, $src}",
[(int_x86_sse2_movnt_i addr:$dst, GR32:$src)]>,
TB, Requires<[HasSSE2]>;
// SSE2 patterns that select MOVNT* instructions directly from 16-byte-aligned
// store nodes carrying the non-temporal flag (alignednontemporalstore).
// AddedComplexity is raised for every definition in this scope so that these
// patterns are preferred, per the trailing comment on the 'let'.
let AddedComplexity = 400 in { // Prefer non-temporal versions
// v2f64 value -> movntpd.
def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movntpd\t{$src, $dst|$dst, $src}",
[(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)]>;
// v4f32 value -> movntdq.
// NOTE(review): MOVNTPSmr elsewhere in this file also matches a v4f32
// alignednontemporalstore; confirm which of the two is meant to win.
def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movntdq\t{$src, $dst|$dst, $src}",
[(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)]>;
// The same instruction is also used for v4i32 values.
def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
(MOVNTDQmr VR128:$src, addr:$dst)>;
}
// Flush cache
def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
"clflush\t$src", [(int_x86_sse2_clflush addr:$src)]>,

View File

@ -0,0 +1,22 @@
; RUN: llc < %s -march=x86-64 | FileCheck %s
; CHECK: movnt
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
target triple = "x86_64-unknown-linux-gnu"
!0 = metadata !{ i32 1 }
; Test body: stores a <2 x double> value with 16-byte alignment and
; !nontemporal metadata. The CHECK line at the top of this file expects the
; llc output to contain "movnt".
define void @sub_(i32* noalias %n) {
"file movnt.f90, line 2, bb1":
; Stack slots. %n1 and %i are not referenced again in this function; the other
; two are only loaded from, never stored to, before the loads below.
%n1 = alloca i32*, align 8
%i = alloca i32, align 4
%"$LCS_0" = alloca i64, align 8
%"$LCS_S2" = alloca <2 x double>, align 16
; Value to store, and the destination address (an i64 cast to a vector pointer).
%r9 = load <2 x double>* %"$LCS_S2", align 8
%r10 = load i64* %"$LCS_0", align 8
%r11 = inttoptr i64 %r10 to <2 x double>*
; The store under test: align 16 plus !nontemporal !0.
store <2 x double> %r9, <2 x double>* %r11, align 16, !nontemporal !0
br label %"file movnt.f90, line 18, bb5"
"file movnt.f90, line 18, bb5":
ret void
}