diff --git a/test/CodeGen/X86/combine-multiplies.ll b/test/CodeGen/X86/combine-multiplies.ll index 15528cd0714..ab30b9b489e 100644 --- a/test/CodeGen/X86/combine-multiplies.ll +++ b/test/CodeGen/X86/combine-multiplies.ll @@ -1,4 +1,5 @@ -; RUN: llc < %s -mattr=sse2 -mtriple=i386-unknown-linux-gnu | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=sse2 | FileCheck %s ; Source file looks something like this: ; @@ -28,18 +29,21 @@ ; movl $11, 2020(%esi,%ecx,4) ; movl $22, 2080(%edx,%eax) ; movl $33, 10080(%edx,%eax) -; -; CHECK-LABEL: testCombineMultiplies -; CHECK: imull $400, [[ARG1:%[a-z]+]], [[MUL:%[a-z]+]] # imm = 0x190 -; CHECK-NEXT: leal ([[ARG2:%[a-z]+]],[[MUL]]), [[LEA:%[a-z]+]] -; CHECK-NEXT: movl $11, {{[0-9]+}}([[LEA]],[[ARG1]],4) -; CHECK-NEXT: movl $22, {{[0-9]+}}([[ARG2]],[[MUL]]) -; CHECK-NEXT: movl $33, {{[0-9]+}}([[ARG2]],[[MUL]]) -; CHECK: retl -; ; Function Attrs: nounwind -define void @testCombineMultiplies([100 x i32]* nocapture %a, i32 %lll) { +define void @testCombineMultiplies([100 x i32]* nocapture %a, i32 %lll) nounwind { +; CHECK-LABEL: testCombineMultiplies: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: pushl %esi +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: imull $400, %ecx, %edx # imm = 0x190 +; CHECK-NEXT: leal (%eax,%edx), %esi +; CHECK-NEXT: movl $11, 2020(%esi,%ecx,4) +; CHECK-NEXT: movl $22, 2080(%eax,%edx) +; CHECK-NEXT: movl $33, 10080(%eax,%edx) +; CHECK-NEXT: popl %esi +; CHECK-NEXT: retl entry: %add = add nsw i32 %lll, 5 %arrayidx1 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add, i32 %add @@ -92,31 +96,31 @@ entry: ; We should have a single multiply for "v1 * {22, 22, 22, 22}" (made up of two ; pmuludq instructions), followed by two adds. Without this optimization, we'd ; do 2 adds, followed by 2 multiplies (i.e. 4 pmuludq instructions). -; -; CHECK-LABEL: testCombineMultiplies_splat -; CHECK: movdqa .LCPI1_0, [[C11:%xmm[0-9]]] -; CHECK-NEXT: paddd %xmm0, [[C11]] -; CHECK-NEXT: movdqa .LCPI1_1, [[C22:%xmm[0-9]]] -; CHECK-NEXT: pshufd $245, %xmm0, [[T1:%xmm[0-9]]] -; CHECK-NEXT: pmuludq [[C22]], [[T2:%xmm[0-9]]] -; CHECK-NEXT: pshufd $232, [[T2]], [[T3:%xmm[0-9]]] -; CHECK-NEXT: pmuludq [[C22]], [[T4:%xmm[0-9]]] -; CHECK-NEXT: pshufd $232, [[T4]], [[T5:%xmm[0-9]]] -; CHECK-NEXT: punpckldq [[T5]], [[T6:%xmm[0-9]]] -; CHECK-NEXT: movdqa .LCPI1_2, [[C242:%xmm[0-9]]] -; CHECK-NEXT: paddd [[T6]], [[C242]] -; CHECK-NEXT: paddd .LCPI1_3, [[C726:%xmm[0-9]]] -; CHECK-NEXT: movdqa [[C242]], v2 -; CHECK-NEXT: [[C726]], v3 -; CHECK-NEXT: [[C11]], x -; CHECK-NEXT: retl @v2 = common global <4 x i32> zeroinitializer, align 16 @v3 = common global <4 x i32> zeroinitializer, align 16 @x = common global <4 x i32> zeroinitializer, align 16 ; Function Attrs: nounwind -define void @testCombineMultiplies_splat(<4 x i32> %v1) { +define void @testCombineMultiplies_splat(<4 x i32> %v1) nounwind { +; CHECK-LABEL: testCombineMultiplies_splat: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [11,11,11,11] +; CHECK-NEXT: paddd %xmm0, %xmm1 +; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [22,22,22,22] +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-NEXT: pmuludq %xmm2, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-NEXT: pmuludq %xmm2, %xmm3 +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] +; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [242,242,242,242] +; CHECK-NEXT: paddd %xmm0, %xmm2 +; CHECK-NEXT: paddd {{\.LCPI.*}}, %xmm0 +; CHECK-NEXT: movdqa %xmm2, v2 +; CHECK-NEXT: movdqa %xmm0, v3 +; CHECK-NEXT: movdqa %xmm1, x +; CHECK-NEXT: retl entry: %add1 = add <4 x i32> %v1, %mul1 = mul <4 x i32> %add1, @@ -130,27 +134,28 @@ entry: ; Finally, check the non-splatted vector case. This is very similar ; to the previous test case, except for the vector values. -; -; CHECK-LABEL: testCombineMultiplies_non_splat -; CHECK: movdqa .LCPI2_0, [[C11:%xmm[0-9]]] -; CHECK-NEXT: paddd %xmm0, [[C11]] -; CHECK-NEXT: movdqa .LCPI2_1, [[C22:%xmm[0-9]]] -; CHECK-NEXT: pshufd $245, %xmm0, [[T1:%xmm[0-9]]] -; CHECK-NEXT: pmuludq [[C22]], [[T2:%xmm[0-9]]] -; CHECK-NEXT: pshufd $232, [[T2]], [[T3:%xmm[0-9]]] -; CHECK-NEXT: pshufd $245, [[C22]], [[T7:%xmm[0-9]]] -; CHECK-NEXT: pmuludq [[T1]], [[T7]] -; CHECK-NEXT: pshufd $232, [[T7]], [[T5:%xmm[0-9]]] -; CHECK-NEXT: punpckldq [[T5]], [[T6:%xmm[0-9]]] -; CHECK-NEXT: movdqa .LCPI2_2, [[C242:%xmm[0-9]]] -; CHECK-NEXT: paddd [[T6]], [[C242]] -; CHECK-NEXT: paddd .LCPI2_3, [[C726:%xmm[0-9]]] -; CHECK-NEXT: movdqa [[C242]], v2 -; CHECK-NEXT: [[C726]], v3 -; CHECK-NEXT: [[C11]], x -; CHECK-NEXT: retl + ; Function Attrs: nounwind -define void @testCombineMultiplies_non_splat(<4 x i32> %v1) { +define void @testCombineMultiplies_non_splat(<4 x i32> %v1) nounwind { +; CHECK-LABEL: testCombineMultiplies_non_splat: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [11,22,33,44] +; CHECK-NEXT: paddd %xmm0, %xmm1 +; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [22,33,44,55] +; CHECK-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] +; CHECK-NEXT: pmuludq %xmm2, %xmm0 +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3] +; CHECK-NEXT: pmuludq %xmm3, %xmm2 +; CHECK-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; CHECK-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; CHECK-NEXT: movdqa {{.*#+}} xmm2 = [242,726,1452,2420] +; CHECK-NEXT: paddd %xmm0, %xmm2 +; CHECK-NEXT: paddd {{\.LCPI.*}}, %xmm0 +; CHECK-NEXT: movdqa %xmm2, v2 +; CHECK-NEXT: movdqa %xmm0, v3 +; CHECK-NEXT: movdqa %xmm1, x +; CHECK-NEXT: retl entry: %add1 = add <4 x i32> %v1, %mul1 = mul <4 x i32> %add1,