[x86] Remove some windows line endings that snuck into the tests here.

Folks on Windows, remember to set up your subversion to strip these when submitting... git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@225593 91177308-0d34-0410-b5e6-96231b3b80d8
2024-12-14 23:48:49 +00:00 · 2015-01-11 01:36:20 +00:00 · 2015-01-11 01:36:20 +00:00 · 561088eb5d
commit 561088eb5d
parent 6c92f6eb6a
8 changed files with 463 additions and 463 deletions
--- a/test/CodeGen/X86/avx-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx-intrinsics-x86.ll
@ -455,21 +455,21 @@ define <4 x i32> @test_x86_sse2_psll_d(<4 x i32> %a0, <4 x i32> %a1) {
  ret <4 x i32> %res
 }
 declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse2_psll_dq(<2 x i64> %a0) {
-  ; CHECK: vpslldq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-  %res = call <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
-  ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse2_psll_dq_bs(<2 x i64> %a0) {
-  ; CHECK: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8]
-  %res = call <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
-  ret <2 x i64> %res
-}
+
+
+define <2 x i64> @test_x86_sse2_psll_dq(<2 x i64> %a0) {
+  ; CHECK: vpslldq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+  %res = call <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse2_psll_dq_bs(<2 x i64> %a0) {
+  ; CHECK: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8]
+  %res = call <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
 declare <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64>, i32) nounwind readnone


@ -551,21 +551,21 @@ define <4 x i32> @test_x86_sse2_psrl_d(<4 x i32> %a0, <4 x i32> %a1) {
  ret <4 x i32> %res
 }
 declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) {
-  ; CHECK: vpsrldq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-  %res = call <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
-  ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse2_psrl_dq_bs(<2 x i64> %a0) {
-  ; CHECK: vpsrldq {{.*#+}} xmm0 = xmm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero
-  %res = call <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
-  ret <2 x i64> %res
-}
+
+
+define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) {
+  ; CHECK: vpsrldq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+  %res = call <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse2_psrl_dq_bs(<2 x i64> %a0) {
+  ; CHECK: vpsrldq {{.*#+}} xmm0 = xmm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero
+  %res = call <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
 declare <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64>, i32) nounwind readnone


--- a/test/CodeGen/X86/avx2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx2-intrinsics-x86.ll
@ -158,21 +158,21 @@ define <8 x i32> @test_x86_avx2_psll_d(<8 x i32> %a0, <4 x i32> %a1) {
  ret <8 x i32> %res
 }
 declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone
-
-
-define <4 x i64> @test_x86_avx2_psll_dq(<4 x i64> %a0) {
-  ; CHECK: vpslldq {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
-  %res = call <4 x i64> @llvm.x86.avx2.psll.dq(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
-  ret <4 x i64> %res
-}
-declare <4 x i64> @llvm.x86.avx2.psll.dq(<4 x i64>, i32) nounwind readnone
-
-
-define <4 x i64> @test_x86_avx2_psll_dq_bs(<4 x i64> %a0) {
-  ; CHECK: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8],zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24]
-  %res = call <4 x i64> @llvm.x86.avx2.psll.dq.bs(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
-  ret <4 x i64> %res
-}
+
+
+define <4 x i64> @test_x86_avx2_psll_dq(<4 x i64> %a0) {
+  ; CHECK: vpslldq {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
+  %res = call <4 x i64> @llvm.x86.avx2.psll.dq(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
+  ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.psll.dq(<4 x i64>, i32) nounwind readnone
+
+
+define <4 x i64> @test_x86_avx2_psll_dq_bs(<4 x i64> %a0) {
+  ; CHECK: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8],zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24]
+  %res = call <4 x i64> @llvm.x86.avx2.psll.dq.bs(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
+  ret <4 x i64> %res
+}
 declare <4 x i64> @llvm.x86.avx2.psll.dq.bs(<4 x i64>, i32) nounwind readnone


@ -254,21 +254,21 @@ define <8 x i32> @test_x86_avx2_psrl_d(<8 x i32> %a0, <4 x i32> %a1) {
  ret <8 x i32> %res
 }
 declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone
-
-
-define <4 x i64> @test_x86_avx2_psrl_dq(<4 x i64> %a0) {
-  ; CHECK: vpsrldq {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
-  %res = call <4 x i64> @llvm.x86.avx2.psrl.dq(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
-  ret <4 x i64> %res
-}
-declare <4 x i64> @llvm.x86.avx2.psrl.dq(<4 x i64>, i32) nounwind readnone
-
-
-define <4 x i64> @test_x86_avx2_psrl_dq_bs(<4 x i64> %a0) {
-  ; CHECK: vpsrldq {{.*#+}} ymm0 = ymm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,ymm0[23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero
-  %res = call <4 x i64> @llvm.x86.avx2.psrl.dq.bs(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
-  ret <4 x i64> %res
-}
+
+
+define <4 x i64> @test_x86_avx2_psrl_dq(<4 x i64> %a0) {
+  ; CHECK: vpsrldq {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31]
+  %res = call <4 x i64> @llvm.x86.avx2.psrl.dq(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
+  ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.psrl.dq(<4 x i64>, i32) nounwind readnone
+
+
+define <4 x i64> @test_x86_avx2_psrl_dq_bs(<4 x i64> %a0) {
+  ; CHECK: vpsrldq {{.*#+}} ymm0 = ymm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,ymm0[23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero,zero
+  %res = call <4 x i64> @llvm.x86.avx2.psrl.dq.bs(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
+  ret <4 x i64> %res
+}
 declare <4 x i64> @llvm.x86.avx2.psrl.dq.bs(<4 x i64>, i32) nounwind readnone


--- a/test/CodeGen/X86/fold-tied-op.ll
+++ b/test/CodeGen/X86/fold-tied-op.ll
@ -1,84 +1,84 @@
-; RUN: llc -verify-machineinstrs -mtriple=i386--netbsd < %s | FileCheck %s
-; Regression test for http://reviews.llvm.org/D5701
-
-; ModuleID = 'xxhash.i'
-target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
-target triple = "i386--netbsd"
-
-; CHECK-LABEL: fn1
-; CHECK:       shldl {{.*#+}} 4-byte Folded Spill
-; CHECK:       orl   {{.*#+}} 4-byte Folded Reload
-; CHECK:       shldl {{.*#+}} 4-byte Folded Spill
-; CHECK:       orl   {{.*#+}} 4-byte Folded Reload
-; CHECK:       addl  {{.*#+}} 4-byte Folded Reload
-; CHECK:       imull {{.*#+}} 4-byte Folded Reload
-; CHECK:       orl   {{.*#+}} 4-byte Folded Reload
-; CHECK:       retl
-
-%struct.XXH_state64_t = type { i32, i32, i64, i64, i64 }
-
-@a = common global i32 0, align 4
-@b = common global i64 0, align 8
-
-; Function Attrs: nounwind uwtable
-define i64 @fn1() #0 {
-entry:
-  %0 = load i32* @a, align 4, !tbaa !1
-  %1 = inttoptr i32 %0 to %struct.XXH_state64_t*
-  %total_len = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 0
-  %2 = load i32* %total_len, align 4, !tbaa !5
-  %tobool = icmp eq i32 %2, 0
-  br i1 %tobool, label %if.else, label %if.then
-
-if.then:                                          ; preds = %entry
-  %v3 = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 3
-  %3 = load i64* %v3, align 4, !tbaa !8
-  %v4 = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 4
-  %4 = load i64* %v4, align 4, !tbaa !9
-  %v2 = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 2
-  %5 = load i64* %v2, align 4, !tbaa !10
-  %shl = shl i64 %5, 1
-  %or = or i64 %shl, %5
-  %shl2 = shl i64 %3, 2
-  %shr = lshr i64 %3, 1
-  %or3 = or i64 %shl2, %shr
-  %add = add i64 %or, %or3
-  %mul = mul i64 %4, -4417276706812531889
-  %shl4 = mul i64 %4, -8834553413625063778
-  %shr5 = ashr i64 %mul, 3
-  %or6 = or i64 %shr5, %shl4
-  %mul7 = mul nsw i64 %or6, 1400714785074694791
-  %xor = xor i64 %add, %mul7
-  store i64 %xor, i64* @b, align 8, !tbaa !11
-  %mul8 = mul nsw i64 %xor, 1400714785074694791
-  br label %if.end
-
-if.else:                                          ; preds = %entry
-  %6 = load i64* @b, align 8, !tbaa !11
-  %xor10 = xor i64 %6, -4417276706812531889
-  %mul11 = mul nsw i64 %xor10, 400714785074694791
-  br label %if.end
-
-if.end:                                           ; preds = %if.else, %if.then
-  %storemerge.in = phi i64 [ %mul11, %if.else ], [ %mul8, %if.then ]
-  %storemerge = add i64 %storemerge.in, -8796714831421723037
-  store i64 %storemerge, i64* @b, align 8, !tbaa !11
-  ret i64 undef
-}
-
-attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
-!llvm.ident = !{!0}
-
-!0 = !{!"clang version 3.6 (trunk 219587)"}
-!1 = !{!2, !2, i64 0}
-!2 = !{!"int", !3, i64 0}
-!3 = !{!"omnipotent char", !4, i64 0}
-!4 = !{!"Simple C/C++ TBAA"}
-!5 = !{!6, !2, i64 0}
-!6 = !{!"XXH_state64_t", !2, i64 0, !2, i64 4, !7, i64 8, !7, i64 16, !7, i64 24}
-!7 = !{!"long long", !3, i64 0}
-!8 = !{!6, !7, i64 16}
-!9 = !{!6, !7, i64 24}
-!10 = !{!6, !7, i64 8}
-!11 = !{!7, !7, i64 0}
+; RUN: llc -verify-machineinstrs -mtriple=i386--netbsd < %s | FileCheck %s
+; Regression test for http://reviews.llvm.org/D5701
+
+; ModuleID = 'xxhash.i'
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "i386--netbsd"
+
+; CHECK-LABEL: fn1
+; CHECK:       shldl {{.*#+}} 4-byte Folded Spill
+; CHECK:       orl   {{.*#+}} 4-byte Folded Reload
+; CHECK:       shldl {{.*#+}} 4-byte Folded Spill
+; CHECK:       orl   {{.*#+}} 4-byte Folded Reload
+; CHECK:       addl  {{.*#+}} 4-byte Folded Reload
+; CHECK:       imull {{.*#+}} 4-byte Folded Reload
+; CHECK:       orl   {{.*#+}} 4-byte Folded Reload
+; CHECK:       retl
+
+%struct.XXH_state64_t = type { i32, i32, i64, i64, i64 }
+
+@a = common global i32 0, align 4
+@b = common global i64 0, align 8
+
+; Function Attrs: nounwind uwtable
+define i64 @fn1() #0 {
+entry:
+  %0 = load i32* @a, align 4, !tbaa !1
+  %1 = inttoptr i32 %0 to %struct.XXH_state64_t*
+  %total_len = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 0
+  %2 = load i32* %total_len, align 4, !tbaa !5
+  %tobool = icmp eq i32 %2, 0
+  br i1 %tobool, label %if.else, label %if.then
+
+if.then:                                          ; preds = %entry
+  %v3 = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 3
+  %3 = load i64* %v3, align 4, !tbaa !8
+  %v4 = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 4
+  %4 = load i64* %v4, align 4, !tbaa !9
+  %v2 = getelementptr inbounds %struct.XXH_state64_t* %1, i32 0, i32 2
+  %5 = load i64* %v2, align 4, !tbaa !10
+  %shl = shl i64 %5, 1
+  %or = or i64 %shl, %5
+  %shl2 = shl i64 %3, 2
+  %shr = lshr i64 %3, 1
+  %or3 = or i64 %shl2, %shr
+  %add = add i64 %or, %or3
+  %mul = mul i64 %4, -4417276706812531889
+  %shl4 = mul i64 %4, -8834553413625063778
+  %shr5 = ashr i64 %mul, 3
+  %or6 = or i64 %shr5, %shl4
+  %mul7 = mul nsw i64 %or6, 1400714785074694791
+  %xor = xor i64 %add, %mul7
+  store i64 %xor, i64* @b, align 8, !tbaa !11
+  %mul8 = mul nsw i64 %xor, 1400714785074694791
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %6 = load i64* @b, align 8, !tbaa !11
+  %xor10 = xor i64 %6, -4417276706812531889
+  %mul11 = mul nsw i64 %xor10, 400714785074694791
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  %storemerge.in = phi i64 [ %mul11, %if.else ], [ %mul8, %if.then ]
+  %storemerge = add i64 %storemerge.in, -8796714831421723037
+  store i64 %storemerge, i64* @b, align 8, !tbaa !11
+  ret i64 undef
+}
+
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 3.6 (trunk 219587)"}
+!1 = !{!2, !2, i64 0}
+!2 = !{!"int", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}
+!5 = !{!6, !2, i64 0}
+!6 = !{!"XXH_state64_t", !2, i64 0, !2, i64 4, !7, i64 8, !7, i64 16, !7, i64 24}
+!7 = !{!"long long", !3, i64 0}
+!8 = !{!6, !7, i64 16}
+!9 = !{!6, !7, i64 24}
+!10 = !{!6, !7, i64 8}
+!11 = !{!7, !7, i64 0}
--- a/test/CodeGen/X86/movtopush.ll
+++ b/test/CodeGen/X86/movtopush.ll
@ -1,112 +1,112 @@
-; RUN: llc < %s -mtriple=i686-windows | FileCheck %s -check-prefix=NORMAL
-; RUN: llc < %s -mtriple=i686-windows -force-align-stack -stack-alignment=32 | FileCheck %s -check-prefix=ALIGNED 
-declare void @good(i32 %a, i32 %b, i32 %c, i32 %d)
-declare void @inreg(i32 %a, i32 inreg %b, i32 %c, i32 %d)
-
-; Here, we should have a reserved frame, so we don't expect pushes
-; NORMAL-LABEL: test1
-; NORMAL: subl    $16, %esp
-; NORMAL-NEXT: movl    $4, 12(%esp)
-; NORMAL-NEXT: movl    $3, 8(%esp)
-; NORMAL-NEXT: movl    $2, 4(%esp)
-; NORMAL-NEXT: movl    $1, (%esp)
-; NORMAL-NEXT: call
-define void @test1() {
-entry:
-  call void @good(i32 1, i32 2, i32 3, i32 4)
-  ret void
-}
-
-; Here, we expect a sequence of 4 immediate pushes
-; NORMAL-LABEL: test2
-; NORMAL-NOT: subl {{.*}} %esp
-; NORMAL: pushl   $4
-; NORMAL-NEXT: pushl   $3
-; NORMAL-NEXT: pushl   $2
-; NORMAL-NEXT: pushl   $1
-; NORMAL-NEXT: call
-define void @test2(i32 %k) {
-entry:
-  %a = alloca i32, i32 %k
-  call void @good(i32 1, i32 2, i32 3, i32 4)
-  ret void
-}
-
-; Again, we expect a sequence of 4 immediate pushes
-; Checks that we generate the right pushes for >8bit immediates
-; NORMAL-LABEL: test2b
-; NORMAL-NOT: subl {{.*}} %esp
-; NORMAL: pushl   $4096
-; NORMAL-NEXT: pushl   $3072
-; NORMAL-NEXT: pushl   $2048
-; NORMAL-NEXT: pushl   $1024
-; NORMAL-NEXT: call
-define void @test2b(i32 %k) {
-entry:
-  %a = alloca i32, i32 %k
-  call void @good(i32 1024, i32 2048, i32 3072, i32 4096)
-  ret void
-}
-
-; The first push should push a register
-; NORMAL-LABEL: test3
-; NORMAL-NOT: subl {{.*}} %esp
-; NORMAL: pushl   $4
-; NORMAL-NEXT: pushl   $3
-; NORMAL-NEXT: pushl   $2
-; NORMAL-NEXT: pushl   %e{{..}}
-; NORMAL-NEXT: call
-define void @test3(i32 %k) {
-entry:
-  %a = alloca i32, i32 %k
-  call void @good(i32 %k, i32 2, i32 3, i32 4)
-  ret void
-}
-
-; We don't support weird calling conventions
-; NORMAL-LABEL: test4
-; NORMAL: subl    $12, %esp
-; NORMAL-NEXT: movl    $4, 8(%esp)
-; NORMAL-NEXT: movl    $3, 4(%esp)
-; NORMAL-NEXT: movl    $1, (%esp)
-; NORMAL-NEXT: movl    $2, %eax
-; NORMAL-NEXT: call
-define void @test4(i32 %k) {
-entry:
-  %a = alloca i32, i32 %k
-  call void @inreg(i32 1, i32 2, i32 3, i32 4)
-  ret void
-}
-
-; Check that additional alignment is added when the pushes
-; don't add up to the required alignment.
-; ALIGNED-LABEL: test5
-; ALIGNED: subl    $16, %esp
-; ALIGNED-NEXT: pushl   $4
-; ALIGNED-NEXT: pushl   $3
-; ALIGNED-NEXT: pushl   $2
-; ALIGNED-NEXT: pushl   $1
-; ALIGNED-NEXT: call
-define void @test5(i32 %k) {
-entry:
-  %a = alloca i32, i32 %k
-  call void @good(i32 1, i32 2, i32 3, i32 4)
-  ret void
-}
-
-; Check that pushing the addresses of globals (Or generally, things that 
-; aren't exactly immediates) isn't broken.
-; Fixes PR21878.
-; NORMAL-LABEL: test6
-; NORMAL: pushl    $_ext
-; NORMAL-NEXT: call
-declare void @f(i8*)
-@ext = external constant i8
-
-define void @test6() {
-  call void @f(i8* @ext)
-  br label %bb
-bb:
-  alloca i32
-  ret void
-}
+; RUN: llc < %s -mtriple=i686-windows | FileCheck %s -check-prefix=NORMAL
+; RUN: llc < %s -mtriple=i686-windows -force-align-stack -stack-alignment=32 | FileCheck %s -check-prefix=ALIGNED 
+declare void @good(i32 %a, i32 %b, i32 %c, i32 %d)
+declare void @inreg(i32 %a, i32 inreg %b, i32 %c, i32 %d)
+
+; Here, we should have a reserved frame, so we don't expect pushes
+; NORMAL-LABEL: test1
+; NORMAL: subl    $16, %esp
+; NORMAL-NEXT: movl    $4, 12(%esp)
+; NORMAL-NEXT: movl    $3, 8(%esp)
+; NORMAL-NEXT: movl    $2, 4(%esp)
+; NORMAL-NEXT: movl    $1, (%esp)
+; NORMAL-NEXT: call
+define void @test1() {
+entry:
+  call void @good(i32 1, i32 2, i32 3, i32 4)
+  ret void
+}
+
+; Here, we expect a sequence of 4 immediate pushes
+; NORMAL-LABEL: test2
+; NORMAL-NOT: subl {{.*}} %esp
+; NORMAL: pushl   $4
+; NORMAL-NEXT: pushl   $3
+; NORMAL-NEXT: pushl   $2
+; NORMAL-NEXT: pushl   $1
+; NORMAL-NEXT: call
+define void @test2(i32 %k) {
+entry:
+  %a = alloca i32, i32 %k
+  call void @good(i32 1, i32 2, i32 3, i32 4)
+  ret void
+}
+
+; Again, we expect a sequence of 4 immediate pushes
+; Checks that we generate the right pushes for >8bit immediates
+; NORMAL-LABEL: test2b
+; NORMAL-NOT: subl {{.*}} %esp
+; NORMAL: pushl   $4096
+; NORMAL-NEXT: pushl   $3072
+; NORMAL-NEXT: pushl   $2048
+; NORMAL-NEXT: pushl   $1024
+; NORMAL-NEXT: call
+define void @test2b(i32 %k) {
+entry:
+  %a = alloca i32, i32 %k
+  call void @good(i32 1024, i32 2048, i32 3072, i32 4096)
+  ret void
+}
+
+; The first push should push a register
+; NORMAL-LABEL: test3
+; NORMAL-NOT: subl {{.*}} %esp
+; NORMAL: pushl   $4
+; NORMAL-NEXT: pushl   $3
+; NORMAL-NEXT: pushl   $2
+; NORMAL-NEXT: pushl   %e{{..}}
+; NORMAL-NEXT: call
+define void @test3(i32 %k) {
+entry:
+  %a = alloca i32, i32 %k
+  call void @good(i32 %k, i32 2, i32 3, i32 4)
+  ret void
+}
+
+; We don't support weird calling conventions
+; NORMAL-LABEL: test4
+; NORMAL: subl    $12, %esp
+; NORMAL-NEXT: movl    $4, 8(%esp)
+; NORMAL-NEXT: movl    $3, 4(%esp)
+; NORMAL-NEXT: movl    $1, (%esp)
+; NORMAL-NEXT: movl    $2, %eax
+; NORMAL-NEXT: call
+define void @test4(i32 %k) {
+entry:
+  %a = alloca i32, i32 %k
+  call void @inreg(i32 1, i32 2, i32 3, i32 4)
+  ret void
+}
+
+; Check that additional alignment is added when the pushes
+; don't add up to the required alignment.
+; ALIGNED-LABEL: test5
+; ALIGNED: subl    $16, %esp
+; ALIGNED-NEXT: pushl   $4
+; ALIGNED-NEXT: pushl   $3
+; ALIGNED-NEXT: pushl   $2
+; ALIGNED-NEXT: pushl   $1
+; ALIGNED-NEXT: call
+define void @test5(i32 %k) {
+entry:
+  %a = alloca i32, i32 %k
+  call void @good(i32 1, i32 2, i32 3, i32 4)
+  ret void
+}
+
+; Check that pushing the addresses of globals (Or generally, things that 
+; aren't exactly immediates) isn't broken.
+; Fixes PR21878.
+; NORMAL-LABEL: test6
+; NORMAL: pushl    $_ext
+; NORMAL-NEXT: call
+declare void @f(i8*)
+@ext = external constant i8
+
+define void @test6() {
+  call void @f(i8* @ext)
+  br label %bb
+bb:
+  alloca i32
+  ret void
+}
--- a/test/CodeGen/X86/pr22103.ll
+++ b/test/CodeGen/X86/pr22103.ll
@ -1,19 +1,19 @@
-; RUN: llc < %s | FileCheck %s
-; Don't try to emit a direct call through a TLS global.
-; This fixes PR22103
-
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-@a = external thread_local global i64
-
-; Function Attrs: nounwind
-define void @_Z1fv() {
-; CHECK-NOT: callq *$a
-; CHECK: movq %fs:0, [[RAX:%r..]]
-; CHECK-NEXT: addq    a@GOTTPOFF(%rip), [[RAX]]
-; CHECK-NEXT: callq *[[RAX]]
-entry:
-  call void bitcast (i64* @a to void ()*)()
-  ret void
-}
+; RUN: llc < %s | FileCheck %s
+; Don't try to emit a direct call through a TLS global.
+; This fixes PR22103
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@a = external thread_local global i64
+
+; Function Attrs: nounwind
+define void @_Z1fv() {
+; CHECK-NOT: callq *$a
+; CHECK: movq %fs:0, [[RAX:%r..]]
+; CHECK-NEXT: addq    a@GOTTPOFF(%rip), [[RAX]]
+; CHECK-NEXT: callq *[[RAX]]
+entry:
+  call void bitcast (i64* @a to void ()*)()
+  ret void
+}
--- a/test/CodeGen/X86/sse2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-x86.ll
@ -408,21 +408,21 @@ define <4 x i32> @test_x86_sse2_psll_d(<4 x i32> %a0, <4 x i32> %a1) {
  ret <4 x i32> %res
 }
 declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse2_psll_dq(<2 x i64> %a0) {
-  ; CHECK: pslldq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-  %res = call <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
-  ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse2_psll_dq_bs(<2 x i64> %a0) {
-  ; CHECK: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8]
-  %res = call <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
-  ret <2 x i64> %res
-}
+
+
+define <2 x i64> @test_x86_sse2_psll_dq(<2 x i64> %a0) {
+  ; CHECK: pslldq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+  %res = call <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psll.dq(<2 x i64>, i32) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse2_psll_dq_bs(<2 x i64> %a0) {
+  ; CHECK: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8]
+  %res = call <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
 declare <2 x i64> @llvm.x86.sse2.psll.dq.bs(<2 x i64>, i32) nounwind readnone


@ -504,21 +504,21 @@ define <4 x i32> @test_x86_sse2_psrl_d(<4 x i32> %a0, <4 x i32> %a1) {
  ret <4 x i32> %res
 }
 declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) {
-  ; CHECK: psrldq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
-  %res = call <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
-  ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse2_psrl_dq_bs(<2 x i64> %a0) {
-  ; CHECK: psrldq {{.*#+}} xmm0 = xmm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero
-  %res = call <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
-  ret <2 x i64> %res
-}
+
+
+define <2 x i64> @test_x86_sse2_psrl_dq(<2 x i64> %a0) {
+  ; CHECK: psrldq {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+  %res = call <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.sse2.psrl.dq(<2 x i64>, i32) nounwind readnone
+
+
+define <2 x i64> @test_x86_sse2_psrl_dq_bs(<2 x i64> %a0) {
+  ; CHECK: psrldq {{.*#+}} xmm0 = xmm0[7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero
+  %res = call <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %res
+}
 declare <2 x i64> @llvm.x86.sse2.psrl.dq.bs(<2 x i64>, i32) nounwind readnone


--- a/test/CodeGen/X86/vec_extract-avx.ll
+++ b/test/CodeGen/X86/vec_extract-avx.ll
@ -1,82 +1,82 @@
-target triple = "x86_64-unknown-unknown"
-
-; RUN: llc < %s -march=x86-64 -mattr=+avx | FileCheck %s
-
-; When extracting multiple consecutive elements from a larger
-; vector into a smaller one, do it efficiently. We should use
-; an EXTRACT_SUBVECTOR node internally rather than a bunch of
-; single element extractions. 
-
-; Extracting the low elements only requires using the right kind of store.
-define void @low_v8f32_to_v4f32(<8 x float> %v, <4 x float>* %ptr) {
-  %ext0 = extractelement <8 x float> %v, i32 0
-  %ext1 = extractelement <8 x float> %v, i32 1
-  %ext2 = extractelement <8 x float> %v, i32 2
-  %ext3 = extractelement <8 x float> %v, i32 3
-  %ins0 = insertelement <4 x float> undef, float %ext0, i32 0
-  %ins1 = insertelement <4 x float> %ins0, float %ext1, i32 1
-  %ins2 = insertelement <4 x float> %ins1, float %ext2, i32 2
-  %ins3 = insertelement <4 x float> %ins2, float %ext3, i32 3
-  store <4 x float> %ins3, <4 x float>* %ptr, align 16
-  ret void
-
-; CHECK-LABEL: low_v8f32_to_v4f32
-; CHECK: vmovaps
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
-}
-
-; Extracting the high elements requires just one AVX instruction. 
-define void @high_v8f32_to_v4f32(<8 x float> %v, <4 x float>* %ptr) {
-  %ext0 = extractelement <8 x float> %v, i32 4
-  %ext1 = extractelement <8 x float> %v, i32 5
-  %ext2 = extractelement <8 x float> %v, i32 6
-  %ext3 = extractelement <8 x float> %v, i32 7
-  %ins0 = insertelement <4 x float> undef, float %ext0, i32 0
-  %ins1 = insertelement <4 x float> %ins0, float %ext1, i32 1
-  %ins2 = insertelement <4 x float> %ins1, float %ext2, i32 2
-  %ins3 = insertelement <4 x float> %ins2, float %ext3, i32 3
-  store <4 x float> %ins3, <4 x float>* %ptr, align 16
-  ret void
-
-; CHECK-LABEL: high_v8f32_to_v4f32
-; CHECK: vextractf128
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
-}
-
-; Make sure element type doesn't alter the codegen. Note that
-; if we were actually using the vector in this function and
-; have AVX2, we should generate vextracti128 (the int version).
-define void @high_v8i32_to_v4i32(<8 x i32> %v, <4 x i32>* %ptr) {
-  %ext0 = extractelement <8 x i32> %v, i32 4
-  %ext1 = extractelement <8 x i32> %v, i32 5
-  %ext2 = extractelement <8 x i32> %v, i32 6
-  %ext3 = extractelement <8 x i32> %v, i32 7
-  %ins0 = insertelement <4 x i32> undef, i32 %ext0, i32 0
-  %ins1 = insertelement <4 x i32> %ins0, i32 %ext1, i32 1
-  %ins2 = insertelement <4 x i32> %ins1, i32 %ext2, i32 2
-  %ins3 = insertelement <4 x i32> %ins2, i32 %ext3, i32 3
-  store <4 x i32> %ins3, <4 x i32>* %ptr, align 16
-  ret void
-
-; CHECK-LABEL: high_v8i32_to_v4i32
-; CHECK: vextractf128
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
-}
-
-; Make sure that element size doesn't alter the codegen.
-define void @high_v4f64_to_v2f64(<4 x double> %v, <2 x double>* %ptr) {
-  %ext0 = extractelement <4 x double> %v, i32 2
-  %ext1 = extractelement <4 x double> %v, i32 3
-  %ins0 = insertelement <2 x double> undef, double %ext0, i32 0
-  %ins1 = insertelement <2 x double> %ins0, double %ext1, i32 1
-  store <2 x double> %ins1, <2 x double>* %ptr, align 16
-  ret void
-
-; CHECK-LABEL: high_v4f64_to_v2f64
-; CHECK: vextractf128
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
-}
+target triple = "x86_64-unknown-unknown"
+
+; RUN: llc < %s -march=x86-64 -mattr=+avx | FileCheck %s
+
+; When extracting multiple consecutive elements from a larger
+; vector into a smaller one, do it efficiently. We should use
+; an EXTRACT_SUBVECTOR node internally rather than a bunch of
+; single element extractions. 
+
+; Extracting the low elements only requires using the right kind of store.
+define void @low_v8f32_to_v4f32(<8 x float> %v, <4 x float>* %ptr) {
+  %ext0 = extractelement <8 x float> %v, i32 0
+  %ext1 = extractelement <8 x float> %v, i32 1
+  %ext2 = extractelement <8 x float> %v, i32 2
+  %ext3 = extractelement <8 x float> %v, i32 3
+  %ins0 = insertelement <4 x float> undef, float %ext0, i32 0
+  %ins1 = insertelement <4 x float> %ins0, float %ext1, i32 1
+  %ins2 = insertelement <4 x float> %ins1, float %ext2, i32 2
+  %ins3 = insertelement <4 x float> %ins2, float %ext3, i32 3
+  store <4 x float> %ins3, <4 x float>* %ptr, align 16
+  ret void
+
+; CHECK-LABEL: low_v8f32_to_v4f32
+; CHECK: vmovaps
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+}
+
+; Extracting the high elements requires just one AVX instruction. 
+define void @high_v8f32_to_v4f32(<8 x float> %v, <4 x float>* %ptr) {
+  %ext0 = extractelement <8 x float> %v, i32 4
+  %ext1 = extractelement <8 x float> %v, i32 5
+  %ext2 = extractelement <8 x float> %v, i32 6
+  %ext3 = extractelement <8 x float> %v, i32 7
+  %ins0 = insertelement <4 x float> undef, float %ext0, i32 0
+  %ins1 = insertelement <4 x float> %ins0, float %ext1, i32 1
+  %ins2 = insertelement <4 x float> %ins1, float %ext2, i32 2
+  %ins3 = insertelement <4 x float> %ins2, float %ext3, i32 3
+  store <4 x float> %ins3, <4 x float>* %ptr, align 16
+  ret void
+
+; CHECK-LABEL: high_v8f32_to_v4f32
+; CHECK: vextractf128
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+}
+
+; Make sure element type doesn't alter the codegen. Note that
+; if we were actually using the vector in this function and
+; have AVX2, we should generate vextracti128 (the int version).
+define void @high_v8i32_to_v4i32(<8 x i32> %v, <4 x i32>* %ptr) {
+  %ext0 = extractelement <8 x i32> %v, i32 4
+  %ext1 = extractelement <8 x i32> %v, i32 5
+  %ext2 = extractelement <8 x i32> %v, i32 6
+  %ext3 = extractelement <8 x i32> %v, i32 7
+  %ins0 = insertelement <4 x i32> undef, i32 %ext0, i32 0
+  %ins1 = insertelement <4 x i32> %ins0, i32 %ext1, i32 1
+  %ins2 = insertelement <4 x i32> %ins1, i32 %ext2, i32 2
+  %ins3 = insertelement <4 x i32> %ins2, i32 %ext3, i32 3
+  store <4 x i32> %ins3, <4 x i32>* %ptr, align 16
+  ret void
+
+; CHECK-LABEL: high_v8i32_to_v4i32
+; CHECK: vextractf128
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+}
+
+; Make sure that element size doesn't alter the codegen.
+define void @high_v4f64_to_v2f64(<4 x double> %v, <2 x double>* %ptr) {
+  %ext0 = extractelement <4 x double> %v, i32 2
+  %ext1 = extractelement <4 x double> %v, i32 3
+  %ins0 = insertelement <2 x double> undef, double %ext0, i32 0
+  %ins1 = insertelement <2 x double> %ins0, double %ext1, i32 1
+  store <2 x double> %ins1, <2 x double>* %ptr, align 16
+  ret void
+
+; CHECK-LABEL: high_v4f64_to_v2f64
+; CHECK: vextractf128
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+}
--- a/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v16.ll
@ -467,23 +467,23 @@ define <16 x i8> @PR20540(<8 x i8> %a) {
 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,4,4]
 ; SSE2-NEXT:    packuswb %xmm1, %xmm0
 ; SSE2-NEXT:    retq
-;
-; SSSE3-LABEL: PR20540:
-; SSSE3:       # BB#0:
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: PR20540:
-; SSE41:       # BB#0:
-; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: PR20540:
-; AVX:       # BB#0:
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT:    retq
-  %shuffle = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
-  ret <16 x i8> %shuffle
+;
+; SSSE3-LABEL: PR20540:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: PR20540:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: PR20540:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    retq
+  %shuffle = shufflevector <8 x i8> %a, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+  ret <16 x i8> %shuffle
 }

 define <16 x i8> @shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(i8 %i) {
@ -493,25 +493,25 @@ define <16 x i8> @shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(
 ; SSE2-NEXT:    movd %eax, %xmm0
 ; SSE2-NEXT:    retq
 ;
-; SSSE3-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; SSSE3:       # BB#0:
-; SSSE3-NEXT:    movd %edi, %xmm0
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; SSE41:       # BB#0:
-; SSE41-NEXT:    movd %edi, %xmm0
-; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; AVX:       # BB#0:
-; AVX-NEXT:    vmovd %edi, %xmm0
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT:    retq
-  %a = insertelement <16 x i8> undef, i8 %i, i32 0
-  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movd %edi, %xmm0
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movd %edi, %xmm0
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovd %edi, %xmm0
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    retq
+  %a = insertelement <16 x i8> undef, i8 %i, i32 0
+  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %shuffle
 }

@ -523,25 +523,25 @@ define <16 x i8> @shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(
 ; SSE2-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10]
 ; SSE2-NEXT:    retq
 ;
-; SSSE3-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; SSSE3:       # BB#0:
-; SSSE3-NEXT:    movd %edi, %xmm0
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; SSE41:       # BB#0:
-; SSE41-NEXT:    movd %edi, %xmm0
-; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; AVX:       # BB#0:
-; AVX-NEXT:    vmovd %edi, %xmm0
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT:    retq
-  %a = insertelement <16 x i8> undef, i8 %i, i32 0
-  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+; SSSE3-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movd %edi, %xmm0
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movd %edi, %xmm0
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_zz_zz_zz_zz_zz_16_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovd %edi, %xmm0
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    retq
+  %a = insertelement <16 x i8> undef, i8 %i, i32 0
+  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  ret <16 x i8> %shuffle
 }

@ -571,27 +571,27 @@ define <16 x i8> @shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz(
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; SSSE3:       # BB#0:
-; SSSE3-NEXT:    movd %edi, %xmm0
-; SSSE3-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12]
-; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSSE3-NEXT:    retq
-;
-; SSE41-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; SSE41:       # BB#0:
-; SSE41-NEXT:    movd %edi, %xmm0
-; SSE41-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12]
-; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
-; AVX:       # BB#0:
-; AVX-NEXT:    vmovd %edi, %xmm0
-; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
-; AVX-NEXT:    retq
-  %a = insertelement <16 x i8> undef, i8 %i, i32 3
-  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 19, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movd %edi, %xmm0
+; SSSE3-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12]
+; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movd %edi, %xmm0
+; SSE41-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12]
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: shuffle_v16i8_zz_zz_19_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz_zz:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovd %edi, %xmm0
+; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    retq
+  %a = insertelement <16 x i8> undef, i8 %i, i32 3
+  %shuffle = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 1, i32 19, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %shuffle
 }