Generate sse-intel-ocl.ll automatically. NFC

2025-01-12 10:52:38 +00:00 · 2022-05-07 22:44:40 +00:00 · 2022-05-07 22:44:40 +00:00 · 5cd690ad9c
commit 5cd690ad9c
parent bead7a2ed5
1 changed files with 246 additions and 62 deletions
--- a/llvm/test/CodeGen/X86/sse-intel-ocl.ll
+++ b/llvm/test/CodeGen/X86/sse-intel-ocl.ll
@ -1,68 +1,192 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i386-pc-win32 -mcpu=nehalem | FileCheck -check-prefix=WIN32 %s
 ; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=nehalem | FileCheck -check-prefix=WIN64 %s
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=nehalem | FileCheck -check-prefix=NOT_WIN %s

 declare <16 x float> @func_float16_ptr(<16 x float>, <16 x float> *)
 declare <16 x float> @func_float16(<16 x float>, <16 x float>)
-; WIN64: testf16_inp
-; WIN64: addps  {{.*}}, {{%xmm[0-3]}}
-; WIN64: addps  {{.*}}, {{%xmm[0-3]}}
-; WIN64: addps  {{.*}}, {{%xmm[0-3]}}
-; WIN64: addps  {{.*}}, {{%xmm[0-3]}}
-; WIN64: leaq    {{.*}}(%rsp), %rcx
-; WIN64: call
-; WIN64: ret
-
-; WIN32: testf16_inp
-; WIN32: pushl   %eax
-; WIN32: addps  {{.*}}, {{%xmm[0-3]}}
-; WIN32: addps  {{.*}}, {{%xmm[0-3]}}
-; WIN32: addps  {{.*}}, {{%xmm[0-3]}}
-; WIN32: addps  {{.*}}, {{%xmm[0-3]}}
-; WIN32: call
-; WIN32: ret
-
-; NOT_WIN: testf16_inp
-; NOT_WIN: addps  {{.*}}, {{%xmm[0-3]}}
-; NOT_WIN: addps  {{.*}}, {{%xmm[0-3]}}
-; NOT_WIN: addps  {{.*}}, {{%xmm[0-3]}}
-; NOT_WIN: addps  {{.*}}, {{%xmm[0-3]}}
-; NOT_WIN: movq   %rsp, %rdi
-; NOT_WIN: call
-; NOT_WIN: ret

 ;test calling conventions - input parameters
 define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind {
+; WIN32-LABEL: testf16_inp:
+; WIN32:       # %bb.0:
+; WIN32-NEXT:    pushl %ebp
+; WIN32-NEXT:    movl %esp, %ebp
+; WIN32-NEXT:    andl $-16, %esp
+; WIN32-NEXT:    subl $80, %esp
+; WIN32-NEXT:    movups 72(%ebp), %xmm4
+; WIN32-NEXT:    movups 8(%ebp), %xmm3
+; WIN32-NEXT:    addps %xmm4, %xmm3
+; WIN32-NEXT:    movups 56(%ebp), %xmm4
+; WIN32-NEXT:    movups 40(%ebp), %xmm5
+; WIN32-NEXT:    movups 24(%ebp), %xmm6
+; WIN32-NEXT:    movl %esp, %eax
+; WIN32-NEXT:    addps %xmm6, %xmm0
+; WIN32-NEXT:    addps %xmm5, %xmm1
+; WIN32-NEXT:    addps %xmm4, %xmm2
+; WIN32-NEXT:    pushl %eax
+; WIN32-NEXT:    calll _func_float16_ptr
+; WIN32-NEXT:    addl $4, %esp
+; WIN32-NEXT:    addps (%esp), %xmm0
+; WIN32-NEXT:    addps {{[0-9]+}}(%esp), %xmm1
+; WIN32-NEXT:    addps {{[0-9]+}}(%esp), %xmm2
+; WIN32-NEXT:    addps {{[0-9]+}}(%esp), %xmm3
+; WIN32-NEXT:    movl %ebp, %esp
+; WIN32-NEXT:    popl %ebp
+; WIN32-NEXT:    retl
+;
+; WIN64-LABEL: testf16_inp:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    subq $104, %rsp
+; WIN64-NEXT:    movaps (%r9), %xmm3
+; WIN64-NEXT:    movaps (%r8), %xmm2
+; WIN64-NEXT:    movaps (%rdx), %xmm1
+; WIN64-NEXT:    movaps (%rcx), %xmm0
+; WIN64-NEXT:    movq {{[0-9]+}}(%rsp), %r8
+; WIN64-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; WIN64-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; WIN64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; WIN64-NEXT:    addps (%rax), %xmm0
+; WIN64-NEXT:    addps (%rdx), %xmm1
+; WIN64-NEXT:    addps (%rcx), %xmm2
+; WIN64-NEXT:    addps (%r8), %xmm3
+; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN64-NEXT:    callq func_float16_ptr
+; WIN64-NEXT:    addps {{[0-9]+}}(%rsp), %xmm0
+; WIN64-NEXT:    addps {{[0-9]+}}(%rsp), %xmm1
+; WIN64-NEXT:    addps {{[0-9]+}}(%rsp), %xmm2
+; WIN64-NEXT:    addps {{[0-9]+}}(%rsp), %xmm3
+; WIN64-NEXT:    addq $104, %rsp
+; WIN64-NEXT:    retq
+;
+; NOT_WIN-LABEL: testf16_inp:
+; NOT_WIN:       ## %bb.0:
+; NOT_WIN-NEXT:    subq $72, %rsp
+; NOT_WIN-NEXT:    addps %xmm4, %xmm0
+; NOT_WIN-NEXT:    addps %xmm5, %xmm1
+; NOT_WIN-NEXT:    addps %xmm6, %xmm2
+; NOT_WIN-NEXT:    addps %xmm7, %xmm3
+; NOT_WIN-NEXT:    movq %rsp, %rdi
+; NOT_WIN-NEXT:    callq _func_float16_ptr
+; NOT_WIN-NEXT:    addps (%rsp), %xmm0
+; NOT_WIN-NEXT:    addps {{[0-9]+}}(%rsp), %xmm1
+; NOT_WIN-NEXT:    addps {{[0-9]+}}(%rsp), %xmm2
+; NOT_WIN-NEXT:    addps {{[0-9]+}}(%rsp), %xmm3
+; NOT_WIN-NEXT:    addq $72, %rsp
+; NOT_WIN-NEXT:    retq
  %y = alloca <16 x float>, align 16
  %x = fadd <16 x float> %a, %b
-  %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y) 
+  %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
  %2 = load <16 x float>, <16 x float>* %y, align 16
  %3 = fadd <16 x float> %2, %1
  ret <16 x float> %3
 }

-;test calling conventions - preserved registers
-
-; preserved xmm6-xmm15
-; WIN64: testf16_regs
-; WIN64: call
-; WIN64: addps  {{%xmm[6-9]}}, {{.*}}
-; WIN64: addps  {{%xmm[6-9]}}, {{.*}}
-; WIN64: ret
-
-; preserved xmm8-xmm15
-; NOT_WIN: testf16_regs
-; NOT_WIN: call
-; NOT_WIN: addps  {{%xmm([8-9]|1[0-1])}}, {{.*}}
-; NOT_WIN: addps  {{%xmm([8-9]|1[0-1])}}, {{.*}}
-; NOT_WIN: addps  {{%xmm([8-9]|1[0-1])}}, {{.*}}
-; NOT_WIN: addps  {{%xmm([8-9]|1[0-1])}}, {{.*}}
-; NOT_WIN: ret
+; test calling conventions - preserved registers

+; preserves xmm6-xmm15 on Windows, xmm8-xmm15 on other platforms.
 define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {
+; WIN32-LABEL: testf16_regs:
+; WIN32:       # %bb.0:
+; WIN32-NEXT:    pushl %ebp
+; WIN32-NEXT:    movl %esp, %ebp
+; WIN32-NEXT:    andl $-16, %esp
+; WIN32-NEXT:    subl $80, %esp
+; WIN32-NEXT:    movups 72(%ebp), %xmm6
+; WIN32-NEXT:    movups 8(%ebp), %xmm3
+; WIN32-NEXT:    movups 56(%ebp), %xmm7
+; WIN32-NEXT:    movups 40(%ebp), %xmm5
+; WIN32-NEXT:    movups 24(%ebp), %xmm4
+; WIN32-NEXT:    movl %esp, %eax
+; WIN32-NEXT:    addps %xmm4, %xmm0
+; WIN32-NEXT:    addps %xmm5, %xmm1
+; WIN32-NEXT:    addps %xmm7, %xmm2
+; WIN32-NEXT:    addps %xmm6, %xmm3
+; WIN32-NEXT:    pushl %eax
+; WIN32-NEXT:    calll _func_float16_ptr
+; WIN32-NEXT:    addl $4, %esp
+; WIN32-NEXT:    movups 72(%ebp), %xmm4
+; WIN32-NEXT:    addps %xmm4, %xmm3
+; WIN32-NEXT:    movups 56(%ebp), %xmm4
+; WIN32-NEXT:    addps %xmm4, %xmm2
+; WIN32-NEXT:    movups 40(%ebp), %xmm4
+; WIN32-NEXT:    addps %xmm4, %xmm1
+; WIN32-NEXT:    movups 24(%ebp), %xmm4
+; WIN32-NEXT:    addps %xmm4, %xmm0
+; WIN32-NEXT:    addps (%esp), %xmm0
+; WIN32-NEXT:    addps {{[0-9]+}}(%esp), %xmm1
+; WIN32-NEXT:    addps {{[0-9]+}}(%esp), %xmm2
+; WIN32-NEXT:    addps {{[0-9]+}}(%esp), %xmm3
+; WIN32-NEXT:    movl %ebp, %esp
+; WIN32-NEXT:    popl %ebp
+; WIN32-NEXT:    retl
+;
+; WIN64-LABEL: testf16_regs:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    subq $168, %rsp
+; WIN64-NEXT:    movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; WIN64-NEXT:    movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; WIN64-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; WIN64-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; WIN64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; WIN64-NEXT:    movaps (%rax), %xmm6
+; WIN64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; WIN64-NEXT:    movaps (%rax), %xmm7
+; WIN64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; WIN64-NEXT:    movaps (%rax), %xmm8
+; WIN64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; WIN64-NEXT:    movaps (%rax), %xmm9
+; WIN64-NEXT:    movaps (%rcx), %xmm0
+; WIN64-NEXT:    addps %xmm9, %xmm0
+; WIN64-NEXT:    movaps (%rdx), %xmm1
+; WIN64-NEXT:    addps %xmm8, %xmm1
+; WIN64-NEXT:    movaps (%r8), %xmm2
+; WIN64-NEXT:    addps %xmm7, %xmm2
+; WIN64-NEXT:    movaps (%r9), %xmm3
+; WIN64-NEXT:    addps %xmm6, %xmm3
+; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN64-NEXT:    callq func_float16_ptr
+; WIN64-NEXT:    addps %xmm6, %xmm3
+; WIN64-NEXT:    addps %xmm7, %xmm2
+; WIN64-NEXT:    addps %xmm8, %xmm1
+; WIN64-NEXT:    addps %xmm9, %xmm0
+; WIN64-NEXT:    addps {{[0-9]+}}(%rsp), %xmm0
+; WIN64-NEXT:    addps {{[0-9]+}}(%rsp), %xmm1
+; WIN64-NEXT:    addps {{[0-9]+}}(%rsp), %xmm2
+; WIN64-NEXT:    addps {{[0-9]+}}(%rsp), %xmm3
+; WIN64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; WIN64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; WIN64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
+; WIN64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
+; WIN64-NEXT:    addq $168, %rsp
+; WIN64-NEXT:    retq
+;
+; NOT_WIN-LABEL: testf16_regs:
+; NOT_WIN:       ## %bb.0:
+; NOT_WIN-NEXT:    subq $72, %rsp
+; NOT_WIN-NEXT:    movaps %xmm7, %xmm9
+; NOT_WIN-NEXT:    movaps %xmm6, %xmm10
+; NOT_WIN-NEXT:    movaps %xmm5, %xmm11
+; NOT_WIN-NEXT:    movaps %xmm4, %xmm8
+; NOT_WIN-NEXT:    addps %xmm4, %xmm0
+; NOT_WIN-NEXT:    addps %xmm5, %xmm1
+; NOT_WIN-NEXT:    addps %xmm6, %xmm2
+; NOT_WIN-NEXT:    addps %xmm7, %xmm3
+; NOT_WIN-NEXT:    movq %rsp, %rdi
+; NOT_WIN-NEXT:    callq _func_float16_ptr
+; NOT_WIN-NEXT:    addps %xmm9, %xmm3
+; NOT_WIN-NEXT:    addps %xmm10, %xmm2
+; NOT_WIN-NEXT:    addps %xmm11, %xmm1
+; NOT_WIN-NEXT:    addps %xmm8, %xmm0
+; NOT_WIN-NEXT:    addps (%rsp), %xmm0
+; NOT_WIN-NEXT:    addps {{[0-9]+}}(%rsp), %xmm1
+; NOT_WIN-NEXT:    addps {{[0-9]+}}(%rsp), %xmm2
+; NOT_WIN-NEXT:    addps {{[0-9]+}}(%rsp), %xmm3
+; NOT_WIN-NEXT:    addq $72, %rsp
+; NOT_WIN-NEXT:    retq
  %y = alloca <16 x float>, align 16
  %x = fadd <16 x float> %a, %b
-  %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y) 
+  %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
  %2 = load <16 x float>, <16 x float>* %y, align 16
  %3 = fadd <16 x float> %1, %b
  %4 = fadd <16 x float> %2, %3
@ -70,24 +194,84 @@ define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {
 }

 ; test calling conventions - prolog and epilog
-; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}}  ## 16-byte Spill
-; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}}  ## 16-byte Spill
-; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}}  ## 16-byte Spill
-; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}}  ## 16-byte Spill
-; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}}  ## 16-byte Spill
-; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}}  ## 16-byte Spill
-; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}}  ## 16-byte Spill
-; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}}  ## 16-byte Spill
-; NOT_WIN: call
-; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}}  ## 16-byte Reload
-; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}}  ## 16-byte Reload
-; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}}  ## 16-byte Reload
-; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}}  ## 16-byte Reload
-; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}}  ## 16-byte Reload
-; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}}  ## 16-byte Reload
-; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}}  ## 16-byte Reload
-; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}}  ## 16-byte Reload
 define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x float> %b) nounwind {
+; WIN32-LABEL: test_prolog_epilog:
+; WIN32:       # %bb.0:
+; WIN32-NEXT:    pushl %ebp
+; WIN32-NEXT:    movl %esp, %ebp
+; WIN32-NEXT:    andl $-16, %esp
+; WIN32-NEXT:    subl $96, %esp
+; WIN32-NEXT:    movups 8(%ebp), %xmm4
+; WIN32-NEXT:    movups 24(%ebp), %xmm5
+; WIN32-NEXT:    movups 40(%ebp), %xmm6
+; WIN32-NEXT:    movups 56(%ebp), %xmm7
+; WIN32-NEXT:    movups %xmm7, {{[0-9]+}}(%esp)
+; WIN32-NEXT:    movups %xmm6, {{[0-9]+}}(%esp)
+; WIN32-NEXT:    movups %xmm5, {{[0-9]+}}(%esp)
+; WIN32-NEXT:    movups %xmm4, {{[0-9]+}}(%esp)
+; WIN32-NEXT:    movups %xmm3, (%esp)
+; WIN32-NEXT:    calll _func_float16
+; WIN32-NEXT:    movl %ebp, %esp
+; WIN32-NEXT:    popl %ebp
+; WIN32-NEXT:    retl
+;
+; WIN64-LABEL: test_prolog_epilog:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    subq $232, %rsp
+; WIN64-NEXT:    movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; WIN64-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; WIN64-NEXT:    movaps (%r9), %xmm4
+; WIN64-NEXT:    movaps (%rdx), %xmm5
+; WIN64-NEXT:    movaps (%r8), %xmm6
+; WIN64-NEXT:    movaps (%rcx), %xmm7
+; WIN64-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; WIN64-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; WIN64-NEXT:    movaps %xmm2, {{[0-9]+}}(%rsp)
+; WIN64-NEXT:    movaps %xmm3, {{[0-9]+}}(%rsp)
+; WIN64-NEXT:    movaps %xmm7, {{[0-9]+}}(%rsp)
+; WIN64-NEXT:    movaps %xmm6, {{[0-9]+}}(%rsp)
+; WIN64-NEXT:    movaps %xmm5, {{[0-9]+}}(%rsp)
+; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
+; WIN64-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
+; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
+; WIN64-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
+; WIN64-NEXT:    movaps %xmm4, {{[0-9]+}}(%rsp)
+; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
+; WIN64-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
+; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rax
+; WIN64-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
+; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
+; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
+; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %r8
+; WIN64-NEXT:    leaq {{[0-9]+}}(%rsp), %r9
+; WIN64-NEXT:    callq func_float16
+; WIN64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; WIN64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
+; WIN64-NEXT:    addq $232, %rsp
+; WIN64-NEXT:    retq
+;
+; NOT_WIN-LABEL: test_prolog_epilog:
+; NOT_WIN:       ## %bb.0:
+; NOT_WIN-NEXT:    subq $136, %rsp
+; NOT_WIN-NEXT:    movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; NOT_WIN-NEXT:    movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; NOT_WIN-NEXT:    movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; NOT_WIN-NEXT:    movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; NOT_WIN-NEXT:    movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; NOT_WIN-NEXT:    movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; NOT_WIN-NEXT:    movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) ## 16-byte Spill
+; NOT_WIN-NEXT:    movaps %xmm8, (%rsp) ## 16-byte Spill
+; NOT_WIN-NEXT:    callq _func_float16
+; NOT_WIN-NEXT:    movaps (%rsp), %xmm8 ## 16-byte Reload
+; NOT_WIN-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 ## 16-byte Reload
+; NOT_WIN-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 ## 16-byte Reload
+; NOT_WIN-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 ## 16-byte Reload
+; NOT_WIN-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 ## 16-byte Reload
+; NOT_WIN-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 ## 16-byte Reload
+; NOT_WIN-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 ## 16-byte Reload
+; NOT_WIN-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 ## 16-byte Reload
+; NOT_WIN-NEXT:    addq $136, %rsp
+; NOT_WIN-NEXT:    retq
   %c = call <16 x float> @func_float16(<16 x float> %a, <16 x float> %b)
   ret <16 x float> %c
 }