llvm-capstone/clang/test/OpenMP/nvptx_parallel_for_codegen.cpp
JonChesterfield 5d02ca49a2 [libomptarget][nvptx] Undef, weak shared variables
[libomptarget][nvptx] Undef, weak shared variables

Shared variables on nvptx, and LDS on amdgcn, are uninitialized at
the start of kernel execution. Therefore create the variables with
undef instead of zeros, motivated in part by the amdgcn back end
rejecting LDS+initializer.

Common is zero-initialized, which seems incompatible with shared. Thus
change them to weak, following the direction of
https://reviews.llvm.org/rG7b3eabdcd215

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D90248
2020-10-28 14:25:36 +00:00

138 lines
5.9 KiB
C++

// Test target codegen - host bc file has to be created first.
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix SEQ
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix PAR
// expected-no-diagnostics
#ifndef HEADER
#define HEADER
template<typename tx>
tx ftemplate(int n) { // device-offloaded kernel under test; CHECK lines below verify its NVPTX codegen
tx b[10]; // stack array, implicitly mapped tofrom the target region below
#pragma omp target
{ // NOTE: the emitted kernel name encodes this pragma's source line (see "l13" in CHECK-LABEL) — adding/removing lines above breaks the test
tx d = n; // local inside the target region, initialized from the captured n; shared with the parallel region (see __kmpc_get_team_static_memory / push_stack CHECKs)
#pragma omp parallel for
for(int i=0; i<10; i++) { // lowered via __kmpc_for_static_init_4 with schedule 33 (static chunked), per the CHECKs
b[i] += d;
}
b[3] += 1; // sequential (master-only) update after the parallel region ends
}
return b[3];
}
int bar(int n){ // host-side entry point; exists to instantiate ftemplate<int> so the target region is emitted
int a = 0;
a += ftemplate<int>(n); // instantiation drives codegen of the offloaded kernel checked above
return a;
}
// SEQ: [[MEM_TY:%.+]] = type { [128 x i8] }
// SEQ-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] undef
// SEQ-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* undef
// SEQ-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4
// SEQ-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l13}}_worker()
// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK: call i1 @__kmpc_kernel_parallel(
// CHECK: call void @__omp_outlined___wrapper(
// CHECK: define weak void @__omp_offloading_{{.*}}l13(
// CHECK: call void @__omp_offloading_{{.*}}l13_worker()
// CHECK: call void @__kmpc_kernel_init(
// CHECK: call void @__kmpc_data_sharing_init_stack()
// SEQ: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]],
// SEQ: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]],
// SEQ: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i64 %7, i16 %6, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
// SEQ: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]],
// SEQ: [[STACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i{{64|32}} 0
// PAR: [[STACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} 4, i16 1)
// CHECK: call void @__kmpc_kernel_prepare_parallel(
// CHECK: call void @__kmpc_begin_sharing_variables({{.*}}, i64 2)
// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK: call void @__kmpc_end_sharing_variables()
// SEQ: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]],
// SEQ: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[IS_SHARED]])
// PAR: call void @__kmpc_data_sharing_pop_stack(i8* [[STACK]])
// CHECK: call void @__kmpc_kernel_deinit(i16 1)
// CHECK: define internal void @__omp_outlined__(
// CHECK: alloca
// CHECK: alloca
// CHECK: alloca
// CHECK: alloca
// CHECK: [[OMP_IV:%.*]] = alloca i32
// CHECK: store i32 0, {{.*}} [[OMP_LB:%.+]],
// CHECK: store i32 9, {{.*}} [[OMP_UB:%.+]],
// CHECK: store i32 1, {{.*}} [[OMP_ST:%.+]],
// CHECK: call void @__kmpc_for_static_init_4({{.*}} i32 33, {{.*}} [[OMP_LB]], {{.*}} [[OMP_UB]], {{.*}} [[OMP_ST]], i32 1, i32 1)
// CHECK: br label %[[OMP_DISPATCH_COND:.+]]
// CHECK: [[OMP_DISPATCH_COND]]
// CHECK: [[OMP_UB_1:%.+]] = load {{.*}} [[OMP_UB]]
// CHECK: [[COMP_1:%.+]] = icmp sgt {{.*}} [[OMP_UB_1]]
// CHECK: br i1 [[COMP_1]], label %[[COND_TRUE:.+]], label %[[COND_FALSE:.+]]
// CHECK: [[COND_TRUE]]
// CHECK: br label %[[COND_END:.+]]
// CHECK: [[COND_FALSE]]
// CHECK: [[OMP_UB_2:%.+]] = load {{.*}}* [[OMP_UB]]
// CHECK: br label %[[COND_END]]
// CHECK: [[COND_END]]
// CHECK: [[COND_RES:%.+]] = phi i32 [ 9, %[[COND_TRUE]] ], [ [[OMP_UB_2]], %[[COND_FALSE]] ]
// CHECK: store i32 [[COND_RES]], i32* [[OMP_UB]]
// CHECK: [[OMP_LB_1:%.+]] = load i32, i32* [[OMP_LB]]
// CHECK: store i32 [[OMP_LB_1]], i32* [[OMP_IV]]
// CHECK: [[OMP_IV_1:%.+]] = load i32, i32* [[OMP_IV]]
// CHECK: [[OMP_UB_3:%.+]] = load i32, i32* [[OMP_UB]]
// CHECK: [[COMP_2:%.+]] = icmp sle i32 [[OMP_IV_1]], [[OMP_UB_3]]
// CHECK: br i1 [[COMP_2]], label %[[DISPATCH_BODY:.+]], label %[[DISPATCH_END:.+]]
// CHECK: [[DISPATCH_BODY]]
// CHECK: br label %[[OMP_INNER_FOR_COND:.+]]
// CHECK: [[OMP_INNER_FOR_COND]]
// CHECK: [[OMP_IV_2:%.+]] = load i32, i32* [[OMP_IV]]
// CHECK: [[OMP_UB_4:%.+]] = load i32, i32* [[OMP_UB]]
// CHECK: [[COMP_3:%.+]] = icmp sle i32 [[OMP_IV_2]], [[OMP_UB_4]]
// CHECK: br i1 [[COMP_3]], label %[[OMP_INNER_FOR_BODY:.+]], label %[[OMP_INNER_FOR_END:.+]]
// CHECK: [[OMP_INNER_FOR_BODY]]
// CHECK: br label %[[OMP_BODY_CONTINUE:.+]]
// CHECK: [[OMP_BODY_CONTINUE]]
// CHECK: br label %[[OMP_INNER_FOR_INC:.+]]
// CHECK: [[OMP_INNER_FOR_INC]]
// CHECK: [[OMP_IV_3:%.+]] = load i32, i32* [[OMP_IV]]
// CHECK: [[ADD_1:%.+]] = add nsw i32 [[OMP_IV_3]], 1
// CHECK: store i32 [[ADD_1]], i32* [[OMP_IV]]
// CHECK: br label %[[OMP_INNER_FOR_COND]]
// CHECK: [[OMP_INNER_FOR_COND]]
// CHECK: br label %[[OMP_DISPATCH_INC:.+]]
// CHECK: [[OMP_DISPATCH_INC]]
// CHECK: [[OMP_LB_2:%.+]] = load i32, i32* [[OMP_LB]]
// CHECK: [[OMP_ST_1:%.+]] = load i32, i32* [[OMP_ST]]
// CHECK: [[ADD_2:%.+]] = add nsw i32 [[OMP_LB_2]], [[OMP_ST_1]]
// CHECK: store i32 [[ADD_2]], i32* [[OMP_LB]]
// CHECK: [[OMP_UB_5:%.+]] = load i32, i32* [[OMP_UB]]
// CHECK: [[OMP_ST_2:%.+]] = load i32, i32* [[OMP_ST]]
// CHECK: [[ADD_3:%.+]] = add nsw i32 [[OMP_UB_5]], [[OMP_ST_2]]
// CHECK: store i32 [[ADD_3]], i32* [[OMP_UB]]
// CHECK: [[DISPATCH_END]]
// CHECK: call void @__kmpc_for_static_fini(
// CHECK: ret void
#endif