llvm-capstone/clang/test/OpenMP/nvptx_data_sharing.cpp
JonChesterfield 5d02ca49a2 [libomptarget][nvptx] Undef, weak shared variables
[libomptarget][nvptx] Undef, weak shared variables

Shared variables on nvptx, and LDS on amdgcn, are uninitialized at
the start of kernel execution. Therefore create the variables with
undef instead of zeros, motivated in part by the amdgcn back end
rejecting LDS+initializer.

Common is zero initialized, which seems incompatible with shared. Thus
change them to weak, following the direction of
https://reviews.llvm.org/rG7b3eabdcd215

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D90248
2020-10-28 14:25:36 +00:00

120 lines
6.5 KiB
C++

// Test device global memory data sharing codegen.
///==========================================================================///
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CK1 --check-prefix SEQ
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CK1 --check-prefix PAR
// expected-no-diagnostics
#ifndef HEADER
#define HEADER
void test_ds(){
#pragma omp target
{
int a = 10;
#pragma omp parallel
{
a = 1000;
}
int b = 100;
int c = 1000;
#pragma omp parallel private(c)
{
int *c1 = &c;
b = a + 10000;
}
}
}
// SEQ: [[MEM_TY:%.+]] = type { [128 x i8] }
// SEQ-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] undef
// SEQ-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* undef
// SEQ-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i64 8
// SEQ-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1
/// ========= In the worker function ========= ///
// CK1: {{.*}}define internal void @__omp_offloading{{.*}}test_ds{{.*}}_worker()
// CK1: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CK1-NOT: call void @__kmpc_data_sharing_init_stack
/// ========= In the kernel function ========= ///
// CK1: {{.*}}define weak void @__omp_offloading{{.*}}test_ds{{.*}}()
// CK1: [[SHAREDARGS1:%.+]] = alloca i8**
// CK1: [[SHAREDARGS2:%.+]] = alloca i8**
// CK1: call void @__kmpc_kernel_init
// CK1: call void @__kmpc_data_sharing_init_stack
// SEQ: [[SHARED_MEM_FLAG:%.+]] = load i16, i16* [[KERNEL_SHARED]],
// SEQ: [[SIZE:%.+]] = load i64, i64* [[KERNEL_SIZE]],
// SEQ: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i64 [[SIZE]], i16 [[SHARED_MEM_FLAG]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
// SEQ: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]],
// SEQ: [[GLOBALSTACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i64 0
// PAR: [[GLOBALSTACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} 8, i16 1)
// CK1: [[GLOBALSTACK2:%.+]] = bitcast i8* [[GLOBALSTACK]] to %struct._globalized_locals_ty*
// CK1: [[A:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[GLOBALSTACK2]], i32 0, i32 0
// CK1: [[B:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[GLOBALSTACK2]], i32 0, i32 1
// CK1: store i32 10, i32* [[A]]
// CK1: call void @__kmpc_kernel_prepare_parallel({{.*}})
// CK1: call void @__kmpc_begin_sharing_variables(i8*** [[SHAREDARGS1]], i64 1)
// CK1: [[SHARGSTMP1:%.+]] = load i8**, i8*** [[SHAREDARGS1]]
// CK1: [[SHARGSTMP2:%.+]] = getelementptr inbounds i8*, i8** [[SHARGSTMP1]], i64 0
// CK1: [[SHAREDVAR:%.+]] = bitcast i32* [[A]] to i8*
// CK1: store i8* [[SHAREDVAR]], i8** [[SHARGSTMP2]]
// CK1: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CK1: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CK1: call void @__kmpc_end_sharing_variables()
// CK1: store i32 100, i32* [[B]]
// CK1: call void @__kmpc_kernel_prepare_parallel({{.*}})
// CK1: call void @__kmpc_begin_sharing_variables(i8*** [[SHAREDARGS2]], i64 2)
// CK1: [[SHARGSTMP3:%.+]] = load i8**, i8*** [[SHAREDARGS2]]
// CK1: [[SHARGSTMP4:%.+]] = getelementptr inbounds i8*, i8** [[SHARGSTMP3]], i64 0
// CK1: [[SHAREDVAR1:%.+]] = bitcast i32* [[B]] to i8*
// CK1: store i8* [[SHAREDVAR1]], i8** [[SHARGSTMP4]]
// CK1: [[SHARGSTMP12:%.+]] = getelementptr inbounds i8*, i8** [[SHARGSTMP3]], i64 1
// CK1: [[SHAREDVAR2:%.+]] = bitcast i32* [[A]] to i8*
// CK1: store i8* [[SHAREDVAR2]], i8** [[SHARGSTMP12]]
// CK1: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CK1: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CK1: call void @__kmpc_end_sharing_variables()
// SEQ: [[SHARED_MEM_FLAG:%.+]] = load i16, i16* [[KERNEL_SHARED]],
// SEQ: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[SHARED_MEM_FLAG]])
// PAR: call void @__kmpc_data_sharing_pop_stack(i8* [[GLOBALSTACK]])
// CK1: call void @__kmpc_kernel_deinit(i16 1)
/// ========= In the data sharing wrapper function ========= ///
// CK1: {{.*}}define internal void @__omp_outlined{{.*}}wrapper({{.*}})
// CK1: [[SHAREDARGS4:%.+]] = alloca i8**
// CK1: call void @__kmpc_get_shared_variables(i8*** [[SHAREDARGS4]])
// CK1: [[SHARGSTMP13:%.+]] = load i8**, i8*** [[SHAREDARGS4]]
// CK1: [[SHARGSTMP14:%.+]] = getelementptr inbounds i8*, i8** [[SHARGSTMP13]], i64 0
// CK1: [[SHARGSTMP15:%.+]] = bitcast i8** [[SHARGSTMP14]] to i32**
// CK1: [[SHARGSTMP16:%.+]] = load i32*, i32** [[SHARGSTMP15]]
// CK1: call void @__omp_outlined__{{.*}}({{.*}}, i32* [[SHARGSTMP16]])
/// outlined function for the second parallel region ///
// CK1: define internal void @{{.+}}(i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* nonnull align {{[0-9]+}} dereferenceable{{.+}}, i32* nonnull align {{[0-9]+}} dereferenceable{{.+}})
// CK1-NOT: call i8* @__kmpc_data_sharing_push_stack(
// CK1: [[C_ADDR:%.+]] = alloca i32,
// CK1: store i32* [[C_ADDR]], i32** %
// CK1i-NOT: call void @__kmpc_data_sharing_pop_stack(
/// ========= In the data sharing wrapper function ========= ///
// CK1: {{.*}}define internal void @__omp_outlined{{.*}}wrapper({{.*}})
// CK1: [[SHAREDARGS3:%.+]] = alloca i8**
// CK1: call void @__kmpc_get_shared_variables(i8*** [[SHAREDARGS3]])
// CK1: [[SHARGSTMP5:%.+]] = load i8**, i8*** [[SHAREDARGS3]]
// CK1: [[SHARGSTMP6:%.+]] = getelementptr inbounds i8*, i8** [[SHARGSTMP5]], i64 0
// CK1: [[SHARGSTMP7:%.+]] = bitcast i8** [[SHARGSTMP6]] to i32**
// CK1: [[SHARGSTMP8:%.+]] = load i32*, i32** [[SHARGSTMP7]]
// CK1: [[SHARGSTMP9:%.+]] = getelementptr inbounds i8*, i8** [[SHARGSTMP5]], i64 1
// CK1: [[SHARGSTMP10:%.+]] = bitcast i8** [[SHARGSTMP9]] to i32**
// CK1: [[SHARGSTMP11:%.+]] = load i32*, i32** [[SHARGSTMP10]]
// CK1: call void @__omp_outlined__{{.*}}({{.*}}, i32* [[SHARGSTMP8]], i32* [[SHARGSTMP11]])
#endif