mirror of
https://github.com/capstone-engine/llvm-capstone.git
synced 2025-01-12 02:47:10 +00:00
[OPENMP]Dynamic globalization for parallel target regions.
Summary: Added support for dynamic memory allocation for globalized variables when target regions must be executed in parallel. Reviewers: jdoerfert Subscribers: jholewinski, yaxunl, guansong, sstefan1, cfe-commits, caomhin Tags: #clang Differential Revision: https://reviews.llvm.org/D82324
This commit is contained in:
parent
f14457f5d8
commit
32ea3397be
@ -231,6 +231,7 @@ LANGOPT(OpenMPCUDANumSMs , 32, 0, "Number of SMs for CUDA devices.")
|
||||
LANGOPT(OpenMPCUDABlocksPerSM , 32, 0, "Number of blocks per SM for CUDA devices.")
|
||||
LANGOPT(OpenMPCUDAReductionBufNum , 32, 1024, "Number of the reduction records in the intermediate reduction buffer used for the teams reductions.")
|
||||
LANGOPT(OpenMPOptimisticCollapse , 1, 0, "Use at most 32 bits to represent the collapsed loop nest counter.")
|
||||
LANGOPT(OpenMPCUDATargetParallel, 1, 0, "Support parallel execution of target region on Cuda-based devices.")
|
||||
LANGOPT(RenderScript , 1, 0, "RenderScript")
|
||||
|
||||
LANGOPT(CUDAIsDevice , 1, 0, "compiling for CUDA device")
|
||||
|
@ -1687,6 +1687,12 @@ def fopenmp_optimistic_collapse : Flag<["-"], "fopenmp-optimistic-collapse">, Gr
|
||||
Flags<[CC1Option, NoArgumentUnused, HelpHidden]>;
|
||||
def fno_openmp_optimistic_collapse : Flag<["-"], "fno-openmp-optimistic-collapse">, Group<f_Group>,
|
||||
Flags<[NoArgumentUnused, HelpHidden]>;
|
||||
def fopenmp_cuda_parallel_target_regions : Flag<["-"], "fopenmp-cuda-parallel-target-regions">, Group<f_Group>,
|
||||
Flags<[CC1Option, NoArgumentUnused, HelpHidden]>,
|
||||
HelpText<"Support parallel execution of target regions on Cuda-based devices.">;
|
||||
def fno_openmp_cuda_parallel_target_regions : Flag<["-"], "fno-openmp-cuda-parallel-target-regions">, Group<f_Group>,
|
||||
Flags<[NoArgumentUnused, HelpHidden]>,
|
||||
HelpText<"Support only serial execution of target regions on Cuda-based devices.">;
|
||||
def static_openmp: Flag<["-"], "static-openmp">,
|
||||
HelpText<"Use the static host OpenMP runtime while linking.">;
|
||||
def fno_optimize_sibling_calls : Flag<["-"], "fno-optimize-sibling-calls">, Group<f_Group>;
|
||||
|
@ -85,6 +85,9 @@ enum OpenMPRTLFunctionNVPTX {
|
||||
/// Call to void* __kmpc_data_sharing_coalesced_push_stack(size_t size,
|
||||
/// int16_t UseSharedMemory);
|
||||
OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack,
|
||||
/// Call to void* __kmpc_data_sharing_push_stack(size_t size, int16_t
|
||||
/// UseSharedMemory);
|
||||
OMPRTL_NVPTX__kmpc_data_sharing_push_stack,
|
||||
/// Call to void __kmpc_data_sharing_pop_stack(void *a);
|
||||
OMPRTL_NVPTX__kmpc_data_sharing_pop_stack,
|
||||
/// Call to void __kmpc_begin_sharing_variables(void ***args,
|
||||
@ -1753,6 +1756,16 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
|
||||
FnTy, /*Name=*/"__kmpc_data_sharing_coalesced_push_stack");
|
||||
break;
|
||||
}
|
||||
case OMPRTL_NVPTX__kmpc_data_sharing_push_stack: {
|
||||
// Build void *__kmpc_data_sharing_push_stack(size_t size, int16_t
|
||||
// UseSharedMemory);
|
||||
llvm::Type *TypeParams[] = {CGM.SizeTy, CGM.Int16Ty};
|
||||
auto *FnTy =
|
||||
llvm::FunctionType::get(CGM.VoidPtrTy, TypeParams, /*isVarArg=*/false);
|
||||
RTLFn = CGM.CreateRuntimeFunction(
|
||||
FnTy, /*Name=*/"__kmpc_data_sharing_push_stack");
|
||||
break;
|
||||
}
|
||||
case OMPRTL_NVPTX__kmpc_data_sharing_pop_stack: {
|
||||
// Build void __kmpc_data_sharing_pop_stack(void *a);
|
||||
llvm::Type *TypeParams[] = {CGM.VoidPtrTy};
|
||||
@ -2210,7 +2223,7 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
|
||||
GlobalRecCastAddr = Phi;
|
||||
I->getSecond().GlobalRecordAddr = Phi;
|
||||
I->getSecond().IsInSPMDModeFlag = IsSPMD;
|
||||
} else if (IsInTTDRegion) {
|
||||
} else if (!CGM.getLangOpts().OpenMPCUDATargetParallel && IsInTTDRegion) {
|
||||
assert(GlobalizedRecords.back().Records.size() < 2 &&
|
||||
"Expected less than 2 globalized records: one for target and one "
|
||||
"for teams.");
|
||||
@ -2283,12 +2296,16 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
|
||||
} else {
|
||||
// TODO: allow the usage of shared memory to be controlled by
|
||||
// the user, for now, default to global.
|
||||
bool UseSharedMemory =
|
||||
IsInTTDRegion && GlobalRecordSize <= SharedMemorySize;
|
||||
llvm::Value *GlobalRecordSizeArg[] = {
|
||||
llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize),
|
||||
CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
|
||||
CGF.Builder.getInt16(UseSharedMemory ? 1 : 0)};
|
||||
llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
|
||||
createNVPTXRuntimeFunction(
|
||||
OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack),
|
||||
IsInTTDRegion
|
||||
? OMPRTL_NVPTX__kmpc_data_sharing_push_stack
|
||||
: OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack),
|
||||
GlobalRecordSizeArg);
|
||||
GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
|
||||
GlobalRecValue, GlobalRecPtrTy);
|
||||
@ -2435,7 +2452,7 @@ void CGOpenMPRuntimeNVPTX::emitGenericVarsEpilog(CodeGenFunction &CGF,
|
||||
OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
|
||||
CGF.EmitCastToVoidPtr(I->getSecond().GlobalRecordAddr));
|
||||
CGF.EmitBlock(ExitBB);
|
||||
} else if (IsInTTDRegion) {
|
||||
} else if (!CGM.getLangOpts().OpenMPCUDATargetParallel && IsInTTDRegion) {
|
||||
assert(GlobalizedRecords.back().RegionCounter > 0 &&
|
||||
"region counter must be > 0.");
|
||||
--GlobalizedRecords.back().RegionCounter;
|
||||
@ -5085,7 +5102,8 @@ static std::pair<unsigned, unsigned> getSMsBlocksPerSM(CodeGenModule &CGM) {
|
||||
}
|
||||
|
||||
void CGOpenMPRuntimeNVPTX::clear() {
|
||||
if (!GlobalizedRecords.empty()) {
|
||||
if (!GlobalizedRecords.empty() &&
|
||||
!CGM.getLangOpts().OpenMPCUDATargetParallel) {
|
||||
ASTContext &C = CGM.getContext();
|
||||
llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> GlobalRecs;
|
||||
llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> SharedRecs;
|
||||
|
@ -5257,6 +5257,13 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
|
||||
options::OPT_fno_openmp_cuda_mode, /*Default=*/false))
|
||||
CmdArgs.push_back("-fopenmp-cuda-mode");
|
||||
|
||||
// When in OpenMP offloading mode with NVPTX target, forward
|
||||
// cuda-parallel-target-regions flag
|
||||
if (Args.hasFlag(options::OPT_fopenmp_cuda_parallel_target_regions,
|
||||
options::OPT_fno_openmp_cuda_parallel_target_regions,
|
||||
/*Default=*/true))
|
||||
CmdArgs.push_back("-fopenmp-cuda-parallel-target-regions");
|
||||
|
||||
// When in OpenMP offloading mode with NVPTX target, check if full runtime
|
||||
// is required.
|
||||
if (Args.hasFlag(options::OPT_fopenmp_cuda_force_full_runtime,
|
||||
|
@ -3195,6 +3195,12 @@ static void ParseLangArgs(LangOptions &Opts, ArgList &Args, InputKind IK,
|
||||
Opts.OpenMPCUDAMode = Opts.OpenMPIsDevice && (T.isNVPTX() || T.isAMDGCN()) &&
|
||||
Args.hasArg(options::OPT_fopenmp_cuda_mode);
|
||||
|
||||
// Set CUDA support for parallel execution of target regions for OpenMP target
|
||||
// NVPTX/AMDGCN if specified in options.
|
||||
Opts.OpenMPCUDATargetParallel =
|
||||
Opts.OpenMPIsDevice && (T.isNVPTX() || T.isAMDGCN()) &&
|
||||
Args.hasArg(options::OPT_fopenmp_cuda_parallel_target_regions);
|
||||
|
||||
// Force the full runtime for OpenMP target NVPTX/AMDGCN if specified in options
|
||||
Opts.OpenMPCUDAForceFullRuntime =
|
||||
Opts.OpenMPIsDevice && (T.isNVPTX() || T.isAMDGCN()) &&
|
||||
|
@ -2,7 +2,8 @@
|
||||
///==========================================================================///
|
||||
|
||||
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
|
||||
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CK1
|
||||
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CK1 --check-prefix SEQ
|
||||
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CK1 --check-prefix PAR
|
||||
|
||||
// expected-no-diagnostics
|
||||
|
||||
@ -26,11 +27,11 @@ void test_ds(){
|
||||
}
|
||||
}
|
||||
}
|
||||
// CK1: [[MEM_TY:%.+]] = type { [128 x i8] }
|
||||
// CK1-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer
|
||||
// CK1-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
|
||||
// CK1-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i64 8
|
||||
// CK1-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1
|
||||
// SEQ: [[MEM_TY:%.+]] = type { [128 x i8] }
|
||||
// SEQ-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer
|
||||
// SEQ-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
|
||||
// SEQ-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i64 8
|
||||
// SEQ-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1
|
||||
|
||||
/// ========= In the worker function ========= ///
|
||||
// CK1: {{.*}}define internal void @__omp_offloading{{.*}}test_ds{{.*}}_worker()
|
||||
@ -44,11 +45,12 @@ void test_ds(){
|
||||
// CK1: [[SHAREDARGS2:%.+]] = alloca i8**
|
||||
// CK1: call void @__kmpc_kernel_init
|
||||
// CK1: call void @__kmpc_data_sharing_init_stack
|
||||
// CK1: [[SHARED_MEM_FLAG:%.+]] = load i16, i16* [[KERNEL_SHARED]],
|
||||
// CK1: [[SIZE:%.+]] = load i64, i64* [[KERNEL_SIZE]],
|
||||
// CK1: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i64 [[SIZE]], i16 [[SHARED_MEM_FLAG]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
|
||||
// CK1: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]],
|
||||
// CK1: [[GLOBALSTACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i64 0
|
||||
// SEQ: [[SHARED_MEM_FLAG:%.+]] = load i16, i16* [[KERNEL_SHARED]],
|
||||
// SEQ: [[SIZE:%.+]] = load i64, i64* [[KERNEL_SIZE]],
|
||||
// SEQ: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i64 [[SIZE]], i16 [[SHARED_MEM_FLAG]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
|
||||
// SEQ: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]],
|
||||
// SEQ: [[GLOBALSTACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i64 0
|
||||
// PAR: [[GLOBALSTACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} 8, i16 1)
|
||||
// CK1: [[GLOBALSTACK2:%.+]] = bitcast i8* [[GLOBALSTACK]] to %struct._globalized_locals_ty*
|
||||
// CK1: [[A:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[GLOBALSTACK2]], i32 0, i32 0
|
||||
// CK1: [[B:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[GLOBALSTACK2]], i32 0, i32 1
|
||||
@ -75,8 +77,9 @@ void test_ds(){
|
||||
// CK1: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
|
||||
// CK1: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
|
||||
// CK1: call void @__kmpc_end_sharing_variables()
|
||||
// CK1: [[SHARED_MEM_FLAG:%.+]] = load i16, i16* [[KERNEL_SHARED]],
|
||||
// CK1: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[SHARED_MEM_FLAG]])
|
||||
// SEQ: [[SHARED_MEM_FLAG:%.+]] = load i16, i16* [[KERNEL_SHARED]],
|
||||
// SEQ: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[SHARED_MEM_FLAG]])
|
||||
// PAR: call void @__kmpc_data_sharing_pop_stack(i8* [[GLOBALSTACK]])
|
||||
// CK1: call void @__kmpc_kernel_deinit(i16 1)
|
||||
|
||||
/// ========= In the data sharing wrapper function ========= ///
|
||||
|
@ -1,9 +1,12 @@
|
||||
// Test target codegen - host bc file has to be created first.
|
||||
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
|
||||
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
|
||||
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix SEQ
|
||||
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix PAR
|
||||
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
|
||||
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
|
||||
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
|
||||
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ
|
||||
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ
|
||||
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR
|
||||
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR
|
||||
// expected-no-diagnostics
|
||||
#ifndef HEADER
|
||||
#define HEADER
|
||||
@ -21,19 +24,20 @@ int main(int argc, char **argv) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// CHECK: [[MEM_TY:%.+]] = type { [128 x i8] }
|
||||
// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer
|
||||
// CHECK-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
|
||||
// CHECK-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 40
|
||||
// CHECK-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1
|
||||
// CHECK-DAG: @__omp_offloading_{{.*}}_main_l17_exec_mode = weak constant i8 0
|
||||
// SEQ: [[MEM_TY:%.+]] = type { [128 x i8] }
|
||||
// SEQ-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer
|
||||
// SEQ-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
|
||||
// SEQ-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 40
|
||||
// SEQ-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1
|
||||
// CHECK-DAG: @__omp_offloading_{{.*}}_main_l20_exec_mode = weak constant i8 0
|
||||
|
||||
// CHECK: define weak void @__omp_offloading_{{.*}}_main_l17([10 x i32]* nonnull align 4 dereferenceable(40) %{{.+}}, [10 x i32]* nonnull align 4 dereferenceable(40) %{{.+}}, i32* nonnull align 4 dereferenceable(4) %{{.+}}, i{{64|32}} %{{.+}}, [10 x i32]* nonnull align 4 dereferenceable(40) %{{.+}})
|
||||
// CHECK: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]],
|
||||
// CHECK: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]],
|
||||
// CHECK: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
|
||||
// CHECK: [[PTR:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]],
|
||||
// CHECK: [[GEP:%.+]] = getelementptr inbounds i8, i8* [[PTR]], i{{64|32}} 0
|
||||
// CHECK: define weak void @__omp_offloading_{{.*}}_main_l20([10 x i32]* nonnull align 4 dereferenceable(40) %{{.+}}, [10 x i32]* nonnull align 4 dereferenceable(40) %{{.+}}, i32* nonnull align 4 dereferenceable(4) %{{.+}}, i{{64|32}} %{{.+}}, [10 x i32]* nonnull align 4 dereferenceable(40) %{{.+}})
|
||||
// SEQ: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]],
|
||||
// SEQ: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]],
|
||||
// SEQ: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
|
||||
// SEQ: [[PTR:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]],
|
||||
// SEQ: [[GEP:%.+]] = getelementptr inbounds i8, i8* [[PTR]], i{{64|32}} 0
|
||||
// PAR: [[GEP:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} 40, i16 1)
|
||||
// CHECK: [[STACK:%.+]] = bitcast i8* [[GEP]] to %struct._globalized_locals_ty*
|
||||
// CHECK: getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[STACK]], i{{32|64}} 0, i{{32|64}} 0
|
||||
// CHECK-NOT: getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[STACK]],
|
||||
@ -43,8 +47,9 @@ int main(int argc, char **argv) {
|
||||
|
||||
// CHECK: call void @__kmpc_for_static_fini(%struct.ident_t* @
|
||||
|
||||
// CHECK: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]],
|
||||
// CHECK: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[SHARED]])
|
||||
// SEQ: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]],
|
||||
// SEQ: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[SHARED]])
|
||||
// PAR: call void @__kmpc_data_sharing_pop_stack(i8* [[GEP]])
|
||||
|
||||
// CHECK: define internal void [[PARALLEL]](
|
||||
// CHECK-NOT: call i8* @__kmpc_data_sharing_push_stack(
|
||||
|
@ -1,9 +1,12 @@
|
||||
// Test target codegen - host bc file has to be created first.
|
||||
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
|
||||
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
|
||||
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix SEQ
|
||||
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix PAR
|
||||
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
|
||||
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
|
||||
// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
|
||||
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ
|
||||
// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ
|
||||
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR
|
||||
// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR
|
||||
// expected-no-diagnostics
|
||||
#ifndef HEADER
|
||||
#define HEADER
|
||||
@ -72,15 +75,15 @@ int bar(int n){
|
||||
return a;
|
||||
}
|
||||
|
||||
// CHECK: [[MEM_TY:%.+]] = type { [128 x i8] }
|
||||
// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer
|
||||
// CHECK-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
|
||||
// CHECK-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4
|
||||
// CHECK-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1
|
||||
// SEQ: [[MEM_TY:%.+]] = type { [128 x i8] }
|
||||
// SEQ-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer
|
||||
// SEQ-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
|
||||
// SEQ-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4
|
||||
// SEQ-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1
|
||||
|
||||
// CHECK-NOT: define {{.*}}void {{@__omp_offloading_.+template.+l17}}_worker()
|
||||
// CHECK-NOT: define {{.*}}void {{@__omp_offloading_.+template.+l20}}_worker()
|
||||
|
||||
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l26}}_worker()
|
||||
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l29}}_worker()
|
||||
// CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
|
||||
// CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
|
||||
// CHECK: store i8* null, i8** [[OMP_WORK_FN]],
|
||||
@ -133,7 +136,7 @@ int bar(int n){
|
||||
// CHECK: [[EXIT]]
|
||||
// CHECK: ret void
|
||||
|
||||
// CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l26]](i[[SZ:32|64]]
|
||||
// CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l29]](i[[SZ:32|64]]
|
||||
// Create local storage for each capture.
|
||||
// CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]],
|
||||
// CHECK-DAG: store i[[SZ]] [[ARG_A:%.+]], i[[SZ]]* [[LOCAL_A]]
|
||||
@ -199,7 +202,7 @@ int bar(int n){
|
||||
// CHECK: store i[[SZ]] 44, i[[SZ]]* %a,
|
||||
// CHECK: ret void
|
||||
|
||||
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l43}}_worker()
|
||||
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l46}}_worker()
|
||||
// CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
|
||||
// CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
|
||||
// CHECK: store i8* null, i8** [[OMP_WORK_FN]],
|
||||
@ -243,7 +246,7 @@ int bar(int n){
|
||||
// CHECK: [[EXIT]]
|
||||
// CHECK: ret void
|
||||
|
||||
// CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l43]](i[[SZ:32|64]]
|
||||
// CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l46]](i[[SZ:32|64]]
|
||||
// Create local storage for each capture.
|
||||
// CHECK: [[LOCAL_N:%.+]] = alloca i[[SZ]],
|
||||
// CHECK: [[LOCAL_A:%.+]] = alloca i[[SZ]],
|
||||
@ -323,23 +326,25 @@ int bar(int n){
|
||||
|
||||
// CHECK: declare void @__kmpc_barrier(%struct.ident_t*, i32) #[[#CONVERGENT]]
|
||||
|
||||
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l55}}_worker()
|
||||
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l55}}(
|
||||
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l58}}_worker()
|
||||
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l58}}(
|
||||
// CHECK-32: [[A_ADDR:%.+]] = alloca i32,
|
||||
// CHECK-64: [[A_ADDR:%.+]] = alloca i64,
|
||||
// CHECK-64: [[CONV:%.+]] = bitcast i64* [[A_ADDR]] to i32*
|
||||
// CHECK: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]],
|
||||
// CHECK: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]],
|
||||
// CHECK: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[IS_SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
|
||||
// CHECK: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]],
|
||||
// CHECK: [[STACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i{{64|32}} 0
|
||||
// SEQ: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]],
|
||||
// SEQ: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]],
|
||||
// SEQ: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[IS_SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
|
||||
// SEQ: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]],
|
||||
// SEQ: [[STACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i{{64|32}} 0
|
||||
// PAR: [[STACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} 4, i16 1)
|
||||
// CHECK: [[BC:%.+]] = bitcast i8* [[STACK]] to %struct._globalized_locals_ty*
|
||||
// CHECK-32: [[A:%.+]] = load i32, i32* [[A_ADDR]],
|
||||
// CHECK-64: [[A:%.+]] = load i32, i32* [[CONV]],
|
||||
// CHECK: [[GLOBAL_A_ADDR:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[BC]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
|
||||
// CHECK: store i32 [[A]], i32* [[GLOBAL_A_ADDR]],
|
||||
// CHECK: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]],
|
||||
// CHECK: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[IS_SHARED]])
|
||||
// SEQ: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]],
|
||||
// SEQ: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[IS_SHARED]])
|
||||
// PAR: call void @__kmpc_data_sharing_pop_stack(i8* [[STACK]])
|
||||
|
||||
// CHECK-LABEL: define internal void @{{.+}}(i32* noalias %{{.+}}, i32* noalias %{{.+}}, i32* nonnull align {{[0-9]+}} dereferenceable{{.*}})
|
||||
// CHECK: [[CC:%.+]] = alloca i32,
|
||||
|
@ -1,6 +1,7 @@
|
||||
// Test target codegen - host bc file has to be created first.
|
||||
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
|
||||
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
|
||||
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix SEQ
|
||||
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix PAR
|
||||
// expected-no-diagnostics
|
||||
#ifndef HEADER
|
||||
#define HEADER
|
||||
@ -30,33 +31,35 @@ int bar(int n){
|
||||
return a;
|
||||
}
|
||||
|
||||
// CHECK: [[MEM_TY:%.+]] = type { [128 x i8] }
|
||||
// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer
|
||||
// CHECK-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
|
||||
// CHECK-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4
|
||||
// CHECK-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1
|
||||
// SEQ: [[MEM_TY:%.+]] = type { [128 x i8] }
|
||||
// SEQ-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer
|
||||
// SEQ-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
|
||||
// SEQ-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4
|
||||
// SEQ-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1
|
||||
|
||||
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l12}}_worker()
|
||||
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l13}}_worker()
|
||||
// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
|
||||
// CHECK: call i1 @__kmpc_kernel_parallel(
|
||||
// CHECK: call void @__omp_outlined___wrapper(
|
||||
|
||||
// CHECK: define weak void @__omp_offloading_{{.*}}l12(
|
||||
// CHECK: call void @__omp_offloading_{{.*}}l12_worker()
|
||||
// CHECK: define weak void @__omp_offloading_{{.*}}l13(
|
||||
// CHECK: call void @__omp_offloading_{{.*}}l13_worker()
|
||||
// CHECK: call void @__kmpc_kernel_init(
|
||||
// CHECK: call void @__kmpc_data_sharing_init_stack()
|
||||
// CHECK: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]],
|
||||
// CHECK: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]],
|
||||
// CHECK: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i64 [[SIZE]], i16 [[IS_SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
|
||||
// CHECK: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]],
|
||||
// CHECK: [[STACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i{{64|32}} 0
|
||||
// SEQ: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]],
|
||||
// SEQ: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]],
|
||||
// SEQ: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i64 [[SIZE]], i16 [[IS_SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
|
||||
// SEQ: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]],
|
||||
// SEQ: [[STACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i{{64|32}} 0
|
||||
// PAR: [[STACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} 4, i16 1)
|
||||
// CHECK: call void @__kmpc_kernel_prepare_parallel(
|
||||
// CHECK: call void @__kmpc_begin_sharing_variables({{.*}}, i64 2)
|
||||
// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
|
||||
// CHECK: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
|
||||
// CHECK: call void @__kmpc_end_sharing_variables()
|
||||
// CHECK: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]],
|
||||
// CHECK: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[IS_SHARED]])
|
||||
// SEQ: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]],
|
||||
// SEQ: call void @__kmpc_restore_team_static_memory(i16 0, i16 [[IS_SHARED]])
|
||||
// PAR: call void @__kmpc_data_sharing_pop_stack(i8* [[STACK]])
|
||||
// CHECK: call void @__kmpc_kernel_deinit(i16 1)
|
||||
|
||||
// CHECK: define internal void @__omp_outlined__(
|
||||
|
@ -1,17 +1,20 @@
|
||||
// Test target codegen - host bc file has to be created first.
|
||||
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
|
||||
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
|
||||
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix SEQ
|
||||
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix PAR
|
||||
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
|
||||
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
|
||||
// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
|
||||
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ
|
||||
// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ
|
||||
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR
|
||||
// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR
|
||||
// expected-no-diagnostics
|
||||
#ifndef HEADER
|
||||
#define HEADER
|
||||
|
||||
// CHECK: [[MEM_TY:%.+]] = type { [128 x i8] }
|
||||
// CHECK-DAG: {{@__omp_offloading_.+}}_l20_exec_mode = weak constant i8 1
|
||||
// CHECK-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4
|
||||
// CHECK-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1
|
||||
// SEQ: [[MEM_TY:%.+]] = type { [128 x i8] }
|
||||
// SEQ-DAG: {{@__omp_offloading_.+}}_l23_exec_mode = weak constant i8 1
|
||||
// SEQ-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4
|
||||
// SEQ-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1
|
||||
|
||||
template<typename tx>
|
||||
tx ftemplate(int n) {
|
||||
@ -35,10 +38,10 @@ int bar(int n){
|
||||
return a;
|
||||
}
|
||||
|
||||
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l20}}_worker()
|
||||
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l23}}_worker()
|
||||
// CHECK: ret void
|
||||
|
||||
// CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l20}}()
|
||||
// CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l23}}()
|
||||
|
||||
// CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
|
||||
// CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
|
||||
@ -48,7 +51,7 @@ int bar(int n){
|
||||
// CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]]
|
||||
//
|
||||
// CHECK: [[WORKER]]
|
||||
// CHECK: {{call|invoke}} void {{@__omp_offloading_.+template.+l20}}_worker()
|
||||
// CHECK: {{call|invoke}} void {{@__omp_offloading_.+template.+l23}}_worker()
|
||||
// CHECK: br label {{%?}}[[EXIT:.+]]
|
||||
//
|
||||
// CHECK: [[CHECK_MASTER]]
|
||||
@ -75,11 +78,12 @@ int bar(int n){
|
||||
// CHECK: ret void
|
||||
|
||||
// CHECK: define internal void [[PARALLEL]](i32* noalias %{{.+}}, i32* noalias %{{.+}})
|
||||
// CHECK: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]],
|
||||
// CHECK: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]],
|
||||
// CHECK: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* @{{.+}}, i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[SHARED]], i8** addrspacecast (i8* addrspace(3)* [[BUF:@.+]] to i8**))
|
||||
// CHECK: [[PTR:%.+]] = load i8*, i8* addrspace(3)* [[BUF]],
|
||||
// CHECK: [[ADDR:%.+]] = getelementptr inbounds i8, i8* [[PTR]], i{{64|32}} 0
|
||||
// SEQ: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]],
|
||||
// SEQ: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]],
|
||||
// SEQ: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* @{{.+}}, i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[SHARED]], i8** addrspacecast (i8* addrspace(3)* [[BUF:@.+]] to i8**))
|
||||
// SEQ: [[PTR:%.+]] = load i8*, i8* addrspace(3)* [[BUF]],
|
||||
// SEQ: [[ADDR:%.+]] = getelementptr inbounds i8, i8* [[PTR]], i{{64|32}} 0
|
||||
// PAR: [[ADDR:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} 4, i16 1)
|
||||
// CHECK: [[RD:%.+]] = bitcast i8* [[ADDR]] to [[GLOB_TY:%.+]]*
|
||||
// CHECK: [[I_ADDR:%.+]] = getelementptr inbounds [[GLOB_TY]], [[GLOB_TY]]* [[RD]], i32 0, i32 0
|
||||
//
|
||||
|
@ -1,21 +1,25 @@
|
||||
// Test target codegen - host bc file has to be created first.
|
||||
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
|
||||
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix CHECK-DIV64
|
||||
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -fopenmp-optimistic-collapse -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-DIV32
|
||||
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix CHECK-DIV64 --check-prefix SEQ
|
||||
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix CHECK-DIV64 --check-prefix PAR
|
||||
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -fopenmp-optimistic-collapse -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-DIV32 --check-prefix SEQ
|
||||
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -fopenmp-optimistic-collapse -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-DIV32 --check-prefix PAR
|
||||
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
|
||||
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
|
||||
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
|
||||
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ
|
||||
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ
|
||||
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR
|
||||
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR
|
||||
// expected-no-diagnostics
|
||||
#ifndef HEADER
|
||||
#define HEADER
|
||||
|
||||
// Check that the execution mode of all 6 target regions on the gpu is set to SPMD Mode.
|
||||
// CHECK-DAG: {{@__omp_offloading_.+l34}}_exec_mode = weak constant i8 0
|
||||
// CHECK-DAG: {{@__omp_offloading_.+l40}}_exec_mode = weak constant i8 0
|
||||
// CHECK-DAG: {{@__omp_offloading_.+l45}}_exec_mode = weak constant i8 0
|
||||
// CHECK-DAG: {{@__omp_offloading_.+l50}}_exec_mode = weak constant i8 0
|
||||
// CHECK-DAG: {{@__omp_offloading_.+l58}}_exec_mode = weak constant i8 0
|
||||
// CHECK-DAG: {{@__omp_offloading_.+l65}}_exec_mode = weak constant i8 0
|
||||
// CHECK-DAG: {{@__omp_offloading_.+l38}}_exec_mode = weak constant i8 0
|
||||
// CHECK-DAG: {{@__omp_offloading_.+l44}}_exec_mode = weak constant i8 0
|
||||
// CHECK-DAG: {{@__omp_offloading_.+l49}}_exec_mode = weak constant i8 0
|
||||
// CHECK-DAG: {{@__omp_offloading_.+l54}}_exec_mode = weak constant i8 0
|
||||
// CHECK-DAG: {{@__omp_offloading_.+l62}}_exec_mode = weak constant i8 0
|
||||
// CHECK-DAG: {{@__omp_offloading_.+l69}}_exec_mode = weak constant i8 0
|
||||
|
||||
#define N 1000
|
||||
#define M 10
|
||||
@ -76,31 +80,33 @@ int bar(int n){
|
||||
return a;
|
||||
}
|
||||
|
||||
// CHECK-DAG: [[MEM_TY:%.+]] = type { [128 x i8] }
|
||||
// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer
|
||||
// CHECK-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
|
||||
// CHECK-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4
|
||||
// CHECK-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1
|
||||
// SEQ-DAG: [[MEM_TY:%.+]] = type { [128 x i8] }
|
||||
// SEQ-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer
|
||||
// SEQ-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
|
||||
// SEQ-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4
|
||||
// SEQ-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1
|
||||
|
||||
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l34(
|
||||
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l38(
|
||||
// CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
|
||||
// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0)
|
||||
// CHECK: call void [[PARALLEL:@.+]](
|
||||
// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
|
||||
|
||||
// CHECK: define internal void [[PARALLEL]](
|
||||
// CHECK: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]],
|
||||
// CHECK: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]],
|
||||
// CHECK: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
|
||||
// CHECK: [[TEAM_ALLOC:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]],
|
||||
// CHECK: [[ADDR:%.+]] = getelementptr inbounds i8, i8* [[TEAM_ALLOC]], i{{64|32}} 0
|
||||
// SEQ: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]],
|
||||
// SEQ: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]],
|
||||
// SEQ: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
|
||||
// SEQ: [[TEAM_ALLOC:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]],
|
||||
// SEQ: [[ADDR:%.+]] = getelementptr inbounds i8, i8* [[TEAM_ALLOC]], i{{64|32}} 0
|
||||
// PAR: [[ADDR:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} 4, i16 1)
|
||||
// CHECK: [[BC:%.+]] = bitcast i8* [[ADDR]] to [[REC:%.+]]*
|
||||
// CHECK: getelementptr inbounds [[REC]], [[REC]]* [[BC]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
|
||||
// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91,
|
||||
// CHECK: {{call|invoke}} void [[OUTL1:@.+]](
|
||||
// CHECK: call void @__kmpc_for_static_fini(
|
||||
// CHECK: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]],
|
||||
// CHECK: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[SHARED]])
|
||||
// SEQ: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]],
|
||||
// SEQ: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[SHARED]])
|
||||
// PAR: call void @__kmpc_data_sharing_pop_stack(i8* [[ADDR]])
|
||||
// CHECK: ret void
|
||||
|
||||
// CHECK: define internal void [[OUTL1]](
|
||||
@ -233,13 +239,13 @@ int bar(int n){
|
||||
// CHECK: call void @__kmpc_for_static_fini(
|
||||
// CHECK: ret void
|
||||
|
||||
// CHECK: define weak void @__omp_offloading_{{.*}}_l58(i[[SZ:64|32]] %{{[^,]+}}, [10 x [10 x i32]]* nonnull align {{[0-9]+}} dereferenceable{{.*}})
|
||||
// CHECK: define weak void @__omp_offloading_{{.*}}_l62(i[[SZ:64|32]] %{{[^,]+}}, [10 x [10 x i32]]* nonnull align {{[0-9]+}} dereferenceable{{.*}})
|
||||
// CHECK: call void [[OUTLINED:@__omp_outlined.*]](i32* %{{.+}}, i32* %{{.+}}, i[[SZ]] %{{.*}}, i[[SZ]] %{{.*}}, i[[SZ]] %{{.*}}, [10 x [10 x i32]]* %{{.*}})
|
||||
// CHECK: define internal void [[OUTLINED]](i32* noalias %{{.*}}, i32* noalias %{{.*}} i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, [10 x [10 x i32]]* nonnull align {{[0-9]+}} dereferenceable{{.*}})
|
||||
// CHECK-DIV64: div i64
|
||||
// CHECK-DIV32-NOT: div i64
|
||||
|
||||
// CHECK: define weak void @__omp_offloading_{{.*}}_l65(i[[SZ:64|32]] %{{[^,]+}}, [1000 x i32]* nonnull align {{[0-9]+}} dereferenceable{{.*}}, i32* %{{[^)]+}})
|
||||
// CHECK: define weak void @__omp_offloading_{{.*}}_l69(i[[SZ:64|32]] %{{[^,]+}}, [1000 x i32]* nonnull align {{[0-9]+}} dereferenceable{{.*}}, i32* %{{[^)]+}})
|
||||
// CHECK: call void [[OUTLINED:@__omp_outlined.*]](i32* %{{.+}}, i32* %{{.+}}, i[[SZ]] %{{.*}}, i[[SZ]] %{{.*}}, i[[SZ]] %{{.*}}, [1000 x i32]* %{{.*}}, i32* %{{.*}})
|
||||
// CHECK: define internal void [[OUTLINED]](i32* noalias %{{.*}}, i32* noalias %{{.*}} i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, i[[SZ]] %{{.+}}, [1000 x i32]* nonnull align {{[0-9]+}} dereferenceable{{.*}}, i32* %{{.*}})
|
||||
|
||||
|
@ -1,18 +1,21 @@
|
||||
// Test target codegen - host bc file has to be created first.
|
||||
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
|
||||
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
|
||||
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix SEQ
|
||||
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix PAR
|
||||
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
|
||||
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
|
||||
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
|
||||
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ
|
||||
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ
|
||||
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR
|
||||
// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fopenmp-cuda-mode -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR
|
||||
// expected-no-diagnostics
|
||||
#ifndef HEADER
|
||||
#define HEADER
|
||||
|
||||
// Check that the execution mode of all 4 target regions on the gpu is set to SPMD Mode.
|
||||
// CHECK-DAG: {{@__omp_offloading_.+l30}}_exec_mode = weak constant i8 0
|
||||
// CHECK-DAG: {{@__omp_offloading_.+l36}}_exec_mode = weak constant i8 0
|
||||
// CHECK-DAG: {{@__omp_offloading_.+l41}}_exec_mode = weak constant i8 0
|
||||
// CHECK-DAG: {{@__omp_offloading_.+l46}}_exec_mode = weak constant i8 0
|
||||
// CHECK-DAG: {{@__omp_offloading_.+l33}}_exec_mode = weak constant i8 0
|
||||
// CHECK-DAG: {{@__omp_offloading_.+l39}}_exec_mode = weak constant i8 0
|
||||
// CHECK-DAG: {{@__omp_offloading_.+l44}}_exec_mode = weak constant i8 0
|
||||
// CHECK-DAG: {{@__omp_offloading_.+l49}}_exec_mode = weak constant i8 0
|
||||
|
||||
#define N 1000
|
||||
#define M 10
|
||||
@ -62,29 +65,31 @@ int bar(int n){
|
||||
return a;
|
||||
}
|
||||
|
||||
// CHECK-DAG: [[MEM_TY:%.+]] = type { [128 x i8] }
|
||||
// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer
|
||||
// CHECK-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
|
||||
// CHECK-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4
|
||||
// CHECK-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1
|
||||
// SEQ-DAG: [[MEM_TY:%.+]] = type { [128 x i8] }
|
||||
// SEQ-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer
|
||||
// SEQ-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
|
||||
// SEQ-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 4
|
||||
// SEQ-DAG: [[KERNEL_SHARED:@.+]] = internal unnamed_addr constant i16 1
|
||||
|
||||
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l30(
|
||||
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+}}_l33(
|
||||
// CHECK-DAG: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
|
||||
// CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]], i16 0, i16 0)
|
||||
// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 0)
|
||||
|
||||
// CHECK: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]],
|
||||
// CHECK: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]],
|
||||
// CHECK: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
|
||||
// CHECK: [[TEAM_ALLOC:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]],
|
||||
// CHECK: [[PTR:%.+]] = getelementptr inbounds i8, i8* [[TEAM_ALLOC]], i{{64|32}} 0
|
||||
// SEQ: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]],
|
||||
// SEQ: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE]],
|
||||
// SEQ: call void @__kmpc_get_team_static_memory(i16 1, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
|
||||
// SEQ: [[TEAM_ALLOC:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]],
|
||||
// SEQ: [[PTR:%.+]] = getelementptr inbounds i8, i8* [[TEAM_ALLOC]], i{{64|32}} 0
|
||||
// PAR: [[PTR:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} 4, i16 1)
|
||||
// CHECK: [[BC:%.+]] = bitcast i8* [[PTR]] to [[REC:%.+]]*
|
||||
// CHECK: getelementptr inbounds [[REC]], [[REC]]* [[BC]], i{{[0-9]+}} 0, i{{[0-9]+}} 0
|
||||
// CHECK: call void @__kmpc_for_static_init_4({{.+}}, {{.+}}, {{.+}} 91,
|
||||
// CHECK: {{call|invoke}} void [[OUTL1:@.+]](
|
||||
// CHECK: call void @__kmpc_for_static_fini(
|
||||
// CHECK: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]],
|
||||
// CHECK: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[SHARED]])
|
||||
// SEQ: [[SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED]],
|
||||
// SEQ: call void @__kmpc_restore_team_static_memory(i16 1, i16 [[SHARED]])
|
||||
// PAR: call void @__kmpc_data_sharing_pop_stack(i8* [[PTR]])
|
||||
// CHECK: ret void
|
||||
|
||||
// CHECK: define internal void [[OUTL1]](
|
||||
|
@ -1,8 +1,10 @@
|
||||
// Test target codegen - host bc file has to be created first.
|
||||
// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
|
||||
// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64
|
||||
// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-64 --check-prefix SEQ
|
||||
// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CK1 --check-prefix CK1-64 --check-prefix PAR
|
||||
// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
|
||||
// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-32
|
||||
// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CK1 --check-prefix CK1-32 --check-prefix SEQ
|
||||
// RUN: %clang_cc1 -DCK1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CK1 --check-prefix CK1-32 --check-prefix PAR
|
||||
// expected-no-diagnostics
|
||||
#ifndef HEADER
|
||||
#define HEADER
|
||||
@ -27,13 +29,13 @@ int main (int argc, char **argv) {
|
||||
return tmain(argv);
|
||||
}
|
||||
|
||||
// CK1: [[MEM_TY:%.+]] = type { [128 x i8] }
|
||||
// CK1-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer
|
||||
// CK1-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
|
||||
// CK1-DAG: [[KERNEL_SIZE1:@.+]] = internal unnamed_addr constant i{{64|32}} 4
|
||||
// CK1-DAG: [[KERNEL_SIZE2:@.+]] = internal unnamed_addr constant i{{64|32}} {{8|4}}
|
||||
// CK1-DAG: [[KERNEL_SHARED1:@.+]] = internal unnamed_addr constant i16 1
|
||||
// CK1-DAG: [[KERNEL_SHARED2:@.+]] = internal unnamed_addr constant i16 1
|
||||
// SEQ: [[MEM_TY:%.+]] = type { [128 x i8] }
|
||||
// SEQ-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer
|
||||
// SEQ-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
|
||||
// SEQ-DAG: [[KERNEL_SIZE1:@.+]] = internal unnamed_addr constant i{{64|32}} 4
|
||||
// SEQ-DAG: [[KERNEL_SIZE2:@.+]] = internal unnamed_addr constant i{{64|32}} {{8|4}}
|
||||
// SEQ-DAG: [[KERNEL_SHARED1:@.+]] = internal unnamed_addr constant i16 1
|
||||
// SEQ-DAG: [[KERNEL_SHARED2:@.+]] = internal unnamed_addr constant i16 1
|
||||
|
||||
// only nvptx side: do not outline teams region and do not call fork_teams
|
||||
// CK1: define {{.*}}void @{{[^,]+}}(i{{[0-9]+}} [[ARGC:%.+]])
|
||||
@ -41,11 +43,12 @@ int main (int argc, char **argv) {
|
||||
// CK1: store {{.+}} 0, {{.+}},
|
||||
// CK1: store i{{[0-9]+}} [[ARGC]], i{{[0-9]+}}* [[ARGCADDR]],
|
||||
// CK1-64: [[CONV:%.+]] = bitcast i{{[0-9]+}}* [[ARGCADDR]] to i{{[0-9]+}}*
|
||||
// CK1: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED1]],
|
||||
// CK1: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE1]],
|
||||
// CK1: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[IS_SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
|
||||
// CK1: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]],
|
||||
// CK1: [[GLOBALSTACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i{{64|32}} 0
|
||||
// SEQ: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED1]],
|
||||
// SEQ: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE1]],
|
||||
// SEQ: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[IS_SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
|
||||
// SEQ: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]],
|
||||
// SEQ: [[GLOBALSTACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i{{64|32}} 0
|
||||
// PAR: [[GLOBALSTACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} 4, i16 1)
|
||||
// CK1-64: [[ARG:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[CONV]]
|
||||
// CK1-32: [[ARG:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[ARGCADDR]]
|
||||
// CK1: [[ARGCADDR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
|
||||
@ -62,11 +65,12 @@ int main (int argc, char **argv) {
|
||||
// CK1: define {{.*}}void @{{[^,]+}}(i{{.+}}** [[ARGC:%.+]])
|
||||
// CK1: [[ARGCADDR:%.+]] = alloca i{{.+}}**,
|
||||
// CK1: store i{{.+}}** [[ARGC]], i{{.+}}*** [[ARGCADDR]]
|
||||
// CK1: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED2]],
|
||||
// CK1: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE2]],
|
||||
// CK1: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[IS_SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
|
||||
// CK1: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]],
|
||||
// CK1: [[GLOBALSTACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i{{64|32}} 0
|
||||
// SEQ: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED2]],
|
||||
// SEQ: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE2]],
|
||||
// SEQ: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[IS_SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
|
||||
// SEQ: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]],
|
||||
// SEQ: [[GLOBALSTACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i{{64|32}} 0
|
||||
// PAR: [[GLOBALSTACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} {{4|8}}, i16 1)
|
||||
// CK1: [[ARG:%.+]] = load i{{[0-9]+}}**, i{{[0-9]+}}*** [[ARGCADDR]]
|
||||
// CK1: [[ARGCADDR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
|
||||
// CK1: store i{{[0-9]+}}** [[ARG]], i{{[0-9]+}}*** [[ARGCADDR]],
|
||||
@ -83,9 +87,11 @@ int main (int argc, char **argv) {
|
||||
|
||||
// Test target codegen - host bc file has to be created first.
|
||||
// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
|
||||
// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-64
|
||||
// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-64 --check-prefix SEQ2
|
||||
// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CK2 --check-prefix CK2-64 --check-prefix PAR2
|
||||
// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
|
||||
// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-32
|
||||
// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CK2 --check-prefix CK2-32 --check-prefix SEQ2
|
||||
// RUN: %clang_cc1 -DCK2 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CK2 --check-prefix CK2-32 --check-prefix PAR2
|
||||
// expected-no-diagnostics
|
||||
#ifdef CK2
|
||||
|
||||
@ -112,13 +118,13 @@ int main (int argc, char **argv) {
|
||||
return tmain(argv);
|
||||
}
|
||||
|
||||
// CK2: [[MEM_TY:%.+]] = type { [128 x i8] }
|
||||
// CK2-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer
|
||||
// CK2-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
|
||||
// CK2-DAG: [[KERNEL_SIZE1:@.+]] = internal unnamed_addr constant i{{64|32}} 4
|
||||
// CK2-DAG: [[KERNEL_SIZE2:@.+]] = internal unnamed_addr constant i{{64|32}} {{8|4}}
|
||||
// CK2-DAG: [[KERNEL_SHARED1:@.+]] = internal unnamed_addr constant i16 1
|
||||
// CK2-DAG: [[KERNEL_SHARED2:@.+]] = internal unnamed_addr constant i16 1
|
||||
// SEQ2: [[MEM_TY:%.+]] = type { [128 x i8] }
|
||||
// SEQ2-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer
|
||||
// SEQ2-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
|
||||
// SEQ2-DAG: [[KERNEL_SIZE1:@.+]] = internal unnamed_addr constant i{{64|32}} 4
|
||||
// SEQ2-DAG: [[KERNEL_SIZE2:@.+]] = internal unnamed_addr constant i{{64|32}} {{8|4}}
|
||||
// SEQ2-DAG: [[KERNEL_SHARED1:@.+]] = internal unnamed_addr constant i16 1
|
||||
// SEQ2-DAG: [[KERNEL_SHARED2:@.+]] = internal unnamed_addr constant i16 1
|
||||
|
||||
// CK2: define {{.*}}void @{{[^,]+}}(i{{[0-9]+}} [[A_IN:%.+]], i{{[0-9]+}} [[B_IN:%.+]], i{{[0-9]+}} [[ARGC_IN:.+]])
|
||||
// CK2: [[AADDR:%.+]] = alloca i{{[0-9]+}},
|
||||
@ -130,11 +136,12 @@ int main (int argc, char **argv) {
|
||||
// CK2-64: [[ACONV:%.+]] = bitcast i64* [[AADDR]] to i32*
|
||||
// CK2-64: [[BCONV:%.+]] = bitcast i64* [[BADDR]] to i32*
|
||||
// CK2-64: [[CONV:%.+]] = bitcast i64* [[ARGCADDR]] to i32*
|
||||
// CK2: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED1]],
|
||||
// CK2: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE1]],
|
||||
// CK2: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[IS_SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
|
||||
// CK2: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]],
|
||||
// CK2: [[GLOBALSTACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i{{64|32}} 0
|
||||
// SEQ2: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED1]],
|
||||
// SEQ2: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE1]],
|
||||
// SEQ2: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[IS_SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
|
||||
// SEQ2: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]],
|
||||
// SEQ2: [[GLOBALSTACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i{{64|32}} 0
|
||||
// PAR2: [[GLOBALSTACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} 4, i16 1)
|
||||
// CK2-64: [[ARG:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[CONV]]
|
||||
// CK2-32: [[ARG:%.+]] = load i{{[0-9]+}}, i{{[0-9]+}}* [[ARGCADDR]]
|
||||
// CK2: [[ARGCADDR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
|
||||
@ -155,11 +162,12 @@ int main (int argc, char **argv) {
|
||||
// CK2: store i{{[0-9]+}} [[A_IN]], i{{[0-9]+}}* [[AADDR]],
|
||||
// CK2: store i{{[0-9]+}} [[B_IN]], i{{[0-9]+}}* [[BADDR]],
|
||||
// CK2: store i{{[0-9]+}}** [[ARGC]], i{{[0-9]+}}*** [[ARGCADDR]],
|
||||
// CK2: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED2]],
|
||||
// CK2: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE2]],
|
||||
// CK2: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[IS_SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
|
||||
// CK2: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]],
|
||||
// CK2: [[GLOBALSTACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i{{64|32}} 0
|
||||
// SEQ2: [[IS_SHARED:%.+]] = load i16, i16* [[KERNEL_SHARED2]],
|
||||
// SEQ2: [[SIZE:%.+]] = load i{{64|32}}, i{{64|32}}* [[KERNEL_SIZE2]],
|
||||
// SEQ2: call void @__kmpc_get_team_static_memory(i16 0, i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} [[SIZE]], i16 [[IS_SHARED]], i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
|
||||
// SEQ2: [[KERNEL_RD:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]],
|
||||
// SEQ2: [[GLOBALSTACK:%.+]] = getelementptr inbounds i8, i8* [[KERNEL_RD]], i{{64|32}} 0
|
||||
// PAR2: [[GLOBALSTACK:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{32|64}} {{4|8}}, i16 1)
|
||||
// CK2: [[ARG:%.+]] = load i{{[0-9]+}}**, i{{[0-9]+}}*** [[ARGCADDR]]
|
||||
// CK2: [[ARGCADDR:%.+]] = getelementptr inbounds %struct.{{.*}}, %struct.{{.*}}* %{{.*}}, i{{[0-9]+}} 0, i{{[0-9]+}} 0
|
||||
// CK2: store i{{[0-9]+}}** [[ARG]], i{{[0-9]+}}*** [[ARGCADDR]],
|
||||
|
@ -1,9 +1,12 @@
|
||||
// Test target codegen - host bc file has to be created first.
|
||||
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
|
||||
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
|
||||
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix SEQ
|
||||
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 --check-prefix PAR
|
||||
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
|
||||
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
|
||||
// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -fopenmp-cuda-teams-reduction-recs-num=2048 -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
|
||||
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ
|
||||
// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -fopenmp-cuda-teams-reduction-recs-num=2048 -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix SEQ
|
||||
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR
|
||||
// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -fopenmp-cuda-teams-reduction-recs-num=2048 -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 --check-prefix PAR
|
||||
// expected-no-diagnostics
|
||||
#ifndef HEADER
|
||||
#define HEADER
|
||||
@ -12,21 +15,21 @@
|
||||
// CHECK-DAG: [[TEAM2_REDUCE_TY:%.+]] = type { [{{1024|2048}} x i8], [{{1024|2048}} x float] }
|
||||
// CHECK-DAG: [[TEAM3_REDUCE_TY:%.+]] = type { [{{1024|2048}} x i32], [{{1024|2048}} x i16] }
|
||||
// CHECK-DAG: [[TEAMS_REDUCE_UNION_TY:%.+]] = type { [[TEAM1_REDUCE_TY]] }
|
||||
// CHECK-DAG: [[MAP_TY:%.+]] = type { [128 x i8] }
|
||||
// SEQ-DAG: [[MAP_TY:%.+]] = type { [128 x i8] }
|
||||
|
||||
// CHECK-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
|
||||
// CHECK-DAG: [[KERNEL_SHARED1:@.+]] = internal unnamed_addr constant i16 1
|
||||
// CHECK-DAG: [[KERNEL_SHARED2:@.+]] = internal unnamed_addr constant i16 1
|
||||
// CHECK-DAG: [[KERNEL_SIZE1:@.+]] = internal unnamed_addr constant i{{64|32}} {{16|8}}
|
||||
// CHECK-DAG: [[KERNEL_SIZE2:@.+]] = internal unnamed_addr constant i{{64|32}} 16
|
||||
// SEQ-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
|
||||
// SEQ-DAG: [[KERNEL_SHARED1:@.+]] = internal unnamed_addr constant i16 1
|
||||
// SEQ-DAG: [[KERNEL_SHARED2:@.+]] = internal unnamed_addr constant i16 1
|
||||
// SEQ-DAG: [[KERNEL_SIZE1:@.+]] = internal unnamed_addr constant i{{64|32}} {{16|8}}
|
||||
// SEQ-DAG: [[KERNEL_SIZE2:@.+]] = internal unnamed_addr constant i{{64|32}} 16
|
||||
|
||||
// Check for the data transfer medium in shared memory to transfer the reduction list to the first warp.
|
||||
// CHECK-DAG: [[TRANSFER_STORAGE:@.+]] = common addrspace([[SHARED_ADDRSPACE:[0-9]+]]) global [32 x i32]
|
||||
|
||||
// Check that the execution mode of 2 target regions is set to Non-SPMD and the 3rd is in SPMD.
|
||||
// CHECK-DAG: {{@__omp_offloading_.+l41}}_exec_mode = weak constant i8 1
|
||||
// CHECK-DAG: {{@__omp_offloading_.+l47}}_exec_mode = weak constant i8 1
|
||||
// CHECK-DAG: {{@__omp_offloading_.+l54}}_exec_mode = weak constant i8 0
|
||||
// CHECK-DAG: {{@__omp_offloading_.+l44}}_exec_mode = weak constant i8 1
|
||||
// CHECK-DAG: {{@__omp_offloading_.+l50}}_exec_mode = weak constant i8 1
|
||||
// CHECK-DAG: {{@__omp_offloading_.+l57}}_exec_mode = weak constant i8 0
|
||||
|
||||
// CHECK-DAG: [[TEAMS_RED_BUFFER:@.+]] = internal global [[TEAMS_REDUCE_UNION_TY]] zeroinitializer
|
||||
|
||||
@ -70,9 +73,9 @@ int bar(int n){
|
||||
return a;
|
||||
}
|
||||
|
||||
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l41}}_worker()
|
||||
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l44}}_worker()
|
||||
|
||||
// CHECK: define {{.*}}void [[T1:@__omp_offloading_.+template.+l41]](
|
||||
// CHECK: define {{.*}}void [[T1:@__omp_offloading_.+template.+l44]](
|
||||
//
|
||||
// CHECK: {{call|invoke}} void [[T1]]_worker()
|
||||
//
|
||||
@ -337,9 +340,9 @@ int bar(int n){
|
||||
// CHECK: call void [[REDUCTION_FUNC]](i8* [[RL_BC]], i8* [[LOCAL_RL_BC]])
|
||||
// CHECK: ret void
|
||||
|
||||
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l47}}_worker()
|
||||
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l50}}_worker()
|
||||
|
||||
// CHECK: define {{.*}}void [[T2:@__omp_offloading_.+template.+l47]](
|
||||
// CHECK: define {{.*}}void [[T2:@__omp_offloading_.+template.+l50]](
|
||||
//
|
||||
// CHECK: {{call|invoke}} void [[T2]]_worker()
|
||||
|
||||
@ -704,13 +707,13 @@ int bar(int n){
|
||||
// CHECK: call void [[REDUCTION_FUNC]](i8* [[RL_BC]], i8* [[LOCAL_RL_BC]])
|
||||
// CHECK: ret void
|
||||
|
||||
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l54}}(
|
||||
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l57}}(
|
||||
//
|
||||
// CHECK: call void @__kmpc_spmd_kernel_init(
|
||||
// CHECK: call void @__kmpc_data_sharing_init_stack_spmd()
|
||||
// CHECK: call void @__kmpc_spmd_kernel_deinit_v2(i16 1)
|
||||
|
||||
// CHECK-NOT: call void @__kmpc_get_team_static_memory
|
||||
// CHECK-NOT: call void @{{__kmpc_get_team_static_memory|__kmpc_data_sharing_push_stack}}
|
||||
// CHECK: store i32 0,
|
||||
// CHECK: store i32 0,
|
||||
// CHECK: store i32 0, i32* [[A_ADDR:%.+]], align
|
||||
|
Loading…
x
Reference in New Issue
Block a user