[OpenMP][FIX] Allocate per launch memory for GPU team reductions (#70752)

We used to perform team reduction on global memory allocated in the
runtime and by clang. This was racy as multiple instances of a kernel,
or different kernels with team reductions, would use the same locations.
Since we now have the kernel launch environment, we can allocate dynamic
memory per-launch, allowing us to move all the state into a non-racy
place.

Fixes: https://github.com/llvm/llvm-project/issues/70249
This commit is contained in:
Johannes Doerfert 2023-11-01 11:11:48 -07:00 committed by GitHub
parent 0d3377c496
commit f9a89e6b9c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 231 additions and 195 deletions

View File

@ -803,8 +803,30 @@ void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
if (!IsSPMD)
emitGenericVarsEpilog(CGF);
// This is temporary until we remove the fixed sized buffer.
ASTContext &C = CGM.getContext();
RecordDecl *StaticRD = C.buildImplicitRecord(
"_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union);
StaticRD->startDefinition();
for (const RecordDecl *TeamReductionRec : TeamsReductions) {
QualType RecTy = C.getRecordType(TeamReductionRec);
auto *Field = FieldDecl::Create(
C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
/*BW=*/nullptr, /*Mutable=*/false,
/*InitStyle=*/ICIS_NoInit);
Field->setAccess(AS_public);
StaticRD->addDecl(Field);
}
StaticRD->completeDefinition();
QualType StaticTy = C.getRecordType(StaticRD);
llvm::Type *LLVMReductionsBufferTy =
CGM.getTypes().ConvertTypeForMem(StaticTy);
const auto &DL = CGM.getModule().getDataLayout();
uint64_t BufferSize =
DL.getTypeAllocSize(LLVMReductionsBufferTy).getFixedValue();
CGBuilderTy &Bld = CGF.Builder;
OMPBuilder.createTargetDeinit(Bld);
OMPBuilder.createTargetDeinit(Bld, BufferSize);
}
void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
@ -2998,15 +3020,10 @@ void CGOpenMPRuntimeGPU::emitReduction(
CGM.getContext(), PrivatesReductions, std::nullopt, VarFieldMap,
C.getLangOpts().OpenMPCUDAReductionBufNum);
TeamsReductions.push_back(TeamReductionRec);
if (!KernelTeamsReductionPtr) {
KernelTeamsReductionPtr = new llvm::GlobalVariable(
CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true,
llvm::GlobalValue::InternalLinkage, nullptr,
"_openmp_teams_reductions_buffer_$_$ptr");
}
llvm::Value *GlobalBufferPtr = CGF.EmitLoadOfScalar(
Address(KernelTeamsReductionPtr, CGF.VoidPtrTy, CGM.getPointerAlign()),
/*Volatile=*/false, C.getPointerType(C.VoidPtrTy), Loc);
auto *KernelTeamsReductionPtr = CGF.EmitRuntimeCall(
OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_reduction_get_fixed_buffer),
{}, "_openmp_teams_reductions_buffer_$_$ptr");
llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
CGM, Privates, ReductionArrayTy, Loc, TeamReductionRec, VarFieldMap);
llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
@ -3021,7 +3038,7 @@ void CGOpenMPRuntimeGPU::emitReduction(
llvm::Value *Args[] = {
RTLoc,
ThreadId,
GlobalBufferPtr,
KernelTeamsReductionPtr,
CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
RL,
ShuffleAndReduceFn,
@ -3654,42 +3671,6 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(
CGOpenMPRuntime::processRequiresDirective(D);
}
void CGOpenMPRuntimeGPU::clear() {
if (!TeamsReductions.empty()) {
ASTContext &C = CGM.getContext();
RecordDecl *StaticRD = C.buildImplicitRecord(
"_openmp_teams_reduction_type_$_", RecordDecl::TagKind::TTK_Union);
StaticRD->startDefinition();
for (const RecordDecl *TeamReductionRec : TeamsReductions) {
QualType RecTy = C.getRecordType(TeamReductionRec);
auto *Field = FieldDecl::Create(
C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
/*BW=*/nullptr, /*Mutable=*/false,
/*InitStyle=*/ICIS_NoInit);
Field->setAccess(AS_public);
StaticRD->addDecl(Field);
}
StaticRD->completeDefinition();
QualType StaticTy = C.getRecordType(StaticRD);
llvm::Type *LLVMReductionsBufferTy =
CGM.getTypes().ConvertTypeForMem(StaticTy);
// FIXME: nvlink does not handle weak linkage correctly (object with the
// different size are reported as erroneous).
// Restore CommonLinkage as soon as nvlink is fixed.
auto *GV = new llvm::GlobalVariable(
CGM.getModule(), LLVMReductionsBufferTy,
/*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
llvm::Constant::getNullValue(LLVMReductionsBufferTy),
"_openmp_teams_reductions_buffer_$_");
KernelTeamsReductionPtr->setInitializer(
llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV,
CGM.VoidPtrTy));
}
CGOpenMPRuntime::clear();
}
llvm::Value *CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction &CGF) {
CGBuilderTy &Bld = CGF.Builder;
llvm::Module *M = &CGF.CGM.getModule();

View File

@ -130,7 +130,6 @@ protected:
public:
explicit CGOpenMPRuntimeGPU(CodeGenModule &CGM);
void clear() override;
bool isGPU() const override { return true; };
@ -386,7 +385,6 @@ private:
/// Maps the function to the list of the globalized variables with their
/// addresses.
llvm::SmallDenseMap<llvm::Function *, FunctionData> FunctionGlobalizedDecls;
llvm::GlobalVariable *KernelTeamsReductionPtr = nullptr;
/// List of the records with the list of fields for the reductions across the
/// teams. Used to build the intermediate buffer for the fast teams
/// reductions.

View File

@ -97,14 +97,14 @@ int bar(int n){
// CHECK1-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
// CHECK1-NEXT: store ptr [[E1]], ptr [[TMP4]], align 8
// CHECK1-NEXT: [[TMP5:%.*]] = load ptr, ptr @"_openmp_teams_reductions_buffer_$_$ptr", align 8
// CHECK1-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP3]], ptr [[TMP5]], i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func, ptr @_omp_reduction_inter_warp_copy_func, ptr @_omp_reduction_list_to_global_copy_func, ptr @_omp_reduction_list_to_global_reduce_func, ptr @_omp_reduction_global_to_list_copy_func, ptr @_omp_reduction_global_to_list_reduce_func)
// CHECK1-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 1
// CHECK1-NEXT: br i1 [[TMP7]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK1-NEXT: %"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer()
// CHECK1-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP3]], ptr %"_openmp_teams_reductions_buffer_$_$ptr", i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func, ptr @_omp_reduction_inter_warp_copy_func, ptr @_omp_reduction_list_to_global_copy_func, ptr @_omp_reduction_list_to_global_reduce_func, ptr @_omp_reduction_global_to_list_copy_func, ptr @_omp_reduction_global_to_list_reduce_func)
// CHECK1-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 1
// CHECK1-NEXT: br i1 [[TMP6]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK1: .omp.reduction.then:
// CHECK1-NEXT: [[TMP8:%.*]] = load double, ptr [[TMP0]], align 8
// CHECK1-NEXT: [[TMP9:%.*]] = load double, ptr [[E1]], align 8
// CHECK1-NEXT: [[ADD2:%.*]] = fadd double [[TMP8]], [[TMP9]]
// CHECK1-NEXT: [[TMP7:%.*]] = load double, ptr [[TMP0]], align 8
// CHECK1-NEXT: [[TMP8:%.*]] = load double, ptr [[E1]], align 8
// CHECK1-NEXT: [[ADD2:%.*]] = fadd double [[TMP7]], [[TMP8]]
// CHECK1-NEXT: store double [[ADD2]], ptr [[TMP0]], align 8
// CHECK1-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP3]])
// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
@ -386,21 +386,21 @@ int bar(int n){
// CHECK1-NEXT: store ptr [[C1]], ptr [[TMP6]], align 8
// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
// CHECK1-NEXT: store ptr [[D2]], ptr [[TMP7]], align 8
// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr @"_openmp_teams_reductions_buffer_$_$ptr", align 8
// CHECK1-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP5]], ptr [[TMP8]], i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func1, ptr @_omp_reduction_inter_warp_copy_func2, ptr @_omp_reduction_list_to_global_copy_func3, ptr @_omp_reduction_list_to_global_reduce_func4, ptr @_omp_reduction_global_to_list_copy_func5, ptr @_omp_reduction_global_to_list_reduce_func6)
// CHECK1-NEXT: [[TMP10:%.*]] = icmp eq i32 [[TMP9]], 1
// CHECK1-NEXT: br i1 [[TMP10]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK1-NEXT: %"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer()
// CHECK1-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP5]], ptr %"_openmp_teams_reductions_buffer_$_$ptr", i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func1, ptr @_omp_reduction_inter_warp_copy_func2, ptr @_omp_reduction_list_to_global_copy_func3, ptr @_omp_reduction_list_to_global_reduce_func4, ptr @_omp_reduction_global_to_list_copy_func5, ptr @_omp_reduction_global_to_list_reduce_func6)
// CHECK1-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 1
// CHECK1-NEXT: br i1 [[TMP9]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK1: .omp.reduction.then:
// CHECK1-NEXT: [[TMP11:%.*]] = load i8, ptr [[TMP0]], align 1
// CHECK1-NEXT: [[CONV4:%.*]] = sext i8 [[TMP11]] to i32
// CHECK1-NEXT: [[TMP12:%.*]] = load i8, ptr [[C1]], align 1
// CHECK1-NEXT: [[CONV5:%.*]] = sext i8 [[TMP12]] to i32
// CHECK1-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP0]], align 1
// CHECK1-NEXT: [[CONV4:%.*]] = sext i8 [[TMP10]] to i32
// CHECK1-NEXT: [[TMP11:%.*]] = load i8, ptr [[C1]], align 1
// CHECK1-NEXT: [[CONV5:%.*]] = sext i8 [[TMP11]] to i32
// CHECK1-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]]
// CHECK1-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8
// CHECK1-NEXT: store i8 [[CONV7]], ptr [[TMP0]], align 1
// CHECK1-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP1]], align 4
// CHECK1-NEXT: [[TMP14:%.*]] = load float, ptr [[D2]], align 4
// CHECK1-NEXT: [[MUL8:%.*]] = fmul float [[TMP13]], [[TMP14]]
// CHECK1-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP1]], align 4
// CHECK1-NEXT: [[TMP13:%.*]] = load float, ptr [[D2]], align 4
// CHECK1-NEXT: [[MUL8:%.*]] = fmul float [[TMP12]], [[TMP13]]
// CHECK1-NEXT: store float [[MUL8]], ptr [[TMP1]], align 4
// CHECK1-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
@ -727,29 +727,29 @@ int bar(int n){
// CHECK1-NEXT: store ptr [[A1]], ptr [[TMP6]], align 8
// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
// CHECK1-NEXT: store ptr [[B2]], ptr [[TMP7]], align 8
// CHECK1-NEXT: [[TMP8:%.*]] = load ptr, ptr @"_openmp_teams_reductions_buffer_$_$ptr", align 8
// CHECK1-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP5]], ptr [[TMP8]], i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func9, ptr @_omp_reduction_inter_warp_copy_func10, ptr @_omp_reduction_list_to_global_copy_func11, ptr @_omp_reduction_list_to_global_reduce_func12, ptr @_omp_reduction_global_to_list_copy_func13, ptr @_omp_reduction_global_to_list_reduce_func14)
// CHECK1-NEXT: [[TMP10:%.*]] = icmp eq i32 [[TMP9]], 1
// CHECK1-NEXT: br i1 [[TMP10]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK1-NEXT: %"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer()
// CHECK1-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP5]], ptr %"_openmp_teams_reductions_buffer_$_$ptr", i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func9, ptr @_omp_reduction_inter_warp_copy_func10, ptr @_omp_reduction_list_to_global_copy_func11, ptr @_omp_reduction_list_to_global_reduce_func12, ptr @_omp_reduction_global_to_list_copy_func13, ptr @_omp_reduction_global_to_list_reduce_func14)
// CHECK1-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 1
// CHECK1-NEXT: br i1 [[TMP9]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK1: .omp.reduction.then:
// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP0]], align 4
// CHECK1-NEXT: [[TMP12:%.*]] = load i32, ptr [[A1]], align 4
// CHECK1-NEXT: [[OR:%.*]] = or i32 [[TMP11]], [[TMP12]]
// CHECK1-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP0]], align 4
// CHECK1-NEXT: [[TMP11:%.*]] = load i32, ptr [[A1]], align 4
// CHECK1-NEXT: [[OR:%.*]] = or i32 [[TMP10]], [[TMP11]]
// CHECK1-NEXT: store i32 [[OR]], ptr [[TMP0]], align 4
// CHECK1-NEXT: [[TMP13:%.*]] = load i16, ptr [[TMP1]], align 2
// CHECK1-NEXT: [[CONV:%.*]] = sext i16 [[TMP13]] to i32
// CHECK1-NEXT: [[TMP14:%.*]] = load i16, ptr [[B2]], align 2
// CHECK1-NEXT: [[CONV3:%.*]] = sext i16 [[TMP14]] to i32
// CHECK1-NEXT: [[TMP12:%.*]] = load i16, ptr [[TMP1]], align 2
// CHECK1-NEXT: [[CONV:%.*]] = sext i16 [[TMP12]] to i32
// CHECK1-NEXT: [[TMP13:%.*]] = load i16, ptr [[B2]], align 2
// CHECK1-NEXT: [[CONV3:%.*]] = sext i16 [[TMP13]] to i32
// CHECK1-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]]
// CHECK1-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
// CHECK1: cond.true:
// CHECK1-NEXT: [[TMP15:%.*]] = load i16, ptr [[TMP1]], align 2
// CHECK1-NEXT: [[TMP14:%.*]] = load i16, ptr [[TMP1]], align 2
// CHECK1-NEXT: br label [[COND_END:%.*]]
// CHECK1: cond.false:
// CHECK1-NEXT: [[TMP16:%.*]] = load i16, ptr [[B2]], align 2
// CHECK1-NEXT: [[TMP15:%.*]] = load i16, ptr [[B2]], align 2
// CHECK1-NEXT: br label [[COND_END]]
// CHECK1: cond.end:
// CHECK1-NEXT: [[COND:%.*]] = phi i16 [ [[TMP15]], [[COND_TRUE]] ], [ [[TMP16]], [[COND_FALSE]] ]
// CHECK1-NEXT: [[COND:%.*]] = phi i16 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ]
// CHECK1-NEXT: store i16 [[COND]], ptr [[TMP1]], align 2
// CHECK1-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
// CHECK1-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
@ -1157,13 +1157,13 @@ int bar(int n){
// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTADDR1]], align 4
// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i64 0, i64 0
// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], ptr [[TMP4]], i32 0, i32 0
// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP4]], i32 0, i32 0
// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i32 0, i32 [[TMP5]]
// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP7]], align 4
// CHECK1-NEXT: store i32 [[TMP9]], ptr [[TMP8]], align 128
// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i64 0, i64 1
// CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 8
// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], ptr [[TMP4]], i32 0, i32 1
// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP4]], i32 0, i32 1
// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [1024 x i16], ptr [[B]], i32 0, i32 [[TMP5]]
// CHECK1-NEXT: [[TMP13:%.*]] = load i16, ptr [[TMP11]], align 2
// CHECK1-NEXT: store i16 [[TMP13]], ptr [[TMP12]], align 128
@ -1183,11 +1183,11 @@ int bar(int n){
// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 8
// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTADDR1]], align 4
// CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], ptr [[TMP3]], i32 0, i32 0
// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP3]], i32 0, i32 0
// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i32 0, i32 [[TMP4]]
// CHECK1-NEXT: store ptr [[TMP6]], ptr [[TMP5]], align 8
// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], ptr [[TMP3]], i32 0, i32 1
// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP3]], i32 0, i32 1
// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i16], ptr [[B]], i32 0, i32 [[TMP4]]
// CHECK1-NEXT: store ptr [[TMP8]], ptr [[TMP7]], align 8
// CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[DOTADDR2]], align 8
@ -1209,13 +1209,13 @@ int bar(int n){
// CHECK1-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTADDR1]], align 4
// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i64 0, i64 0
// CHECK1-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 8
// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], ptr [[TMP4]], i32 0, i32 0
// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP4]], i32 0, i32 0
// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i32 0, i32 [[TMP5]]
// CHECK1-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 128
// CHECK1-NEXT: store i32 [[TMP9]], ptr [[TMP7]], align 4
// CHECK1-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i64 0, i64 1
// CHECK1-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 8
// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], ptr [[TMP4]], i32 0, i32 1
// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP4]], i32 0, i32 1
// CHECK1-NEXT: [[TMP12:%.*]] = getelementptr inbounds [1024 x i16], ptr [[B]], i32 0, i32 [[TMP5]]
// CHECK1-NEXT: [[TMP13:%.*]] = load i16, ptr [[TMP12]], align 128
// CHECK1-NEXT: store i16 [[TMP13]], ptr [[TMP11]], align 2
@ -1235,11 +1235,11 @@ int bar(int n){
// CHECK1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 8
// CHECK1-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTADDR1]], align 4
// CHECK1-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 0
// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], ptr [[TMP3]], i32 0, i32 0
// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP3]], i32 0, i32 0
// CHECK1-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i32 0, i32 [[TMP4]]
// CHECK1-NEXT: store ptr [[TMP6]], ptr [[TMP5]], align 8
// CHECK1-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i64 0, i64 1
// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], ptr [[TMP3]], i32 0, i32 1
// CHECK1-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP3]], i32 0, i32 1
// CHECK1-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i16], ptr [[B]], i32 0, i32 [[TMP4]]
// CHECK1-NEXT: store ptr [[TMP8]], ptr [[TMP7]], align 8
// CHECK1-NEXT: [[TMP9:%.*]] = load ptr, ptr [[DOTADDR2]], align 8
@ -1294,14 +1294,14 @@ int bar(int n){
// CHECK2-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
// CHECK2-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
// CHECK2-NEXT: store ptr [[E1]], ptr [[TMP4]], align 4
// CHECK2-NEXT: [[TMP5:%.*]] = load ptr, ptr @"_openmp_teams_reductions_buffer_$_$ptr", align 4
// CHECK2-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP3]], ptr [[TMP5]], i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func, ptr @_omp_reduction_inter_warp_copy_func, ptr @_omp_reduction_list_to_global_copy_func, ptr @_omp_reduction_list_to_global_reduce_func, ptr @_omp_reduction_global_to_list_copy_func, ptr @_omp_reduction_global_to_list_reduce_func)
// CHECK2-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 1
// CHECK2-NEXT: br i1 [[TMP7]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK2-NEXT: %"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer()
// CHECK2-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP3]], ptr %"_openmp_teams_reductions_buffer_$_$ptr", i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func, ptr @_omp_reduction_inter_warp_copy_func, ptr @_omp_reduction_list_to_global_copy_func, ptr @_omp_reduction_list_to_global_reduce_func, ptr @_omp_reduction_global_to_list_copy_func, ptr @_omp_reduction_global_to_list_reduce_func)
// CHECK2-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 1
// CHECK2-NEXT: br i1 [[TMP6]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK2: .omp.reduction.then:
// CHECK2-NEXT: [[TMP8:%.*]] = load double, ptr [[TMP0]], align 8
// CHECK2-NEXT: [[TMP9:%.*]] = load double, ptr [[E1]], align 8
// CHECK2-NEXT: [[ADD2:%.*]] = fadd double [[TMP8]], [[TMP9]]
// CHECK2-NEXT: [[TMP7:%.*]] = load double, ptr [[TMP0]], align 8
// CHECK2-NEXT: [[TMP8:%.*]] = load double, ptr [[E1]], align 8
// CHECK2-NEXT: [[ADD2:%.*]] = fadd double [[TMP7]], [[TMP8]]
// CHECK2-NEXT: store double [[ADD2]], ptr [[TMP0]], align 8
// CHECK2-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP3]])
// CHECK2-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
@ -1583,21 +1583,21 @@ int bar(int n){
// CHECK2-NEXT: store ptr [[C1]], ptr [[TMP6]], align 4
// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
// CHECK2-NEXT: store ptr [[D2]], ptr [[TMP7]], align 4
// CHECK2-NEXT: [[TMP8:%.*]] = load ptr, ptr @"_openmp_teams_reductions_buffer_$_$ptr", align 4
// CHECK2-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP5]], ptr [[TMP8]], i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func1, ptr @_omp_reduction_inter_warp_copy_func2, ptr @_omp_reduction_list_to_global_copy_func3, ptr @_omp_reduction_list_to_global_reduce_func4, ptr @_omp_reduction_global_to_list_copy_func5, ptr @_omp_reduction_global_to_list_reduce_func6)
// CHECK2-NEXT: [[TMP10:%.*]] = icmp eq i32 [[TMP9]], 1
// CHECK2-NEXT: br i1 [[TMP10]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK2-NEXT: %"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer()
// CHECK2-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP5]], ptr %"_openmp_teams_reductions_buffer_$_$ptr", i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func1, ptr @_omp_reduction_inter_warp_copy_func2, ptr @_omp_reduction_list_to_global_copy_func3, ptr @_omp_reduction_list_to_global_reduce_func4, ptr @_omp_reduction_global_to_list_copy_func5, ptr @_omp_reduction_global_to_list_reduce_func6)
// CHECK2-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 1
// CHECK2-NEXT: br i1 [[TMP9]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK2: .omp.reduction.then:
// CHECK2-NEXT: [[TMP11:%.*]] = load i8, ptr [[TMP0]], align 1
// CHECK2-NEXT: [[CONV4:%.*]] = sext i8 [[TMP11]] to i32
// CHECK2-NEXT: [[TMP12:%.*]] = load i8, ptr [[C1]], align 1
// CHECK2-NEXT: [[CONV5:%.*]] = sext i8 [[TMP12]] to i32
// CHECK2-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP0]], align 1
// CHECK2-NEXT: [[CONV4:%.*]] = sext i8 [[TMP10]] to i32
// CHECK2-NEXT: [[TMP11:%.*]] = load i8, ptr [[C1]], align 1
// CHECK2-NEXT: [[CONV5:%.*]] = sext i8 [[TMP11]] to i32
// CHECK2-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]]
// CHECK2-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8
// CHECK2-NEXT: store i8 [[CONV7]], ptr [[TMP0]], align 1
// CHECK2-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP1]], align 4
// CHECK2-NEXT: [[TMP14:%.*]] = load float, ptr [[D2]], align 4
// CHECK2-NEXT: [[MUL8:%.*]] = fmul float [[TMP13]], [[TMP14]]
// CHECK2-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP1]], align 4
// CHECK2-NEXT: [[TMP13:%.*]] = load float, ptr [[D2]], align 4
// CHECK2-NEXT: [[MUL8:%.*]] = fmul float [[TMP12]], [[TMP13]]
// CHECK2-NEXT: store float [[MUL8]], ptr [[TMP1]], align 4
// CHECK2-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
// CHECK2-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
@ -1924,29 +1924,29 @@ int bar(int n){
// CHECK2-NEXT: store ptr [[A1]], ptr [[TMP6]], align 4
// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
// CHECK2-NEXT: store ptr [[B2]], ptr [[TMP7]], align 4
// CHECK2-NEXT: [[TMP8:%.*]] = load ptr, ptr @"_openmp_teams_reductions_buffer_$_$ptr", align 4
// CHECK2-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP5]], ptr [[TMP8]], i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func9, ptr @_omp_reduction_inter_warp_copy_func10, ptr @_omp_reduction_list_to_global_copy_func11, ptr @_omp_reduction_list_to_global_reduce_func12, ptr @_omp_reduction_global_to_list_copy_func13, ptr @_omp_reduction_global_to_list_reduce_func14)
// CHECK2-NEXT: [[TMP10:%.*]] = icmp eq i32 [[TMP9]], 1
// CHECK2-NEXT: br i1 [[TMP10]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK2-NEXT: %"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer()
// CHECK2-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP5]], ptr %"_openmp_teams_reductions_buffer_$_$ptr", i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func9, ptr @_omp_reduction_inter_warp_copy_func10, ptr @_omp_reduction_list_to_global_copy_func11, ptr @_omp_reduction_list_to_global_reduce_func12, ptr @_omp_reduction_global_to_list_copy_func13, ptr @_omp_reduction_global_to_list_reduce_func14)
// CHECK2-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 1
// CHECK2-NEXT: br i1 [[TMP9]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK2: .omp.reduction.then:
// CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP0]], align 4
// CHECK2-NEXT: [[TMP12:%.*]] = load i32, ptr [[A1]], align 4
// CHECK2-NEXT: [[OR:%.*]] = or i32 [[TMP11]], [[TMP12]]
// CHECK2-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP0]], align 4
// CHECK2-NEXT: [[TMP11:%.*]] = load i32, ptr [[A1]], align 4
// CHECK2-NEXT: [[OR:%.*]] = or i32 [[TMP10]], [[TMP11]]
// CHECK2-NEXT: store i32 [[OR]], ptr [[TMP0]], align 4
// CHECK2-NEXT: [[TMP13:%.*]] = load i16, ptr [[TMP1]], align 2
// CHECK2-NEXT: [[CONV:%.*]] = sext i16 [[TMP13]] to i32
// CHECK2-NEXT: [[TMP14:%.*]] = load i16, ptr [[B2]], align 2
// CHECK2-NEXT: [[CONV3:%.*]] = sext i16 [[TMP14]] to i32
// CHECK2-NEXT: [[TMP12:%.*]] = load i16, ptr [[TMP1]], align 2
// CHECK2-NEXT: [[CONV:%.*]] = sext i16 [[TMP12]] to i32
// CHECK2-NEXT: [[TMP13:%.*]] = load i16, ptr [[B2]], align 2
// CHECK2-NEXT: [[CONV3:%.*]] = sext i16 [[TMP13]] to i32
// CHECK2-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]]
// CHECK2-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
// CHECK2: cond.true:
// CHECK2-NEXT: [[TMP15:%.*]] = load i16, ptr [[TMP1]], align 2
// CHECK2-NEXT: [[TMP14:%.*]] = load i16, ptr [[TMP1]], align 2
// CHECK2-NEXT: br label [[COND_END:%.*]]
// CHECK2: cond.false:
// CHECK2-NEXT: [[TMP16:%.*]] = load i16, ptr [[B2]], align 2
// CHECK2-NEXT: [[TMP15:%.*]] = load i16, ptr [[B2]], align 2
// CHECK2-NEXT: br label [[COND_END]]
// CHECK2: cond.end:
// CHECK2-NEXT: [[COND:%.*]] = phi i16 [ [[TMP15]], [[COND_TRUE]] ], [ [[TMP16]], [[COND_FALSE]] ]
// CHECK2-NEXT: [[COND:%.*]] = phi i16 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ]
// CHECK2-NEXT: store i16 [[COND]], ptr [[TMP1]], align 2
// CHECK2-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
// CHECK2-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
@ -2354,13 +2354,13 @@ int bar(int n){
// CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTADDR1]], align 4
// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 0
// CHECK2-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4
// CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], ptr [[TMP4]], i32 0, i32 0
// CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP4]], i32 0, i32 0
// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i32 0, i32 [[TMP5]]
// CHECK2-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP7]], align 4
// CHECK2-NEXT: store i32 [[TMP9]], ptr [[TMP8]], align 128
// CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 1
// CHECK2-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 4
// CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], ptr [[TMP4]], i32 0, i32 1
// CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP4]], i32 0, i32 1
// CHECK2-NEXT: [[TMP12:%.*]] = getelementptr inbounds [1024 x i16], ptr [[B]], i32 0, i32 [[TMP5]]
// CHECK2-NEXT: [[TMP13:%.*]] = load i16, ptr [[TMP11]], align 2
// CHECK2-NEXT: store i16 [[TMP13]], ptr [[TMP12]], align 128
@ -2380,11 +2380,11 @@ int bar(int n){
// CHECK2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 4
// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTADDR1]], align 4
// CHECK2-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
// CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], ptr [[TMP3]], i32 0, i32 0
// CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP3]], i32 0, i32 0
// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i32 0, i32 [[TMP4]]
// CHECK2-NEXT: store ptr [[TMP6]], ptr [[TMP5]], align 4
// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
// CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], ptr [[TMP3]], i32 0, i32 1
// CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP3]], i32 0, i32 1
// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i16], ptr [[B]], i32 0, i32 [[TMP4]]
// CHECK2-NEXT: store ptr [[TMP8]], ptr [[TMP7]], align 4
// CHECK2-NEXT: [[TMP9:%.*]] = load ptr, ptr [[DOTADDR2]], align 4
@ -2406,13 +2406,13 @@ int bar(int n){
// CHECK2-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTADDR1]], align 4
// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 0
// CHECK2-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4
// CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], ptr [[TMP4]], i32 0, i32 0
// CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP4]], i32 0, i32 0
// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i32 0, i32 [[TMP5]]
// CHECK2-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 128
// CHECK2-NEXT: store i32 [[TMP9]], ptr [[TMP7]], align 4
// CHECK2-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 1
// CHECK2-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 4
// CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], ptr [[TMP4]], i32 0, i32 1
// CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP4]], i32 0, i32 1
// CHECK2-NEXT: [[TMP12:%.*]] = getelementptr inbounds [1024 x i16], ptr [[B]], i32 0, i32 [[TMP5]]
// CHECK2-NEXT: [[TMP13:%.*]] = load i16, ptr [[TMP12]], align 128
// CHECK2-NEXT: store i16 [[TMP13]], ptr [[TMP11]], align 2
@ -2432,11 +2432,11 @@ int bar(int n){
// CHECK2-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 4
// CHECK2-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTADDR1]], align 4
// CHECK2-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
// CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], ptr [[TMP3]], i32 0, i32 0
// CHECK2-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP3]], i32 0, i32 0
// CHECK2-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x i32], ptr [[A]], i32 0, i32 [[TMP4]]
// CHECK2-NEXT: store ptr [[TMP6]], ptr [[TMP5]], align 4
// CHECK2-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
// CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], ptr [[TMP3]], i32 0, i32 1
// CHECK2-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP3]], i32 0, i32 1
// CHECK2-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i16], ptr [[B]], i32 0, i32 [[TMP4]]
// CHECK2-NEXT: store ptr [[TMP8]], ptr [[TMP7]], align 4
// CHECK2-NEXT: [[TMP9:%.*]] = load ptr, ptr [[DOTADDR2]], align 4
@ -2491,14 +2491,14 @@ int bar(int n){
// CHECK3-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4
// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
// CHECK3-NEXT: store ptr [[E1]], ptr [[TMP4]], align 4
// CHECK3-NEXT: [[TMP5:%.*]] = load ptr, ptr @"_openmp_teams_reductions_buffer_$_$ptr", align 4
// CHECK3-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP3]], ptr [[TMP5]], i32 2048, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func, ptr @_omp_reduction_inter_warp_copy_func, ptr @_omp_reduction_list_to_global_copy_func, ptr @_omp_reduction_list_to_global_reduce_func, ptr @_omp_reduction_global_to_list_copy_func, ptr @_omp_reduction_global_to_list_reduce_func)
// CHECK3-NEXT: [[TMP7:%.*]] = icmp eq i32 [[TMP6]], 1
// CHECK3-NEXT: br i1 [[TMP7]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK3-NEXT: %"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer()
// CHECK3-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP3]], ptr %"_openmp_teams_reductions_buffer_$_$ptr", i32 2048, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func, ptr @_omp_reduction_inter_warp_copy_func, ptr @_omp_reduction_list_to_global_copy_func, ptr @_omp_reduction_list_to_global_reduce_func, ptr @_omp_reduction_global_to_list_copy_func, ptr @_omp_reduction_global_to_list_reduce_func)
// CHECK3-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 1
// CHECK3-NEXT: br i1 [[TMP6]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK3: .omp.reduction.then:
// CHECK3-NEXT: [[TMP8:%.*]] = load double, ptr [[TMP0]], align 8
// CHECK3-NEXT: [[TMP9:%.*]] = load double, ptr [[E1]], align 8
// CHECK3-NEXT: [[ADD2:%.*]] = fadd double [[TMP8]], [[TMP9]]
// CHECK3-NEXT: [[TMP7:%.*]] = load double, ptr [[TMP0]], align 8
// CHECK3-NEXT: [[TMP8:%.*]] = load double, ptr [[E1]], align 8
// CHECK3-NEXT: [[ADD2:%.*]] = fadd double [[TMP7]], [[TMP8]]
// CHECK3-NEXT: store double [[ADD2]], ptr [[TMP0]], align 8
// CHECK3-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP3]])
// CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
@ -2780,21 +2780,21 @@ int bar(int n){
// CHECK3-NEXT: store ptr [[C1]], ptr [[TMP6]], align 4
// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
// CHECK3-NEXT: store ptr [[D2]], ptr [[TMP7]], align 4
// CHECK3-NEXT: [[TMP8:%.*]] = load ptr, ptr @"_openmp_teams_reductions_buffer_$_$ptr", align 4
// CHECK3-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP5]], ptr [[TMP8]], i32 2048, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func1, ptr @_omp_reduction_inter_warp_copy_func2, ptr @_omp_reduction_list_to_global_copy_func3, ptr @_omp_reduction_list_to_global_reduce_func4, ptr @_omp_reduction_global_to_list_copy_func5, ptr @_omp_reduction_global_to_list_reduce_func6)
// CHECK3-NEXT: [[TMP10:%.*]] = icmp eq i32 [[TMP9]], 1
// CHECK3-NEXT: br i1 [[TMP10]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK3-NEXT: %"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer()
// CHECK3-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP5]], ptr %"_openmp_teams_reductions_buffer_$_$ptr", i32 2048, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func1, ptr @_omp_reduction_inter_warp_copy_func2, ptr @_omp_reduction_list_to_global_copy_func3, ptr @_omp_reduction_list_to_global_reduce_func4, ptr @_omp_reduction_global_to_list_copy_func5, ptr @_omp_reduction_global_to_list_reduce_func6)
// CHECK3-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 1
// CHECK3-NEXT: br i1 [[TMP9]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK3: .omp.reduction.then:
// CHECK3-NEXT: [[TMP11:%.*]] = load i8, ptr [[TMP0]], align 1
// CHECK3-NEXT: [[CONV4:%.*]] = sext i8 [[TMP11]] to i32
// CHECK3-NEXT: [[TMP12:%.*]] = load i8, ptr [[C1]], align 1
// CHECK3-NEXT: [[CONV5:%.*]] = sext i8 [[TMP12]] to i32
// CHECK3-NEXT: [[TMP10:%.*]] = load i8, ptr [[TMP0]], align 1
// CHECK3-NEXT: [[CONV4:%.*]] = sext i8 [[TMP10]] to i32
// CHECK3-NEXT: [[TMP11:%.*]] = load i8, ptr [[C1]], align 1
// CHECK3-NEXT: [[CONV5:%.*]] = sext i8 [[TMP11]] to i32
// CHECK3-NEXT: [[XOR6:%.*]] = xor i32 [[CONV4]], [[CONV5]]
// CHECK3-NEXT: [[CONV7:%.*]] = trunc i32 [[XOR6]] to i8
// CHECK3-NEXT: store i8 [[CONV7]], ptr [[TMP0]], align 1
// CHECK3-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP1]], align 4
// CHECK3-NEXT: [[TMP14:%.*]] = load float, ptr [[D2]], align 4
// CHECK3-NEXT: [[MUL8:%.*]] = fmul float [[TMP13]], [[TMP14]]
// CHECK3-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP1]], align 4
// CHECK3-NEXT: [[TMP13:%.*]] = load float, ptr [[D2]], align 4
// CHECK3-NEXT: [[MUL8:%.*]] = fmul float [[TMP12]], [[TMP13]]
// CHECK3-NEXT: store float [[MUL8]], ptr [[TMP1]], align 4
// CHECK3-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
// CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
@ -3121,29 +3121,29 @@ int bar(int n){
// CHECK3-NEXT: store ptr [[A1]], ptr [[TMP6]], align 4
// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
// CHECK3-NEXT: store ptr [[B2]], ptr [[TMP7]], align 4
// CHECK3-NEXT: [[TMP8:%.*]] = load ptr, ptr @"_openmp_teams_reductions_buffer_$_$ptr", align 4
// CHECK3-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP5]], ptr [[TMP8]], i32 2048, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func9, ptr @_omp_reduction_inter_warp_copy_func10, ptr @_omp_reduction_list_to_global_copy_func11, ptr @_omp_reduction_list_to_global_reduce_func12, ptr @_omp_reduction_global_to_list_copy_func13, ptr @_omp_reduction_global_to_list_reduce_func14)
// CHECK3-NEXT: [[TMP10:%.*]] = icmp eq i32 [[TMP9]], 1
// CHECK3-NEXT: br i1 [[TMP10]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK3-NEXT: %"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer()
// CHECK3-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr @[[GLOB1]], i32 [[TMP5]], ptr %"_openmp_teams_reductions_buffer_$_$ptr", i32 2048, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_omp_reduction_shuffle_and_reduce_func9, ptr @_omp_reduction_inter_warp_copy_func10, ptr @_omp_reduction_list_to_global_copy_func11, ptr @_omp_reduction_list_to_global_reduce_func12, ptr @_omp_reduction_global_to_list_copy_func13, ptr @_omp_reduction_global_to_list_reduce_func14)
// CHECK3-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 1
// CHECK3-NEXT: br i1 [[TMP9]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// CHECK3: .omp.reduction.then:
// CHECK3-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP0]], align 4
// CHECK3-NEXT: [[TMP12:%.*]] = load i32, ptr [[A1]], align 4
// CHECK3-NEXT: [[OR:%.*]] = or i32 [[TMP11]], [[TMP12]]
// CHECK3-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP0]], align 4
// CHECK3-NEXT: [[TMP11:%.*]] = load i32, ptr [[A1]], align 4
// CHECK3-NEXT: [[OR:%.*]] = or i32 [[TMP10]], [[TMP11]]
// CHECK3-NEXT: store i32 [[OR]], ptr [[TMP0]], align 4
// CHECK3-NEXT: [[TMP13:%.*]] = load i16, ptr [[TMP1]], align 2
// CHECK3-NEXT: [[CONV:%.*]] = sext i16 [[TMP13]] to i32
// CHECK3-NEXT: [[TMP14:%.*]] = load i16, ptr [[B2]], align 2
// CHECK3-NEXT: [[CONV3:%.*]] = sext i16 [[TMP14]] to i32
// CHECK3-NEXT: [[TMP12:%.*]] = load i16, ptr [[TMP1]], align 2
// CHECK3-NEXT: [[CONV:%.*]] = sext i16 [[TMP12]] to i32
// CHECK3-NEXT: [[TMP13:%.*]] = load i16, ptr [[B2]], align 2
// CHECK3-NEXT: [[CONV3:%.*]] = sext i16 [[TMP13]] to i32
// CHECK3-NEXT: [[CMP:%.*]] = icmp sgt i32 [[CONV]], [[CONV3]]
// CHECK3-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
// CHECK3: cond.true:
// CHECK3-NEXT: [[TMP15:%.*]] = load i16, ptr [[TMP1]], align 2
// CHECK3-NEXT: [[TMP14:%.*]] = load i16, ptr [[TMP1]], align 2
// CHECK3-NEXT: br label [[COND_END:%.*]]
// CHECK3: cond.false:
// CHECK3-NEXT: [[TMP16:%.*]] = load i16, ptr [[B2]], align 2
// CHECK3-NEXT: [[TMP15:%.*]] = load i16, ptr [[B2]], align 2
// CHECK3-NEXT: br label [[COND_END]]
// CHECK3: cond.end:
// CHECK3-NEXT: [[COND:%.*]] = phi i16 [ [[TMP15]], [[COND_TRUE]] ], [ [[TMP16]], [[COND_FALSE]] ]
// CHECK3-NEXT: [[COND:%.*]] = phi i16 [ [[TMP14]], [[COND_TRUE]] ], [ [[TMP15]], [[COND_FALSE]] ]
// CHECK3-NEXT: store i16 [[COND]], ptr [[TMP1]], align 2
// CHECK3-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP5]])
// CHECK3-NEXT: br label [[DOTOMP_REDUCTION_DONE]]
@ -3551,13 +3551,13 @@ int bar(int n){
// CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTADDR1]], align 4
// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 0
// CHECK3-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4
// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], ptr [[TMP4]], i32 0, i32 0
// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP4]], i32 0, i32 0
// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2048 x i32], ptr [[A]], i32 0, i32 [[TMP5]]
// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP7]], align 4
// CHECK3-NEXT: store i32 [[TMP9]], ptr [[TMP8]], align 128
// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 1
// CHECK3-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 4
// CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], ptr [[TMP4]], i32 0, i32 1
// CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP4]], i32 0, i32 1
// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2048 x i16], ptr [[B]], i32 0, i32 [[TMP5]]
// CHECK3-NEXT: [[TMP13:%.*]] = load i16, ptr [[TMP11]], align 2
// CHECK3-NEXT: store i16 [[TMP13]], ptr [[TMP12]], align 128
@ -3577,11 +3577,11 @@ int bar(int n){
// CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 4
// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTADDR1]], align 4
// CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], ptr [[TMP3]], i32 0, i32 0
// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP3]], i32 0, i32 0
// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2048 x i32], ptr [[A]], i32 0, i32 [[TMP4]]
// CHECK3-NEXT: store ptr [[TMP6]], ptr [[TMP5]], align 4
// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
// CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], ptr [[TMP3]], i32 0, i32 1
// CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP3]], i32 0, i32 1
// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2048 x i16], ptr [[B]], i32 0, i32 [[TMP4]]
// CHECK3-NEXT: store ptr [[TMP8]], ptr [[TMP7]], align 4
// CHECK3-NEXT: [[TMP9:%.*]] = load ptr, ptr [[DOTADDR2]], align 4
@ -3603,13 +3603,13 @@ int bar(int n){
// CHECK3-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTADDR1]], align 4
// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 0
// CHECK3-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP6]], align 4
// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], ptr [[TMP4]], i32 0, i32 0
// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP4]], i32 0, i32 0
// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2048 x i32], ptr [[A]], i32 0, i32 [[TMP5]]
// CHECK3-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 128
// CHECK3-NEXT: store i32 [[TMP9]], ptr [[TMP7]], align 4
// CHECK3-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x ptr], ptr [[TMP3]], i32 0, i32 1
// CHECK3-NEXT: [[TMP11:%.*]] = load ptr, ptr [[TMP10]], align 4
// CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], ptr [[TMP4]], i32 0, i32 1
// CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP4]], i32 0, i32 1
// CHECK3-NEXT: [[TMP12:%.*]] = getelementptr inbounds [2048 x i16], ptr [[B]], i32 0, i32 [[TMP5]]
// CHECK3-NEXT: [[TMP13:%.*]] = load i16, ptr [[TMP12]], align 128
// CHECK3-NEXT: store i16 [[TMP13]], ptr [[TMP11]], align 2
@ -3629,11 +3629,11 @@ int bar(int n){
// CHECK3-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 4
// CHECK3-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTADDR1]], align 4
// CHECK3-NEXT: [[TMP5:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 0
// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1:%.*]], ptr [[TMP3]], i32 0, i32 0
// CHECK3-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2:%.*]], ptr [[TMP3]], i32 0, i32 0
// CHECK3-NEXT: [[TMP6:%.*]] = getelementptr inbounds [2048 x i32], ptr [[A]], i32 0, i32 [[TMP4]]
// CHECK3-NEXT: store ptr [[TMP6]], ptr [[TMP5]], align 4
// CHECK3-NEXT: [[TMP7:%.*]] = getelementptr inbounds [2 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST]], i32 0, i32 1
// CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_1]], ptr [[TMP3]], i32 0, i32 1
// CHECK3-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_2]], ptr [[TMP3]], i32 0, i32 1
// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2048 x i16], ptr [[B]], i32 0, i32 [[TMP4]]
// CHECK3-NEXT: store ptr [[TMP8]], ptr [[TMP7]], align 4
// CHECK3-NEXT: [[TMP9:%.*]] = load ptr, ptr [[DOTADDR2]], align 4

View File

@ -1328,24 +1328,24 @@ int foo() {
// IR-GPU-NEXT: [[TMP38:%.*]] = load i32, ptr [[TMP37]], align 4
// IR-GPU-NEXT: [[TMP39:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], i64 0, i64 0
// IR-GPU-NEXT: store ptr [[SUM1_ASCAST]], ptr [[TMP39]], align 8
// IR-GPU-NEXT: [[TMP40:%.*]] = load ptr, ptr addrspace(1) @"_openmp_teams_reductions_buffer_$_$ptr", align 8
// IR-GPU-NEXT: [[TMP41:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 [[TMP38]], ptr [[TMP40]], i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], ptr @_omp_reduction_shuffle_and_reduce_func.1, ptr @_omp_reduction_inter_warp_copy_func.2, ptr @_omp_reduction_list_to_global_copy_func, ptr @_omp_reduction_list_to_global_reduce_func, ptr @_omp_reduction_global_to_list_copy_func, ptr @_omp_reduction_global_to_list_reduce_func)
// IR-GPU-NEXT: [[TMP42:%.*]] = icmp eq i32 [[TMP41]], 1
// IR-GPU-NEXT: br i1 [[TMP42]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// IR-GPU-NEXT: %"_openmp_teams_reductions_buffer_$_$ptr" = call ptr @__kmpc_reduction_get_fixed_buffer()
// IR-GPU-NEXT: [[TMP40:%.*]] = call i32 @__kmpc_nvptx_teams_reduce_nowait_v2(ptr addrspacecast (ptr addrspace(1) @[[GLOB1]] to ptr), i32 [[TMP38]], ptr %"_openmp_teams_reductions_buffer_$_$ptr", i32 1024, ptr [[DOTOMP_REDUCTION_RED_LIST_ASCAST]], ptr @_omp_reduction_shuffle_and_reduce_func.1, ptr @_omp_reduction_inter_warp_copy_func.2, ptr @_omp_reduction_list_to_global_copy_func, ptr @_omp_reduction_list_to_global_reduce_func, ptr @_omp_reduction_global_to_list_copy_func, ptr @_omp_reduction_global_to_list_reduce_func)
// IR-GPU-NEXT: [[TMP41:%.*]] = icmp eq i32 [[TMP40]], 1
// IR-GPU-NEXT: br i1 [[TMP41]], label [[DOTOMP_REDUCTION_THEN:%.*]], label [[DOTOMP_REDUCTION_DONE:%.*]]
// IR-GPU: .omp.reduction.then:
// IR-GPU-NEXT: [[TMP43:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
// IR-GPU-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP0]], [[TMP43]]
// IR-GPU-NEXT: [[TMP42:%.*]] = getelementptr i32, ptr [[TMP0]], i64 100
// IR-GPU-NEXT: [[OMP_ARRAYCPY_ISEMPTY:%.*]] = icmp eq ptr [[TMP0]], [[TMP42]]
// IR-GPU-NEXT: br i1 [[OMP_ARRAYCPY_ISEMPTY]], label [[OMP_ARRAYCPY_DONE17:%.*]], label [[OMP_ARRAYCPY_BODY:%.*]]
// IR-GPU: omp.arraycpy.body:
// IR-GPU-NEXT: [[OMP_ARRAYCPY_SRCELEMENTPAST:%.*]] = phi ptr [ [[SUM1_ASCAST]], [[DOTOMP_REDUCTION_THEN]] ], [ [[OMP_ARRAYCPY_SRC_ELEMENT:%.*]], [[OMP_ARRAYCPY_BODY]] ]
// IR-GPU-NEXT: [[OMP_ARRAYCPY_DESTELEMENTPAST13:%.*]] = phi ptr [ [[TMP0]], [[DOTOMP_REDUCTION_THEN]] ], [ [[OMP_ARRAYCPY_DEST_ELEMENT15:%.*]], [[OMP_ARRAYCPY_BODY]] ]
// IR-GPU-NEXT: [[TMP44:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST13]], align 4
// IR-GPU-NEXT: [[TMP45:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
// IR-GPU-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP44]], [[TMP45]]
// IR-GPU-NEXT: [[TMP43:%.*]] = load i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST13]], align 4
// IR-GPU-NEXT: [[TMP44:%.*]] = load i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], align 4
// IR-GPU-NEXT: [[ADD14:%.*]] = add nsw i32 [[TMP43]], [[TMP44]]
// IR-GPU-NEXT: store i32 [[ADD14]], ptr [[OMP_ARRAYCPY_DESTELEMENTPAST13]], align 4
// IR-GPU-NEXT: [[OMP_ARRAYCPY_DEST_ELEMENT15]] = getelementptr i32, ptr [[OMP_ARRAYCPY_DESTELEMENTPAST13]], i32 1
// IR-GPU-NEXT: [[OMP_ARRAYCPY_SRC_ELEMENT]] = getelementptr i32, ptr [[OMP_ARRAYCPY_SRCELEMENTPAST]], i32 1
// IR-GPU-NEXT: [[OMP_ARRAYCPY_DONE16:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT15]], [[TMP43]]
// IR-GPU-NEXT: [[OMP_ARRAYCPY_DONE16:%.*]] = icmp eq ptr [[OMP_ARRAYCPY_DEST_ELEMENT15]], [[TMP42]]
// IR-GPU-NEXT: br i1 [[OMP_ARRAYCPY_DONE16]], label [[OMP_ARRAYCPY_DONE17]], label [[OMP_ARRAYCPY_BODY]]
// IR-GPU: omp.arraycpy.done17:
// IR-GPU-NEXT: call void @__kmpc_nvptx_end_reduce_nowait(i32 [[TMP38]])

View File

@ -234,6 +234,8 @@ void __kmpc_nvptx_end_reduce(int32_t TId);
void __kmpc_nvptx_end_reduce_nowait(int32_t TId);
void *__kmpc_reduction_get_fixed_buffer();
int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
IdentTy *Loc, int32_t TId, int32_t num_vars, uint64_t reduce_size,
void *reduce_data, ShuffleReductFnTy shflFct, InterWarpCopyFnTy cpyFct);

View File

@ -167,9 +167,6 @@ uint32_t roundToWarpsize(uint32_t s) {
uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? x : y; }
static uint32_t IterCnt = 0;
static uint32_t Cnt = 0;
} // namespace
extern "C" {
@ -194,6 +191,9 @@ int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
ThreadId = 0;
}
uint32_t &IterCnt = state::getKernelLaunchEnvironment().ReductionIterCnt;
uint32_t &Cnt = state::getKernelLaunchEnvironment().ReductionCnt;
// In non-generic mode all workers participate in the teams reduction.
// In generic mode only the team master participates in the teams
// reduction because the workers are waiting for parallel work.
@ -313,4 +313,8 @@ void __kmpc_nvptx_end_reduce(int32_t TId) {}
void __kmpc_nvptx_end_reduce_nowait(int32_t TId) {}
}
void *__kmpc_reduction_get_fixed_buffer() {
return state::getKernelLaunchEnvironment().ReductionBuffer;
}
#pragma omp end declare target

View File

@ -77,15 +77,16 @@ struct DynamicEnvironmentTy {
// NOTE: Please don't change the order of those members as their indices are
// used in the middle end. Always add the new data member at the end.
struct ConfigurationEnvironmentTy {
uint8_t UseGenericStateMachine;
uint8_t MayUseNestedParallelism;
llvm::omp::OMPTgtExecModeFlags ExecMode;
uint8_t UseGenericStateMachine = 2;
uint8_t MayUseNestedParallelism = 2;
llvm::omp::OMPTgtExecModeFlags ExecMode = llvm::omp::OMP_TGT_EXEC_MODE_SPMD;
// Information about (legal) launch configurations.
//{
int32_t MinThreads;
int32_t MaxThreads;
int32_t MinTeams;
int32_t MaxTeams;
int32_t MinThreads = -1;
int32_t MaxThreads = -1;
int32_t MinTeams = -1;
int32_t MaxTeams = -1;
int32_t ReductionBufferSize = 0;
//}
};
@ -93,10 +94,14 @@ struct ConfigurationEnvironmentTy {
// used in the middle end. Always add the new data member at the end.
struct KernelEnvironmentTy {
ConfigurationEnvironmentTy Configuration;
IdentTy *Ident;
DynamicEnvironmentTy *DynamicEnv;
IdentTy *Ident = nullptr;
DynamicEnvironmentTy *DynamicEnv = nullptr;
};
struct KernelLaunchEnvironmentTy {};
struct KernelLaunchEnvironmentTy {
uint32_t ReductionCnt = 0;
uint32_t ReductionIterCnt = 0;
void *ReductionBuffer = nullptr;
};
#endif // _OMPTARGET_ENVIRONMENT_H_

View File

@ -402,9 +402,8 @@ Error GenericKernelTy::init(GenericDeviceTy &GenericDevice,
DP("Failed to read kernel environment for '%s': %s\n"
"Using default SPMD (2) execution mode\n",
Name, ErrStr.data());
KernelEnvironment.Configuration.ExecMode = OMP_TGT_EXEC_MODE_SPMD;
KernelEnvironment.Configuration.MayUseNestedParallelism = /*Unknown=*/2;
KernelEnvironment.Configuration.UseGenericStateMachine = /*Unknown=*/2;
assert(KernelEnvironment.Configuration.ReductionBufferSize == 0 &&
"Default initialization failed.");
}
// Max = Config.Max > 0 ? min(Config.Max, Device.Max) : Device.Max;
@ -441,6 +440,17 @@ GenericKernelTy::getKernelLaunchEnvironment(
/// async data transfer.
auto &LocalKLE = (*AsyncInfoWrapper).KernelLaunchEnvironment;
LocalKLE = KernelLaunchEnvironment;
if (KernelEnvironment.Configuration.ReductionBufferSize) {
auto AllocOrErr = GenericDevice.dataAlloc(
KernelEnvironment.Configuration.ReductionBufferSize,
/*HostPtr=*/nullptr, TargetAllocTy::TARGET_ALLOC_DEVICE);
if (!AllocOrErr)
return AllocOrErr.takeError();
LocalKLE.ReductionBuffer = *AllocOrErr;
// Remember to free the memory later.
AsyncInfoWrapper.freeAllocationAfterSynchronization(*AllocOrErr);
}
auto Err = GenericDevice.dataSubmit(*AllocOrErr, &LocalKLE,
sizeof(KernelLaunchEnvironmentTy),
AsyncInfoWrapper);

View File

@ -0,0 +1,36 @@
// RUN: %libomptarget-compilexx-run-and-check-generic
// RUN: %libomptarget-compileoptxx-run-and-check-generic
// FIXME: This is a bug in host offload, this should run fine.
// UNSUPPORTED: aarch64-unknown-linux-gnu
// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
// UNSUPPORTED: x86_64-pc-linux-gnu
// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
#include <iostream>
#include <vector>
#define N 8
int main() {
std::vector<int> avec(N);
int *a = avec.data();
#pragma omp parallel for
for (int i = 0; i < N; i++) {
a[i] = 0;
#pragma omp target teams distribute parallel for reduction(+ : a[i])
for (int j = 0; j < N; j++)
a[i] += 1;
}
// CHECK: 8
// CHECK: 8
// CHECK: 8
// CHECK: 8
// CHECK: 8
// CHECK: 8
// CHECK: 8
// CHECK: 8
for (int i = 0; i < N; i++)
std::cout << a[i] << std::endl;
}