clang  19.0.0git
CGOpenMPRuntimeGPU.cpp
Go to the documentation of this file.
1 //===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This provides a generalized class for OpenMP runtime code generation
10 // specialized by GPU targets NVPTX and AMDGCN.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "CGOpenMPRuntimeGPU.h"
15 #include "CodeGenFunction.h"
16 #include "clang/AST/Attr.h"
17 #include "clang/AST/DeclOpenMP.h"
18 #include "clang/AST/OpenMPClause.h"
19 #include "clang/AST/StmtOpenMP.h"
20 #include "clang/AST/StmtVisitor.h"
21 #include "clang/Basic/Cuda.h"
22 #include "llvm/ADT/SmallPtrSet.h"
23 #include "llvm/Frontend/OpenMP/OMPGridValues.h"
24 #include "llvm/Support/MathExtras.h"
25 
26 using namespace clang;
27 using namespace CodeGen;
28 using namespace llvm::omp;
29 
30 namespace {
31 /// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
32 class NVPTXActionTy final : public PrePostActionTy {
33  llvm::FunctionCallee EnterCallee = nullptr;
34  ArrayRef<llvm::Value *> EnterArgs;
35  llvm::FunctionCallee ExitCallee = nullptr;
36  ArrayRef<llvm::Value *> ExitArgs;
37  bool Conditional = false;
38  llvm::BasicBlock *ContBlock = nullptr;
39 
40 public:
41  NVPTXActionTy(llvm::FunctionCallee EnterCallee,
42  ArrayRef<llvm::Value *> EnterArgs,
43  llvm::FunctionCallee ExitCallee,
44  ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false)
45  : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
46  ExitArgs(ExitArgs), Conditional(Conditional) {}
47  void Enter(CodeGenFunction &CGF) override {
48  llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
49  if (Conditional) {
50  llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
51  auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
52  ContBlock = CGF.createBasicBlock("omp_if.end");
53  // Generate the branch (If-stmt)
54  CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
55  CGF.EmitBlock(ThenBlock);
56  }
57  }
58  void Done(CodeGenFunction &CGF) {
59  // Emit the rest of blocks/branches
60  CGF.EmitBranch(ContBlock);
61  CGF.EmitBlock(ContBlock, true);
62  }
63  void Exit(CodeGenFunction &CGF) override {
64  CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
65  }
66 };
67 
68 /// A class to track the execution mode when codegening directives within
69 /// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
70 /// to the target region and used by containing directives such as 'parallel'
71 /// to emit optimized code.
72 class ExecutionRuntimeModesRAII {
73 private:
74  CGOpenMPRuntimeGPU::ExecutionMode SavedExecMode =
77 
78 public:
79  ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode,
81  : ExecMode(ExecMode) {
82  SavedExecMode = ExecMode;
83  ExecMode = EntryMode;
84  }
85  ~ExecutionRuntimeModesRAII() { ExecMode = SavedExecMode; }
86 };
87 
88 static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
89  RefExpr = RefExpr->IgnoreParens();
90  if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
91  const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
92  while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
93  Base = TempASE->getBase()->IgnoreParenImpCasts();
94  RefExpr = Base;
95  } else if (auto *OASE = dyn_cast<ArraySectionExpr>(RefExpr)) {
96  const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
97  while (const auto *TempOASE = dyn_cast<ArraySectionExpr>(Base))
98  Base = TempOASE->getBase()->IgnoreParenImpCasts();
99  while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
100  Base = TempASE->getBase()->IgnoreParenImpCasts();
101  RefExpr = Base;
102  }
103  RefExpr = RefExpr->IgnoreParenImpCasts();
104  if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))
105  return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
106  const auto *ME = cast<MemberExpr>(RefExpr);
107  return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
108 }
109 
110 static RecordDecl *buildRecordForGlobalizedVars(
111  ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
112  ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
113  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
114  &MappedDeclsFields,
115  int BufSize) {
116  using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>;
117  if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
118  return nullptr;
119  SmallVector<VarsDataTy, 4> GlobalizedVars;
120  for (const ValueDecl *D : EscapedDecls)
121  GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
122  for (const ValueDecl *D : EscapedDeclsForTeams)
123  GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
124 
125  // Build struct _globalized_locals_ty {
126  // /* globalized vars */[WarSize] align (decl_align)
127  // /* globalized vars */ for EscapedDeclsForTeams
128  // };
129  RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
130  GlobalizedRD->startDefinition();
132  EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
133  for (const auto &Pair : GlobalizedVars) {
134  const ValueDecl *VD = Pair.second;
135  QualType Type = VD->getType();
137  Type = C.getPointerType(Type.getNonReferenceType());
138  else
139  Type = Type.getNonReferenceType();
141  FieldDecl *Field;
142  if (SingleEscaped.count(VD)) {
144  C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
145  C.getTrivialTypeSourceInfo(Type, SourceLocation()),
146  /*BW=*/nullptr, /*Mutable=*/false,
147  /*InitStyle=*/ICIS_NoInit);
148  Field->setAccess(AS_public);
149  if (VD->hasAttrs()) {
150  for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
151  E(VD->getAttrs().end());
152  I != E; ++I)
153  Field->addAttr(*I);
154  }
155  } else {
156  if (BufSize > 1) {
157  llvm::APInt ArraySize(32, BufSize);
158  Type = C.getConstantArrayType(Type, ArraySize, nullptr,
160  }
162  C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
163  C.getTrivialTypeSourceInfo(Type, SourceLocation()),
164  /*BW=*/nullptr, /*Mutable=*/false,
165  /*InitStyle=*/ICIS_NoInit);
166  Field->setAccess(AS_public);
167  llvm::APInt Align(32, Pair.first.getQuantity());
168  Field->addAttr(AlignedAttr::CreateImplicit(
169  C, /*IsAlignmentExpr=*/true,
170  IntegerLiteral::Create(C, Align,
171  C.getIntTypeForBitwidth(32, /*Signed=*/0),
172  SourceLocation()),
173  {}, AlignedAttr::GNU_aligned));
174  }
175  GlobalizedRD->addDecl(Field);
176  MappedDeclsFields.try_emplace(VD, Field);
177  }
178  GlobalizedRD->completeDefinition();
179  return GlobalizedRD;
180 }
181 
182 /// Get the list of variables that can escape their declaration context.
183 class CheckVarsEscapingDeclContext final
184  : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
185  CodeGenFunction &CGF;
186  llvm::SetVector<const ValueDecl *> EscapedDecls;
187  llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
188  llvm::SetVector<const ValueDecl *> DelayedVariableLengthDecls;
189  llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
190  RecordDecl *GlobalizedRD = nullptr;
191  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
192  bool AllEscaped = false;
193  bool IsForCombinedParallelRegion = false;
194 
195  void markAsEscaped(const ValueDecl *VD) {
196  // Do not globalize declare target variables.
197  if (!isa<VarDecl>(VD) ||
198  OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
199  return;
200  VD = cast<ValueDecl>(VD->getCanonicalDecl());
201  // Use user-specified allocation.
202  if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>())
203  return;
204  // Variables captured by value must be globalized.
205  bool IsCaptured = false;
206  if (auto *CSI = CGF.CapturedStmtInfo) {
207  if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
208  // Check if need to capture the variable that was already captured by
209  // value in the outer region.
210  IsCaptured = true;
211  if (!IsForCombinedParallelRegion) {
212  if (!FD->hasAttrs())
213  return;
214  const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
215  if (!Attr)
216  return;
217  if (((Attr->getCaptureKind() != OMPC_map) &&
218  !isOpenMPPrivate(Attr->getCaptureKind())) ||
219  ((Attr->getCaptureKind() == OMPC_map) &&
220  !FD->getType()->isAnyPointerType()))
221  return;
222  }
223  if (!FD->getType()->isReferenceType()) {
224  assert(!VD->getType()->isVariablyModifiedType() &&
225  "Parameter captured by value with variably modified type");
226  EscapedParameters.insert(VD);
227  } else if (!IsForCombinedParallelRegion) {
228  return;
229  }
230  }
231  }
232  if ((!CGF.CapturedStmtInfo ||
233  (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
234  VD->getType()->isReferenceType())
235  // Do not globalize variables with reference type.
236  return;
237  if (VD->getType()->isVariablyModifiedType()) {
238  // If not captured at the target region level then mark the escaped
239  // variable as delayed.
240  if (IsCaptured)
241  EscapedVariableLengthDecls.insert(VD);
242  else
243  DelayedVariableLengthDecls.insert(VD);
244  } else
245  EscapedDecls.insert(VD);
246  }
247 
248  void VisitValueDecl(const ValueDecl *VD) {
249  if (VD->getType()->isLValueReferenceType())
250  markAsEscaped(VD);
251  if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
252  if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
253  const bool SavedAllEscaped = AllEscaped;
254  AllEscaped = VD->getType()->isLValueReferenceType();
255  Visit(VarD->getInit());
256  AllEscaped = SavedAllEscaped;
257  }
258  }
259  }
260  void VisitOpenMPCapturedStmt(const CapturedStmt *S,
261  ArrayRef<OMPClause *> Clauses,
262  bool IsCombinedParallelRegion) {
263  if (!S)
264  return;
265  for (const CapturedStmt::Capture &C : S->captures()) {
266  if (C.capturesVariable() && !C.capturesVariableByCopy()) {
267  const ValueDecl *VD = C.getCapturedVar();
268  bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
269  if (IsCombinedParallelRegion) {
270  // Check if the variable is privatized in the combined construct and
271  // those private copies must be shared in the inner parallel
272  // directive.
273  IsForCombinedParallelRegion = false;
274  for (const OMPClause *C : Clauses) {
275  if (!isOpenMPPrivate(C->getClauseKind()) ||
276  C->getClauseKind() == OMPC_reduction ||
277  C->getClauseKind() == OMPC_linear ||
278  C->getClauseKind() == OMPC_private)
279  continue;
281  if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C))
282  Vars = PC->getVarRefs();
283  else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C))
284  Vars = PC->getVarRefs();
285  else
286  llvm_unreachable("Unexpected clause.");
287  for (const auto *E : Vars) {
288  const Decl *D =
289  cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();
290  if (D == VD->getCanonicalDecl()) {
291  IsForCombinedParallelRegion = true;
292  break;
293  }
294  }
295  if (IsForCombinedParallelRegion)
296  break;
297  }
298  }
299  markAsEscaped(VD);
300  if (isa<OMPCapturedExprDecl>(VD))
301  VisitValueDecl(VD);
302  IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
303  }
304  }
305  }
306 
307  void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
308  assert(!GlobalizedRD &&
309  "Record for globalized variables is built already.");
310  ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
311  unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
312  if (IsInTTDRegion)
313  EscapedDeclsForTeams = EscapedDecls.getArrayRef();
314  else
315  EscapedDeclsForParallel = EscapedDecls.getArrayRef();
316  GlobalizedRD = ::buildRecordForGlobalizedVars(
317  CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
318  MappedDeclsFields, WarpSize);
319  }
320 
321 public:
322  CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
323  ArrayRef<const ValueDecl *> TeamsReductions)
324  : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
325  }
326  virtual ~CheckVarsEscapingDeclContext() = default;
327  void VisitDeclStmt(const DeclStmt *S) {
328  if (!S)
329  return;
330  for (const Decl *D : S->decls())
331  if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
332  VisitValueDecl(VD);
333  }
334  void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
335  if (!D)
336  return;
337  if (!D->hasAssociatedStmt())
338  return;
339  if (const auto *S =
340  dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
341  // Do not analyze directives that do not actually require capturing,
342  // like `omp for` or `omp simd` directives.
344  getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
345  if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
346  VisitStmt(S->getCapturedStmt());
347  return;
348  }
349  VisitOpenMPCapturedStmt(
350  S, D->clauses(),
351  CaptureRegions.back() == OMPD_parallel &&
353  }
354  }
355  void VisitCapturedStmt(const CapturedStmt *S) {
356  if (!S)
357  return;
358  for (const CapturedStmt::Capture &C : S->captures()) {
359  if (C.capturesVariable() && !C.capturesVariableByCopy()) {
360  const ValueDecl *VD = C.getCapturedVar();
361  markAsEscaped(VD);
362  if (isa<OMPCapturedExprDecl>(VD))
363  VisitValueDecl(VD);
364  }
365  }
366  }
367  void VisitLambdaExpr(const LambdaExpr *E) {
368  if (!E)
369  return;
370  for (const LambdaCapture &C : E->captures()) {
371  if (C.capturesVariable()) {
372  if (C.getCaptureKind() == LCK_ByRef) {
373  const ValueDecl *VD = C.getCapturedVar();
374  markAsEscaped(VD);
375  if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
376  VisitValueDecl(VD);
377  }
378  }
379  }
380  }
381  void VisitBlockExpr(const BlockExpr *E) {
382  if (!E)
383  return;
384  for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
385  if (C.isByRef()) {
386  const VarDecl *VD = C.getVariable();
387  markAsEscaped(VD);
388  if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
389  VisitValueDecl(VD);
390  }
391  }
392  }
393  void VisitCallExpr(const CallExpr *E) {
394  if (!E)
395  return;
396  for (const Expr *Arg : E->arguments()) {
397  if (!Arg)
398  continue;
399  if (Arg->isLValue()) {
400  const bool SavedAllEscaped = AllEscaped;
401  AllEscaped = true;
402  Visit(Arg);
403  AllEscaped = SavedAllEscaped;
404  } else {
405  Visit(Arg);
406  }
407  }
408  Visit(E->getCallee());
409  }
410  void VisitDeclRefExpr(const DeclRefExpr *E) {
411  if (!E)
412  return;
413  const ValueDecl *VD = E->getDecl();
414  if (AllEscaped)
415  markAsEscaped(VD);
416  if (isa<OMPCapturedExprDecl>(VD))
417  VisitValueDecl(VD);
418  else if (VD->isInitCapture())
419  VisitValueDecl(VD);
420  }
421  void VisitUnaryOperator(const UnaryOperator *E) {
422  if (!E)
423  return;
424  if (E->getOpcode() == UO_AddrOf) {
425  const bool SavedAllEscaped = AllEscaped;
426  AllEscaped = true;
427  Visit(E->getSubExpr());
428  AllEscaped = SavedAllEscaped;
429  } else {
430  Visit(E->getSubExpr());
431  }
432  }
433  void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
434  if (!E)
435  return;
436  if (E->getCastKind() == CK_ArrayToPointerDecay) {
437  const bool SavedAllEscaped = AllEscaped;
438  AllEscaped = true;
439  Visit(E->getSubExpr());
440  AllEscaped = SavedAllEscaped;
441  } else {
442  Visit(E->getSubExpr());
443  }
444  }
445  void VisitExpr(const Expr *E) {
446  if (!E)
447  return;
448  bool SavedAllEscaped = AllEscaped;
449  if (!E->isLValue())
450  AllEscaped = false;
451  for (const Stmt *Child : E->children())
452  if (Child)
453  Visit(Child);
454  AllEscaped = SavedAllEscaped;
455  }
456  void VisitStmt(const Stmt *S) {
457  if (!S)
458  return;
459  for (const Stmt *Child : S->children())
460  if (Child)
461  Visit(Child);
462  }
463 
464  /// Returns the record that handles all the escaped local variables and used
465  /// instead of their original storage.
466  const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
467  if (!GlobalizedRD)
468  buildRecordForGlobalizedVars(IsInTTDRegion);
469  return GlobalizedRD;
470  }
471 
472  /// Returns the field in the globalized record for the escaped variable.
473  const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
474  assert(GlobalizedRD &&
475  "Record for globalized variables must be generated already.");
476  return MappedDeclsFields.lookup(VD);
477  }
478 
479  /// Returns the list of the escaped local variables/parameters.
480  ArrayRef<const ValueDecl *> getEscapedDecls() const {
481  return EscapedDecls.getArrayRef();
482  }
483 
484  /// Checks if the escaped local variable is actually a parameter passed by
485  /// value.
486  const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
487  return EscapedParameters;
488  }
489 
490  /// Returns the list of the escaped variables with the variably modified
491  /// types.
492  ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
493  return EscapedVariableLengthDecls.getArrayRef();
494  }
495 
496  /// Returns the list of the delayed variables with the variably modified
497  /// types.
498  ArrayRef<const ValueDecl *> getDelayedVariableLengthDecls() const {
499  return DelayedVariableLengthDecls.getArrayRef();
500  }
501 };
502 } // anonymous namespace
503 
504 /// Get the id of the warp in the block.
505 /// We assume that the warp size is 32, which is always the case
506 /// on the NVPTX device, to generate more efficient code.
507 static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
508  CGBuilderTy &Bld = CGF.Builder;
509  unsigned LaneIDBits =
510  llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
511  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
512  return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id");
513 }
514 
515 /// Get the id of the current lane in the Warp.
516 /// We assume that the warp size is 32, which is always the case
517 /// on the NVPTX device, to generate more efficient code.
518 static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
519  CGBuilderTy &Bld = CGF.Builder;
520  unsigned LaneIDBits =
521  llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
522  assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
523  unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
524  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
525  return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask),
526  "nvptx_lane_id");
527 }
528 
530 CGOpenMPRuntimeGPU::getExecutionMode() const {
531  return CurrentExecutionMode;
532 }
533 
535 CGOpenMPRuntimeGPU::getDataSharingMode() const {
536  return CurrentDataSharingMode;
537 }
538 
539 /// Check for inner (nested) SPMD construct, if any
541  const OMPExecutableDirective &D) {
542  const auto *CS = D.getInnermostCapturedStmt();
543  const auto *Body =
544  CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
545  const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
546 
547  if (const auto *NestedDir =
548  dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
549  OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
550  switch (D.getDirectiveKind()) {
551  case OMPD_target:
552  if (isOpenMPParallelDirective(DKind))
553  return true;
554  if (DKind == OMPD_teams) {
555  Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
556  /*IgnoreCaptured=*/true);
557  if (!Body)
558  return false;
559  ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
560  if (const auto *NND =
561  dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
562  DKind = NND->getDirectiveKind();
563  if (isOpenMPParallelDirective(DKind))
564  return true;
565  }
566  }
567  return false;
568  case OMPD_target_teams:
569  return isOpenMPParallelDirective(DKind);
570  case OMPD_target_simd:
571  case OMPD_target_parallel:
572  case OMPD_target_parallel_for:
573  case OMPD_target_parallel_for_simd:
574  case OMPD_target_teams_distribute:
575  case OMPD_target_teams_distribute_simd:
576  case OMPD_target_teams_distribute_parallel_for:
577  case OMPD_target_teams_distribute_parallel_for_simd:
578  case OMPD_parallel:
579  case OMPD_for:
580  case OMPD_parallel_for:
581  case OMPD_parallel_master:
582  case OMPD_parallel_sections:
583  case OMPD_for_simd:
584  case OMPD_parallel_for_simd:
585  case OMPD_cancel:
586  case OMPD_cancellation_point:
587  case OMPD_ordered:
588  case OMPD_threadprivate:
589  case OMPD_allocate:
590  case OMPD_task:
591  case OMPD_simd:
592  case OMPD_sections:
593  case OMPD_section:
594  case OMPD_single:
595  case OMPD_master:
596  case OMPD_critical:
597  case OMPD_taskyield:
598  case OMPD_barrier:
599  case OMPD_taskwait:
600  case OMPD_taskgroup:
601  case OMPD_atomic:
602  case OMPD_flush:
603  case OMPD_depobj:
604  case OMPD_scan:
605  case OMPD_teams:
606  case OMPD_target_data:
607  case OMPD_target_exit_data:
608  case OMPD_target_enter_data:
609  case OMPD_distribute:
610  case OMPD_distribute_simd:
611  case OMPD_distribute_parallel_for:
612  case OMPD_distribute_parallel_for_simd:
613  case OMPD_teams_distribute:
614  case OMPD_teams_distribute_simd:
615  case OMPD_teams_distribute_parallel_for:
616  case OMPD_teams_distribute_parallel_for_simd:
617  case OMPD_target_update:
618  case OMPD_declare_simd:
619  case OMPD_declare_variant:
620  case OMPD_begin_declare_variant:
621  case OMPD_end_declare_variant:
622  case OMPD_declare_target:
623  case OMPD_end_declare_target:
624  case OMPD_declare_reduction:
625  case OMPD_declare_mapper:
626  case OMPD_taskloop:
627  case OMPD_taskloop_simd:
628  case OMPD_master_taskloop:
629  case OMPD_master_taskloop_simd:
630  case OMPD_parallel_master_taskloop:
631  case OMPD_parallel_master_taskloop_simd:
632  case OMPD_requires:
633  case OMPD_unknown:
634  default:
635  llvm_unreachable("Unexpected directive.");
636  }
637  }
638 
639  return false;
640 }
641 
643  const OMPExecutableDirective &D) {
645  switch (DirectiveKind) {
646  case OMPD_target:
647  case OMPD_target_teams:
648  return hasNestedSPMDDirective(Ctx, D);
649  case OMPD_target_parallel_loop:
650  case OMPD_target_parallel:
651  case OMPD_target_parallel_for:
652  case OMPD_target_parallel_for_simd:
653  case OMPD_target_teams_distribute_parallel_for:
654  case OMPD_target_teams_distribute_parallel_for_simd:
655  case OMPD_target_simd:
656  case OMPD_target_teams_distribute_simd:
657  return true;
658  case OMPD_target_teams_distribute:
659  return false;
660  case OMPD_target_teams_loop:
661  // Whether this is true or not depends on how the directive will
662  // eventually be emitted.
663  if (auto *TTLD = dyn_cast<OMPTargetTeamsGenericLoopDirective>(&D))
664  return TTLD->canBeParallelFor();
665  return false;
666  case OMPD_parallel:
667  case OMPD_for:
668  case OMPD_parallel_for:
669  case OMPD_parallel_master:
670  case OMPD_parallel_sections:
671  case OMPD_for_simd:
672  case OMPD_parallel_for_simd:
673  case OMPD_cancel:
674  case OMPD_cancellation_point:
675  case OMPD_ordered:
676  case OMPD_threadprivate:
677  case OMPD_allocate:
678  case OMPD_task:
679  case OMPD_simd:
680  case OMPD_sections:
681  case OMPD_section:
682  case OMPD_single:
683  case OMPD_master:
684  case OMPD_critical:
685  case OMPD_taskyield:
686  case OMPD_barrier:
687  case OMPD_taskwait:
688  case OMPD_taskgroup:
689  case OMPD_atomic:
690  case OMPD_flush:
691  case OMPD_depobj:
692  case OMPD_scan:
693  case OMPD_teams:
694  case OMPD_target_data:
695  case OMPD_target_exit_data:
696  case OMPD_target_enter_data:
697  case OMPD_distribute:
698  case OMPD_distribute_simd:
699  case OMPD_distribute_parallel_for:
700  case OMPD_distribute_parallel_for_simd:
701  case OMPD_teams_distribute:
702  case OMPD_teams_distribute_simd:
703  case OMPD_teams_distribute_parallel_for:
704  case OMPD_teams_distribute_parallel_for_simd:
705  case OMPD_target_update:
706  case OMPD_declare_simd:
707  case OMPD_declare_variant:
708  case OMPD_begin_declare_variant:
709  case OMPD_end_declare_variant:
710  case OMPD_declare_target:
711  case OMPD_end_declare_target:
712  case OMPD_declare_reduction:
713  case OMPD_declare_mapper:
714  case OMPD_taskloop:
715  case OMPD_taskloop_simd:
716  case OMPD_master_taskloop:
717  case OMPD_master_taskloop_simd:
718  case OMPD_parallel_master_taskloop:
719  case OMPD_parallel_master_taskloop_simd:
720  case OMPD_requires:
721  case OMPD_unknown:
722  default:
723  break;
724  }
725  llvm_unreachable(
726  "Unknown programming model for OpenMP directive on NVPTX target.");
727 }
728 
729 void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
730  StringRef ParentName,
731  llvm::Function *&OutlinedFn,
732  llvm::Constant *&OutlinedFnID,
733  bool IsOffloadEntry,
734  const RegionCodeGenTy &CodeGen) {
735  ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_NonSPMD);
736  EntryFunctionState EST;
737  WrapperFunctionsMap.clear();
738 
739  [[maybe_unused]] bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
740  assert(!IsBareKernel && "bare kernel should not be at generic mode");
741 
742  // Emit target region as a standalone region.
743  class NVPTXPrePostActionTy : public PrePostActionTy {
744  CGOpenMPRuntimeGPU::EntryFunctionState &EST;
745  const OMPExecutableDirective &D;
746 
747  public:
748  NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST,
749  const OMPExecutableDirective &D)
750  : EST(EST), D(D) {}
751  void Enter(CodeGenFunction &CGF) override {
752  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
753  RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ false);
754  // Skip target region initialization.
755  RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
756  }
757  void Exit(CodeGenFunction &CGF) override {
758  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
759  RT.clearLocThreadIdInsertPt(CGF);
760  RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ false);
761  }
762  } Action(EST, D);
763  CodeGen.setAction(Action);
764  IsInTTDRegion = true;
765  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
766  IsOffloadEntry, CodeGen);
767  IsInTTDRegion = false;
768 }
769 
770 void CGOpenMPRuntimeGPU::emitKernelInit(const OMPExecutableDirective &D,
771  CodeGenFunction &CGF,
772  EntryFunctionState &EST, bool IsSPMD) {
773  int32_t MinThreadsVal = 1, MaxThreadsVal = -1, MinTeamsVal = 1,
774  MaxTeamsVal = -1;
775  computeMinAndMaxThreadsAndTeams(D, CGF, MinThreadsVal, MaxThreadsVal,
776  MinTeamsVal, MaxTeamsVal);
777 
778  CGBuilderTy &Bld = CGF.Builder;
779  Bld.restoreIP(OMPBuilder.createTargetInit(
780  Bld, IsSPMD, MinThreadsVal, MaxThreadsVal, MinTeamsVal, MaxTeamsVal));
781  if (!IsSPMD)
782  emitGenericVarsProlog(CGF, EST.Loc);
783 }
784 
785 void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
786  EntryFunctionState &EST,
787  bool IsSPMD) {
788  if (!IsSPMD)
789  emitGenericVarsEpilog(CGF);
790 
791  // This is temporary until we remove the fixed sized buffer.
792  ASTContext &C = CGM.getContext();
793  RecordDecl *StaticRD = C.buildImplicitRecord(
794  "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::Union);
795  StaticRD->startDefinition();
796  for (const RecordDecl *TeamReductionRec : TeamsReductions) {
797  QualType RecTy = C.getRecordType(TeamReductionRec);
798  auto *Field = FieldDecl::Create(
799  C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
800  C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
801  /*BW=*/nullptr, /*Mutable=*/false,
802  /*InitStyle=*/ICIS_NoInit);
803  Field->setAccess(AS_public);
804  StaticRD->addDecl(Field);
805  }
806  StaticRD->completeDefinition();
807  QualType StaticTy = C.getRecordType(StaticRD);
808  llvm::Type *LLVMReductionsBufferTy =
809  CGM.getTypes().ConvertTypeForMem(StaticTy);
810  const auto &DL = CGM.getModule().getDataLayout();
811  uint64_t ReductionDataSize =
812  TeamsReductions.empty()
813  ? 0
814  : DL.getTypeAllocSize(LLVMReductionsBufferTy).getFixedValue();
815  CGBuilderTy &Bld = CGF.Builder;
816  OMPBuilder.createTargetDeinit(Bld, ReductionDataSize,
817  C.getLangOpts().OpenMPCUDAReductionBufNum);
818  TeamsReductions.clear();
819 }
820 
821 void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
822  StringRef ParentName,
823  llvm::Function *&OutlinedFn,
824  llvm::Constant *&OutlinedFnID,
825  bool IsOffloadEntry,
826  const RegionCodeGenTy &CodeGen) {
827  ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_SPMD);
828  EntryFunctionState EST;
829 
830  bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
831 
832  // Emit target region as a standalone region.
833  class NVPTXPrePostActionTy : public PrePostActionTy {
834  CGOpenMPRuntimeGPU &RT;
835  CGOpenMPRuntimeGPU::EntryFunctionState &EST;
836  bool IsBareKernel;
837  DataSharingMode Mode;
838  const OMPExecutableDirective &D;
839 
840  public:
841  NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT,
842  CGOpenMPRuntimeGPU::EntryFunctionState &EST,
843  bool IsBareKernel, const OMPExecutableDirective &D)
844  : RT(RT), EST(EST), IsBareKernel(IsBareKernel),
845  Mode(RT.CurrentDataSharingMode), D(D) {}
846  void Enter(CodeGenFunction &CGF) override {
847  if (IsBareKernel) {
848  RT.CurrentDataSharingMode = DataSharingMode::DS_CUDA;
849  return;
850  }
851  RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ true);
852  // Skip target region initialization.
853  RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
854  }
855  void Exit(CodeGenFunction &CGF) override {
856  if (IsBareKernel) {
857  RT.CurrentDataSharingMode = Mode;
858  return;
859  }
860  RT.clearLocThreadIdInsertPt(CGF);
861  RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ true);
862  }
863  } Action(*this, EST, IsBareKernel, D);
864  CodeGen.setAction(Action);
865  IsInTTDRegion = true;
866  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
867  IsOffloadEntry, CodeGen);
868  IsInTTDRegion = false;
869 }
870 
871 void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
872  const OMPExecutableDirective &D, StringRef ParentName,
873  llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
874  bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
875  if (!IsOffloadEntry) // Nothing to do.
876  return;
877 
878  assert(!ParentName.empty() && "Invalid target region parent name!");
879 
880  bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
881  bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
882  if (Mode || IsBareKernel)
883  emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
884  CodeGen);
885  else
886  emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
887  CodeGen);
888 }
889 
891  : CGOpenMPRuntime(CGM) {
892  llvm::OpenMPIRBuilderConfig Config(
893  CGM.getLangOpts().OpenMPIsTargetDevice, isGPU(),
894  CGM.getLangOpts().OpenMPOffloadMandatory,
895  /*HasRequiresReverseOffload*/ false, /*HasRequiresUnifiedAddress*/ false,
896  hasRequiresUnifiedSharedMemory(), /*HasRequiresDynamicAllocators*/ false);
897  OMPBuilder.setConfig(Config);
898 
899  if (!CGM.getLangOpts().OpenMPIsTargetDevice)
900  llvm_unreachable("OpenMP can only handle device code.");
901 
902  if (CGM.getLangOpts().OpenMPCUDAMode)
903  CurrentDataSharingMode = CGOpenMPRuntimeGPU::DS_CUDA;
904 
905  llvm::OpenMPIRBuilder &OMPBuilder = getOMPBuilder();
906  if (CGM.getLangOpts().NoGPULib || CGM.getLangOpts().OMPHostIRFile.empty())
907  return;
908 
909  OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTargetDebug,
910  "__omp_rtl_debug_kind");
911  OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTeamSubscription,
912  "__omp_rtl_assume_teams_oversubscription");
913  OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPThreadSubscription,
914  "__omp_rtl_assume_threads_oversubscription");
915  OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoThreadState,
916  "__omp_rtl_assume_no_thread_state");
917  OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoNestedParallelism,
918  "__omp_rtl_assume_no_nested_parallelism");
919 }
920 
922  ProcBindKind ProcBind,
924  // Nothing to do.
925 }
926 
928  llvm::Value *NumThreads,
930  // Nothing to do.
931 }
932 
934  const Expr *NumTeams,
935  const Expr *ThreadLimit,
936  SourceLocation Loc) {}
937 
940  const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,
941  const RegionCodeGenTy &CodeGen) {
942  // Emit target region as a standalone region.
943  bool PrevIsInTTDRegion = IsInTTDRegion;
944  IsInTTDRegion = false;
945  auto *OutlinedFun =
947  CGF, D, ThreadIDVar, InnermostKind, CodeGen));
948  IsInTTDRegion = PrevIsInTTDRegion;
949  if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD) {
950  llvm::Function *WrapperFun =
951  createParallelDataSharingWrapper(OutlinedFun, D);
952  WrapperFunctionsMap[OutlinedFun] = WrapperFun;
953  }
954 
955  return OutlinedFun;
956 }
957 
958 /// Get list of lastprivate variables from the teams distribute ... or
959 /// teams {distribute ...} directives.
960 static void
964  "expected teams directive.");
965  const OMPExecutableDirective *Dir = &D;
968  Ctx,
970  /*IgnoreCaptured=*/true))) {
971  Dir = dyn_cast_or_null<OMPExecutableDirective>(S);
972  if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
973  Dir = nullptr;
974  }
975  }
976  if (!Dir)
977  return;
978  for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
979  for (const Expr *E : C->getVarRefs())
980  Vars.push_back(getPrivateItem(E));
981  }
982 }
983 
984 /// Get list of reduction variables from the teams ... directives.
985 static void
989  "expected teams directive.");
990  for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
991  for (const Expr *E : C->privates())
992  Vars.push_back(getPrivateItem(E));
993  }
994 }
995 
998  const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,
999  const RegionCodeGenTy &CodeGen) {
1001 
1002  const RecordDecl *GlobalizedRD = nullptr;
1003  llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
1004  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
1005  unsigned WarpSize = CGM.getTarget().getGridValue().GV_Warp_Size;
1006  // Globalize team reductions variable unconditionally in all modes.
1007  if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
1008  getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
1009  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
1010  getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
1011  if (!LastPrivatesReductions.empty()) {
1012  GlobalizedRD = ::buildRecordForGlobalizedVars(
1013  CGM.getContext(), std::nullopt, LastPrivatesReductions,
1014  MappedDeclsFields, WarpSize);
1015  }
1016  } else if (!LastPrivatesReductions.empty()) {
1017  assert(!TeamAndReductions.first &&
1018  "Previous team declaration is not expected.");
1019  TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
1020  std::swap(TeamAndReductions.second, LastPrivatesReductions);
1021  }
1022 
1023  // Emit target region as a standalone region.
1024  class NVPTXPrePostActionTy : public PrePostActionTy {
1026  const RecordDecl *GlobalizedRD;
1027  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1028  &MappedDeclsFields;
1029 
1030  public:
1031  NVPTXPrePostActionTy(
1032  SourceLocation &Loc, const RecordDecl *GlobalizedRD,
1033  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1034  &MappedDeclsFields)
1035  : Loc(Loc), GlobalizedRD(GlobalizedRD),
1036  MappedDeclsFields(MappedDeclsFields) {}
1037  void Enter(CodeGenFunction &CGF) override {
1038  auto &Rt =
1039  static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1040  if (GlobalizedRD) {
1041  auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
1042  I->getSecond().MappedParams =
1043  std::make_unique<CodeGenFunction::OMPMapVars>();
1044  DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
1045  for (const auto &Pair : MappedDeclsFields) {
1046  assert(Pair.getFirst()->isCanonicalDecl() &&
1047  "Expected canonical declaration");
1048  Data.insert(std::make_pair(Pair.getFirst(), MappedVarData()));
1049  }
1050  }
1051  Rt.emitGenericVarsProlog(CGF, Loc);
1052  }
1053  void Exit(CodeGenFunction &CGF) override {
1054  static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
1055  .emitGenericVarsEpilog(CGF);
1056  }
1057  } Action(Loc, GlobalizedRD, MappedDeclsFields);
1058  CodeGen.setAction(Action);
1059  llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
1060  CGF, D, ThreadIDVar, InnermostKind, CodeGen);
1061 
1062  return OutlinedFun;
1063 }
1064 
1065 void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
1066  SourceLocation Loc) {
1067  if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
1068  return;
1069 
1070  CGBuilderTy &Bld = CGF.Builder;
1071 
1072  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1073  if (I == FunctionGlobalizedDecls.end())
1074  return;
1075 
1076  for (auto &Rec : I->getSecond().LocalVarData) {
1077  const auto *VD = cast<VarDecl>(Rec.first);
1078  bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
1079  QualType VarTy = VD->getType();
1080 
1081  // Get the local allocation of a firstprivate variable before sharing
1082  llvm::Value *ParValue;
1083  if (EscapedParam) {
1084  LValue ParLVal =
1085  CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
1086  ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
1087  }
1088 
1089  // Allocate space for the variable to be globalized
1090  llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
1091  llvm::CallBase *VoidPtr =
1092  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1093  CGM.getModule(), OMPRTL___kmpc_alloc_shared),
1094  AllocArgs, VD->getName());
1095  // FIXME: We should use the variables actual alignment as an argument.
1096  VoidPtr->addRetAttr(llvm::Attribute::get(
1097  CGM.getLLVMContext(), llvm::Attribute::Alignment,
1099 
1100  // Cast the void pointer and get the address of the globalized variable.
1101  llvm::PointerType *VarPtrTy = CGF.ConvertTypeForMem(VarTy)->getPointerTo();
1102  llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1103  VoidPtr, VarPtrTy, VD->getName() + "_on_stack");
1104  LValue VarAddr =
1105  CGF.MakeNaturalAlignPointeeRawAddrLValue(CastedVoidPtr, VarTy);
1106  Rec.second.PrivateAddr = VarAddr.getAddress();
1107  Rec.second.GlobalizedVal = VoidPtr;
1108 
1109  // Assign the local allocation to the newly globalized location.
1110  if (EscapedParam) {
1111  CGF.EmitStoreOfScalar(ParValue, VarAddr);
1112  I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress());
1113  }
1114  if (auto *DI = CGF.getDebugInfo())
1115  VoidPtr->setDebugLoc(DI->SourceLocToDebugLoc(VD->getLocation()));
1116  }
1117 
1118  for (const auto *ValueD : I->getSecond().EscapedVariableLengthDecls) {
1119  const auto *VD = cast<VarDecl>(ValueD);
1120  std::pair<llvm::Value *, llvm::Value *> AddrSizePair =
1121  getKmpcAllocShared(CGF, VD);
1122  I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(AddrSizePair);
1123  LValue Base = CGF.MakeAddrLValue(AddrSizePair.first, VD->getType(),
1124  CGM.getContext().getDeclAlign(VD),
1126  I->getSecond().MappedParams->setVarAddr(CGF, VD, Base.getAddress());
1127  }
1128  I->getSecond().MappedParams->apply(CGF);
1129 }
1130 
1132  const VarDecl *VD) const {
1133  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1134  if (I == FunctionGlobalizedDecls.end())
1135  return false;
1136 
1137  // Check variable declaration is delayed:
1138  return llvm::is_contained(I->getSecond().DelayedVariableLengthDecls, VD);
1139 }
1140 
1141 std::pair<llvm::Value *, llvm::Value *>
1143  const VarDecl *VD) {
1144  CGBuilderTy &Bld = CGF.Builder;
1145 
1146  // Compute size and alignment.
1147  llvm::Value *Size = CGF.getTypeSize(VD->getType());
1148  CharUnits Align = CGM.getContext().getDeclAlign(VD);
1149  Size = Bld.CreateNUWAdd(
1150  Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
1151  llvm::Value *AlignVal =
1152  llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
1153  Size = Bld.CreateUDiv(Size, AlignVal);
1154  Size = Bld.CreateNUWMul(Size, AlignVal);
1155 
1156  // Allocate space for this VLA object to be globalized.
1157  llvm::Value *AllocArgs[] = {Size};
1158  llvm::CallBase *VoidPtr =
1159  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1160  CGM.getModule(), OMPRTL___kmpc_alloc_shared),
1161  AllocArgs, VD->getName());
1162  VoidPtr->addRetAttr(llvm::Attribute::get(
1163  CGM.getLLVMContext(), llvm::Attribute::Alignment, Align.getQuantity()));
1164 
1165  return std::make_pair(VoidPtr, Size);
1166 }
1167 
1169  CodeGenFunction &CGF,
1170  const std::pair<llvm::Value *, llvm::Value *> &AddrSizePair) {
1171  // Deallocate the memory for each globalized VLA object
1172  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1173  CGM.getModule(), OMPRTL___kmpc_free_shared),
1174  {AddrSizePair.first, AddrSizePair.second});
1175 }
1176 
1177 void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF) {
1178  if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
1179  return;
1180 
1181  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1182  if (I != FunctionGlobalizedDecls.end()) {
1183  // Deallocate the memory for each globalized VLA object that was
1184  // globalized in the prolog (i.e. emitGenericVarsProlog).
1185  for (const auto &AddrSizePair :
1186  llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
1187  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1188  CGM.getModule(), OMPRTL___kmpc_free_shared),
1189  {AddrSizePair.first, AddrSizePair.second});
1190  }
1191  // Deallocate the memory for each globalized value
1192  for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) {
1193  const auto *VD = cast<VarDecl>(Rec.first);
1194  I->getSecond().MappedParams->restore(CGF);
1195 
1196  llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal,
1197  CGF.getTypeSize(VD->getType())};
1198  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1199  CGM.getModule(), OMPRTL___kmpc_free_shared),
1200  FreeArgs);
1201  }
1202  }
1203 }
1204 
1206  const OMPExecutableDirective &D,
1208  llvm::Function *OutlinedFn,
1209  ArrayRef<llvm::Value *> CapturedVars) {
1210  if (!CGF.HaveInsertPoint())
1211  return;
1212 
1213  bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
1214 
1215  RawAddress ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
1216  /*Name=*/".zero.addr");
1217  CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
1218  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
1219  // We don't emit any thread id function call in bare kernel, but because the
1220  // outlined function has a pointer argument, we emit a nullptr here.
1221  if (IsBareKernel)
1222  OutlinedFnArgs.push_back(llvm::ConstantPointerNull::get(CGM.VoidPtrTy));
1223  else
1224  OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).emitRawPointer(CGF));
1225  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
1226  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
1227  emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
1228 }
1229 
1232  llvm::Function *OutlinedFn,
1233  ArrayRef<llvm::Value *> CapturedVars,
1234  const Expr *IfCond,
1235  llvm::Value *NumThreads) {
1236  if (!CGF.HaveInsertPoint())
1237  return;
1238 
1239  auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars, IfCond,
1240  NumThreads](CodeGenFunction &CGF,
1241  PrePostActionTy &Action) {
1242  CGBuilderTy &Bld = CGF.Builder;
1243  llvm::Value *NumThreadsVal = NumThreads;
1244  llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn];
1245  llvm::Value *ID = llvm::ConstantPointerNull::get(CGM.Int8PtrTy);
1246  if (WFn)
1247  ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
1248  llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn, CGM.Int8PtrTy);
1249 
1250  // Create a private scope that will globalize the arguments
1251  // passed from the outside of the target region.
1252  // TODO: Is that needed?
1253  CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);
1254 
1255  Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca(
1256  llvm::ArrayType::get(CGM.VoidPtrTy, CapturedVars.size()),
1257  "captured_vars_addrs");
1258  // There's something to share.
1259  if (!CapturedVars.empty()) {
1260  // Prepare for parallel region. Indicate the outlined function.
1261  ASTContext &Ctx = CGF.getContext();
1262  unsigned Idx = 0;
1263  for (llvm::Value *V : CapturedVars) {
1264  Address Dst = Bld.CreateConstArrayGEP(CapturedVarsAddrs, Idx);
1265  llvm::Value *PtrV;
1266  if (V->getType()->isIntegerTy())
1267  PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
1268  else
1270  CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
1271  Ctx.getPointerType(Ctx.VoidPtrTy));
1272  ++Idx;
1273  }
1274  }
1275 
1276  llvm::Value *IfCondVal = nullptr;
1277  if (IfCond)
1278  IfCondVal = Bld.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.Int32Ty,
1279  /* isSigned */ false);
1280  else
1281  IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1);
1282 
1283  if (!NumThreadsVal)
1284  NumThreadsVal = llvm::ConstantInt::get(CGF.Int32Ty, -1);
1285  else
1286  NumThreadsVal = Bld.CreateZExtOrTrunc(NumThreadsVal, CGF.Int32Ty),
1287 
1288  assert(IfCondVal && "Expected a value");
1289  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
1290  llvm::Value *Args[] = {
1291  RTLoc,
1292  getThreadID(CGF, Loc),
1293  IfCondVal,
1294  NumThreadsVal,
1295  llvm::ConstantInt::get(CGF.Int32Ty, -1),
1296  FnPtr,
1297  ID,
1298  Bld.CreateBitOrPointerCast(CapturedVarsAddrs.emitRawPointer(CGF),
1299  CGF.VoidPtrPtrTy),
1300  llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
1301  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1302  CGM.getModule(), OMPRTL___kmpc_parallel_51),
1303  Args);
1304  };
1305 
1306  RegionCodeGenTy RCG(ParallelGen);
1307  RCG(CGF);
1308 }
1309 
1310 void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) {
1311  // Always emit simple barriers!
1312  if (!CGF.HaveInsertPoint())
1313  return;
1314  // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
1315  // This function does not use parameters, so we can emit just default values.
1316  llvm::Value *Args[] = {
1317  llvm::ConstantPointerNull::get(
1318  cast<llvm::PointerType>(getIdentTyPointerTy())),
1319  llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)};
1320  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1321  CGM.getModule(), OMPRTL___kmpc_barrier_simple_spmd),
1322  Args);
1323 }
1324 
1327  OpenMPDirectiveKind Kind, bool,
1328  bool) {
1329  // Always emit simple barriers!
1330  if (!CGF.HaveInsertPoint())
1331  return;
1332  // Build call __kmpc_cancel_barrier(loc, thread_id);
1333  unsigned Flags = getDefaultFlagsForBarriers(Kind);
1334  llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
1335  getThreadID(CGF, Loc)};
1336 
1337  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1338  CGM.getModule(), OMPRTL___kmpc_barrier),
1339  Args);
1340 }
1341 
1343  CodeGenFunction &CGF, StringRef CriticalName,
1344  const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
1345  const Expr *Hint) {
1346  llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
1347  llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
1348  llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
1349  llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
1350  llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");
1351 
1352  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1353 
1354  // Get the mask of active threads in the warp.
1355  llvm::Value *Mask = CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1356  CGM.getModule(), OMPRTL___kmpc_warp_active_thread_mask));
1357  // Fetch team-local id of the thread.
1358  llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
1359 
1360  // Get the width of the team.
1361  llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF);
1362 
1363  // Initialize the counter variable for the loop.
1364  QualType Int32Ty =
1365  CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
1366  Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
1367  LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
1368  CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
1369  /*isInit=*/true);
1370 
1371  // Block checks if loop counter exceeds upper bound.
1372  CGF.EmitBlock(LoopBB);
1373  llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
1374  llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);
1375  CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);
1376 
1377  // Block tests which single thread should execute region, and which threads
1378  // should go straight to synchronisation point.
1379  CGF.EmitBlock(TestBB);
1380  CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
1381  llvm::Value *CmpThreadToCounter =
1382  CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);
1383  CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);
1384 
1385  // Block emits the body of the critical region.
1386  CGF.EmitBlock(BodyBB);
1387 
1388  // Output the critical statement.
1389  CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
1390  Hint);
1391 
1392  // After the body surrounded by the critical region, the single executing
1393  // thread will jump to the synchronisation point.
1394  // Block waits for all threads in current team to finish then increments the
1395  // counter variable and returns to the loop.
1396  CGF.EmitBlock(SyncBB);
1397  // Reconverge active threads in the warp.
1398  (void)CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1399  CGM.getModule(), OMPRTL___kmpc_syncwarp),
1400  Mask);
1401 
1402  llvm::Value *IncCounterVal =
1403  CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
1404  CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
1405  CGF.EmitBranch(LoopBB);
1406 
1407  // Block that is reached when all threads in the team complete the region.
1408  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
1409 }
1410 
1411 /// Cast value to the specified type.
1412 static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
1413  QualType ValTy, QualType CastTy,
1414  SourceLocation Loc) {
1415  assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
1416  "Cast type must sized.");
1417  assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
1418  "Val type must sized.");
1419  llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
1420  if (ValTy == CastTy)
1421  return Val;
1422  if (CGF.getContext().getTypeSizeInChars(ValTy) ==
1423  CGF.getContext().getTypeSizeInChars(CastTy))
1424  return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
1425  if (CastTy->isIntegerType() && ValTy->isIntegerType())
1426  return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
1427  CastTy->hasSignedIntegerRepresentation());
1428  Address CastItem = CGF.CreateMemTemp(CastTy);
1429  Address ValCastItem = CastItem.withElementType(Val->getType());
1430  CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy,
1432  TBAAAccessInfo());
1433  return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc,
1435  TBAAAccessInfo());
1436 }
1437 
1438 /// This function creates calls to one of two shuffle functions to copy
1439 /// variables between lanes in a warp.
1441  llvm::Value *Elem,
1442  QualType ElemType,
1443  llvm::Value *Offset,
1444  SourceLocation Loc) {
1445  CodeGenModule &CGM = CGF.CGM;
1446  CGBuilderTy &Bld = CGF.Builder;
1447  CGOpenMPRuntimeGPU &RT =
1448  *(static_cast<CGOpenMPRuntimeGPU *>(&CGM.getOpenMPRuntime()));
1449  llvm::OpenMPIRBuilder &OMPBuilder = RT.getOMPBuilder();
1450 
1451  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
1452  assert(Size.getQuantity() <= 8 &&
1453  "Unsupported bitwidth in shuffle instruction.");
1454 
1455  RuntimeFunction ShuffleFn = Size.getQuantity() <= 4
1456  ? OMPRTL___kmpc_shuffle_int32
1457  : OMPRTL___kmpc_shuffle_int64;
1458 
1459  // Cast all types to 32- or 64-bit values before calling shuffle routines.
1460  QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
1461  Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
1462  llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
1463  llvm::Value *WarpSize =
1464  Bld.CreateIntCast(RT.getGPUWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);
1465 
1466  llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
1467  OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), ShuffleFn),
1468  {ElemCast, Offset, WarpSize});
1469 
1470  return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
1471 }
1472 
1473 static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
1474  Address DestAddr, QualType ElemType,
1475  llvm::Value *Offset, SourceLocation Loc) {
1476  CGBuilderTy &Bld = CGF.Builder;
1477 
1478  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
1479  // Create the loop over the big sized data.
1480  // ptr = (void*)Elem;
1481  // ptrEnd = (void*) Elem + 1;
1482  // Step = 8;
1483  // while (ptr + Step < ptrEnd)
1484  // shuffle((int64_t)*ptr);
1485  // Step = 4;
1486  // while (ptr + Step < ptrEnd)
1487  // shuffle((int32_t)*ptr);
1488  // ...
1489  Address ElemPtr = DestAddr;
1490  Address Ptr = SrcAddr;
1492  Bld.CreateConstGEP(SrcAddr, 1), CGF.VoidPtrTy, CGF.Int8Ty);
1493  for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
1494  if (Size < CharUnits::fromQuantity(IntSize))
1495  continue;
1496  QualType IntType = CGF.getContext().getIntTypeForBitwidth(
1497  CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)),
1498  /*Signed=*/1);
1499  llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType);
1500  Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo(),
1501  IntTy);
1503  ElemPtr, IntTy->getPointerTo(), IntTy);
1504  if (Size.getQuantity() / IntSize > 1) {
1505  llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond");
1506  llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then");
1507  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit");
1508  llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
1509  CGF.EmitBlock(PreCondBB);
1510  llvm::PHINode *PhiSrc =
1511  Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2);
1512  PhiSrc->addIncoming(Ptr.emitRawPointer(CGF), CurrentBB);
1513  llvm::PHINode *PhiDest =
1514  Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2);
1515  PhiDest->addIncoming(ElemPtr.emitRawPointer(CGF), CurrentBB);
1516  Ptr = Address(PhiSrc, Ptr.getElementType(), Ptr.getAlignment());
1517  ElemPtr =
1518  Address(PhiDest, ElemPtr.getElementType(), ElemPtr.getAlignment());
1519  llvm::Value *PtrEndRaw = PtrEnd.emitRawPointer(CGF);
1520  llvm::Value *PtrRaw = Ptr.emitRawPointer(CGF);
1521  llvm::Value *PtrDiff = Bld.CreatePtrDiff(
1522  CGF.Int8Ty, PtrEndRaw,
1524  Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
1525  ThenBB, ExitBB);
1526  CGF.EmitBlock(ThenBB);
1527  llvm::Value *Res = createRuntimeShuffleFunction(
1528  CGF,
1529  CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
1531  TBAAAccessInfo()),
1532  IntType, Offset, Loc);
1533  CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
1535  TBAAAccessInfo());
1536  Address LocalPtr = Bld.CreateConstGEP(Ptr, 1);
1537  Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
1538  PhiSrc->addIncoming(LocalPtr.emitRawPointer(CGF), ThenBB);
1539  PhiDest->addIncoming(LocalElemPtr.emitRawPointer(CGF), ThenBB);
1540  CGF.EmitBranch(PreCondBB);
1541  CGF.EmitBlock(ExitBB);
1542  } else {
1543  llvm::Value *Res = createRuntimeShuffleFunction(
1544  CGF,
1545  CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
1547  TBAAAccessInfo()),
1548  IntType, Offset, Loc);
1549  CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
1551  TBAAAccessInfo());
1552  Ptr = Bld.CreateConstGEP(Ptr, 1);
1553  ElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
1554  }
1555  Size = Size % IntSize;
1556  }
1557 }
1558 
1559 namespace {
1560 enum CopyAction : unsigned {
1561  // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
1562  // the warp using shuffle instructions.
1563  RemoteLaneToThread,
1564  // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
1565  ThreadCopy,
1566 };
1567 } // namespace
1568 
1569 struct CopyOptionsTy {
1570  llvm::Value *RemoteLaneOffset;
1571  llvm::Value *ScratchpadIndex;
1572  llvm::Value *ScratchpadWidth;
1573 };
1574 
1575 /// Emit instructions to copy a Reduce list, which contains partially
1576 /// aggregated values, in the specified direction.
1577 static void emitReductionListCopy(
1578  CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
1579  ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
1580  CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {
1581 
1582  CodeGenModule &CGM = CGF.CGM;
1583  ASTContext &C = CGM.getContext();
1584  CGBuilderTy &Bld = CGF.Builder;
1585 
1586  llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
1587 
1588  // Iterate, element by element, through the source Reduce list and
1589  // make a copy.
1590  unsigned Idx = 0;
1591  for (const Expr *Private : Privates) {
1592  Address SrcElementAddr = Address::invalid();
1593  Address DestElementAddr = Address::invalid();
1594  Address DestElementPtrAddr = Address::invalid();
1595  // Should we shuffle in an element from a remote lane?
1596  bool ShuffleInElement = false;
1597  // Set to true to update the pointer in the dest Reduce list to a
1598  // newly created element.
1599  bool UpdateDestListPtr = false;
1600  QualType PrivatePtrType = C.getPointerType(Private->getType());
1601  llvm::Type *PrivateLlvmPtrType = CGF.ConvertType(PrivatePtrType);
1602 
1603  switch (Action) {
1604  case RemoteLaneToThread: {
1605  // Step 1.1: Get the address for the src element in the Reduce list.
1606  Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
1607  SrcElementAddr = CGF.EmitLoadOfPointer(
1608  SrcElementPtrAddr.withElementType(PrivateLlvmPtrType),
1609  PrivatePtrType->castAs<PointerType>());
1610 
1611  // Step 1.2: Create a temporary to store the element in the destination
1612  // Reduce list.
1613  DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
1614  DestElementAddr =
1615  CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
1616  ShuffleInElement = true;
1617  UpdateDestListPtr = true;
1618  break;
1619  }
1620  case ThreadCopy: {
1621  // Step 1.1: Get the address for the src element in the Reduce list.
1622  Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
1623  SrcElementAddr = CGF.EmitLoadOfPointer(
1624  SrcElementPtrAddr.withElementType(PrivateLlvmPtrType),
1625  PrivatePtrType->castAs<PointerType>());
1626 
1627  // Step 1.2: Get the address for dest element. The destination
1628  // element has already been created on the thread's stack.
1629  DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
1630  DestElementAddr = CGF.EmitLoadOfPointer(
1631  DestElementPtrAddr.withElementType(PrivateLlvmPtrType),
1632  PrivatePtrType->castAs<PointerType>());
1633  break;
1634  }
1635  }
1636 
1637  // Regardless of src and dest of copy, we emit the load of src
1638  // element as this is required in all directions
1639  SrcElementAddr = SrcElementAddr.withElementType(
1640  CGF.ConvertTypeForMem(Private->getType()));
1641  DestElementAddr =
1642  DestElementAddr.withElementType(SrcElementAddr.getElementType());
1643 
1644  // Now that all active lanes have read the element in the
1645  // Reduce list, shuffle over the value from the remote lane.
1646  if (ShuffleInElement) {
1647  shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
1648  RemoteLaneOffset, Private->getExprLoc());
1649  } else {
1650  switch (CGF.getEvaluationKind(Private->getType())) {
1651  case TEK_Scalar: {
1652  llvm::Value *Elem = CGF.EmitLoadOfScalar(
1653  SrcElementAddr, /*Volatile=*/false, Private->getType(),
1654  Private->getExprLoc(), LValueBaseInfo(AlignmentSource::Type),
1655  TBAAAccessInfo());
1656  // Store the source element value to the dest element address.
1657  CGF.EmitStoreOfScalar(
1658  Elem, DestElementAddr, /*Volatile=*/false, Private->getType(),
1659  LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
1660  break;
1661  }
1662  case TEK_Complex: {
1663  CodeGenFunction::ComplexPairTy Elem = CGF.EmitLoadOfComplex(
1664  CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
1665  Private->getExprLoc());
1666  CGF.EmitStoreOfComplex(
1667  Elem, CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
1668  /*isInit=*/false);
1669  break;
1670  }
1671  case TEK_Aggregate:
1672  CGF.EmitAggregateCopy(
1673  CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
1674  CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
1675  Private->getType(), AggValueSlot::DoesNotOverlap);
1676  break;
1677  }
1678  }
1679 
1680  // Step 3.1: Modify reference in dest Reduce list as needed.
1681  // Modifying the reference in Reduce list to point to the newly
1682  // created element. The element is live in the current function
1683  // scope and that of functions it invokes (i.e., reduce_function).
1684  // RemoteReduceData[i] = (void*)&RemoteElem
1685  if (UpdateDestListPtr) {
1686  CGF.EmitStoreOfScalar(
1687  Bld.CreatePointerBitCastOrAddrSpaceCast(
1688  DestElementAddr.emitRawPointer(CGF), CGF.VoidPtrTy),
1689  DestElementPtrAddr, /*Volatile=*/false, C.VoidPtrTy);
1690  }
1691 
1692  ++Idx;
1693  }
1694 }
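// Net effect for Action == RemoteLaneToThread, in C-like pseudocode (an
// editor's sketch; 'shuffle_get' stands in for the warp-shuffle sequence
// emitted by shuffleAndStore):
//
//   for (unsigned I = 0; I < NumPrivates; ++I) {
//     T Tmp = shuffle_get(*(T *)Src[I], RemoteLaneOffset); // remote value
//     Dest[I] = &Tmp; // publish a fresh stack temporary
//   }
//
// For Action == ThreadCopy, *Dest[I] = *Src[I] with no shuffle involved.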
1695 
1696 /// This function emits a helper that gathers Reduce lists from the first
1697 /// lane of every active warp to lanes in the first warp.
1698 ///
1699 /// void inter_warp_copy_func(void* reduce_data, int num_warps)
1700 /// shared smem[warp_size];
1701 /// For all data entries D in reduce_data:
1702 /// sync
1703 /// If (I am the first lane in each warp)
1704 /// Copy my local D to smem[warp_id]
1705 /// sync
1706 /// if (I am the first warp)
1707 /// Copy smem[thread_id] to my local D
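/// A CUDA-flavored sketch of the emitted helper (illustrative only; the
/// names 'medium', 'chunk', 'lane_id', 'warp_id' and 'thread_id' are
/// stand-ins, and the real body is emitted as IR below):
///
///   __device__ void inter_warp_copy(void **reduce_data, int num_warps) {
///     __shared__ int32_t medium[WARP_SIZE];
///     for (each 32-bit chunk C of each entry D in reduce_data) {
///       __syncthreads();
///       if (lane_id == 0) // warp master publishes its chunk
///         medium[warp_id] = chunk(D, C);
///       __syncthreads();
///       if (thread_id < num_warps) // lanes of warp 0 gather the chunks
///         chunk(D, C) = medium[thread_id];
///     }
///   }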
1708 static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
1709  ArrayRef<const Expr *> Privates,
1710  QualType ReductionArrayTy,
1711  SourceLocation Loc) {
1712  ASTContext &C = CGM.getContext();
1713  llvm::Module &M = CGM.getModule();
1714 
1715  // ReduceList: thread local Reduce list.
1716  // At the stage of the computation when this function is called, partially
1717  // aggregated values reside in the first lane of every active warp.
1718  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
1719  C.VoidPtrTy, ImplicitParamKind::Other);
1720  // NumWarps: number of warps active in the parallel region. This could
1721  // be smaller than 32 (max warps in a CTA) for partial block reduction.
1722  ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
1723  C.getIntTypeForBitwidth(32, /* Signed */ true),
1724  ImplicitParamKind::Other);
1725  FunctionArgList Args;
1726  Args.push_back(&ReduceListArg);
1727  Args.push_back(&NumWarpsArg);
1728 
1729  const CGFunctionInfo &CGFI =
1730  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
1731  auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(CGFI),
1732  llvm::GlobalValue::InternalLinkage,
1733  "_omp_reduction_inter_warp_copy_func", &M);
1734  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
1735  Fn->setDoesNotRecurse();
1736  CodeGenFunction CGF(CGM);
1737  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
1738 
1739  CGBuilderTy &Bld = CGF.Builder;
1740 
1741  // This array is used as a medium to transfer, one reduce element at a time,
1742  // the data from the first lane of every warp to lanes in the first warp
1743  // in order to perform the final step of a reduction in a parallel region
1744  // (reduction across warps). The array is placed in NVPTX __shared__ memory
1745  // for reduced latency, as well as to have a distinct copy for concurrently
1746  // executing target regions. The array is declared with weak linkage so
1747  // as to be shared across compilation units.
1748  StringRef TransferMediumName =
1749  "__openmp_nvptx_data_transfer_temporary_storage";
1750  llvm::GlobalVariable *TransferMedium =
1751  M.getGlobalVariable(TransferMediumName);
1752  unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
1753  if (!TransferMedium) {
1754  auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
1755  unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
1756  TransferMedium = new llvm::GlobalVariable(
1757  M, Ty, /*isConstant=*/false, llvm::GlobalVariable::WeakAnyLinkage,
1758  llvm::UndefValue::get(Ty), TransferMediumName,
1759  /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
1760  SharedAddressSpace);
1761  CGM.addCompilerUsedGlobal(TransferMedium);
1762  }
1763 
1764  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1765  // Get the CUDA thread id of the current OpenMP thread on the GPU.
1766  llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
1767  // nvptx_lane_id = nvptx_id % warpsize
1768  llvm::Value *LaneID = getNVPTXLaneID(CGF);
1769  // nvptx_warp_id = nvptx_id / warpsize
1770  llvm::Value *WarpID = getNVPTXWarpID(CGF);
1771 
1772  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
1773  llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
1774  Address LocalReduceList(
1775  Bld.CreatePointerBitCastOrAddrSpaceCast(
1776  CGF.EmitLoadOfScalar(
1777  AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc,
1778  LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()),
1779  ElemTy->getPointerTo()),
1780  ElemTy, CGF.getPointerAlign());
1781 
1782  unsigned Idx = 0;
1783  for (const Expr *Private : Privates) {
1784  //
1785  // Warp master copies reduce element to transfer medium in __shared__
1786  // memory.
1787  //
1788  unsigned RealTySize =
1789  C.getTypeSizeInChars(Private->getType())
1790  .alignTo(C.getTypeAlignInChars(Private->getType()))
1791  .getQuantity();
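    // The copy below is decomposed into 4-, 2-, and 1-byte transfers; for
    // example (sketch), a 7-byte payload makes one 4-byte, one 2-byte, and
    // one 1-byte round trip through the transfer medium.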
1792  for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
1793  unsigned NumIters = RealTySize / TySize;
1794  if (NumIters == 0)
1795  continue;
1796  QualType CType = C.getIntTypeForBitwidth(
1797  C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1);
1798  llvm::Type *CopyType = CGF.ConvertTypeForMem(CType);
1799  CharUnits Align = CharUnits::fromQuantity(TySize);
1800  llvm::Value *Cnt = nullptr;
1801  Address CntAddr = Address::invalid();
1802  llvm::BasicBlock *PrecondBB = nullptr;
1803  llvm::BasicBlock *ExitBB = nullptr;
1804  if (NumIters > 1) {
1805  CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr");
1806  CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr,
1807  /*Volatile=*/false, C.IntTy);
1808  PrecondBB = CGF.createBasicBlock("precond");
1809  ExitBB = CGF.createBasicBlock("exit");
1810  llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body");
1811  // There is no need to emit line number for unconditional branch.
1812  (void)ApplyDebugLocation::CreateEmpty(CGF);
1813  CGF.EmitBlock(PrecondBB);
1814  Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc);
1815  llvm::Value *Cmp =
1816  Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters));
1817  Bld.CreateCondBr(Cmp, BodyBB, ExitBB);
1818  CGF.EmitBlock(BodyBB);
1819  }
1820  // kmpc_barrier.
1821  CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
1822  /*EmitChecks=*/false,
1823  /*ForceSimpleCall=*/true);
1824  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
1825  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
1826  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
1827 
1828  // if (lane_id == 0)
1829  llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
1830  Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
1831  CGF.EmitBlock(ThenBB);
1832 
1833  // Reduce element = LocalReduceList[i]
1834  Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
1835  llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
1836  ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
1837  // elemptr = ((CopyType*)(elemptrptr)) + I
1838  Address ElemPtr(ElemPtrPtr, CopyType, Align);
1839  if (NumIters > 1)
1840  ElemPtr = Bld.CreateGEP(CGF, ElemPtr, Cnt);
1841 
1842  // Get pointer to location in transfer medium.
1843  // MediumPtr = &medium[warp_id]
1844  llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
1845  TransferMedium->getValueType(), TransferMedium,
1846  {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
1847  // Casting to actual data type.
1848  // MediumPtr = (CopyType*)MediumPtrAddr;
1849  Address MediumPtr(MediumPtrVal, CopyType, Align);
1850 
1851  // elem = *elemptr
1852  //*MediumPtr = elem
1853  llvm::Value *Elem = CGF.EmitLoadOfScalar(
1854  ElemPtr, /*Volatile=*/false, CType, Loc,
1855  LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
1856  // Store the source element value to the dest element address.
1857  CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType,
1858  LValueBaseInfo(AlignmentSource::Type),
1859  TBAAAccessInfo());
1860 
1861  Bld.CreateBr(MergeBB);
1862 
1863  CGF.EmitBlock(ElseBB);
1864  Bld.CreateBr(MergeBB);
1865 
1866  CGF.EmitBlock(MergeBB);
1867 
1868  // kmpc_barrier.
1869  CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
1870  /*EmitChecks=*/false,
1871  /*ForceSimpleCall=*/true);
1872 
1873  //
1874  // Warp 0 copies reduce element from transfer medium.
1875  //
1876  llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
1877  llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
1878  llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");
1879 
1880  Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
1881  llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
1882  AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc);
1883 
1884  // Up to 32 threads in warp 0 are active.
1885  llvm::Value *IsActiveThread =
1886  Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
1887  Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
1888 
1889  CGF.EmitBlock(W0ThenBB);
1890 
1891  // SrcMediumPtr = &medium[tid]
1892  llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
1893  TransferMedium->getValueType(), TransferMedium,
1894  {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
1895  // SrcMediumVal = *SrcMediumPtr;
1896  Address SrcMediumPtr(SrcMediumPtrVal, CopyType, Align);
1897 
1898  // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
1899  Address TargetElemPtrPtr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
1900  llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
1901  TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc);
1902  Address TargetElemPtr(TargetElemPtrVal, CopyType, Align);
1903  if (NumIters > 1)
1904  TargetElemPtr = Bld.CreateGEP(CGF, TargetElemPtr, Cnt);
1905 
1906  // *TargetElemPtr = SrcMediumVal;
1907  llvm::Value *SrcMediumValue =
1908  CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/true, CType, Loc);
1909  CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
1910  CType);
1911  Bld.CreateBr(W0MergeBB);
1912 
1913  CGF.EmitBlock(W0ElseBB);
1914  Bld.CreateBr(W0MergeBB);
1915 
1916  CGF.EmitBlock(W0MergeBB);
1917 
1918  if (NumIters > 1) {
1919  Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1));
1920  CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy);
1921  CGF.EmitBranch(PrecondBB);
1922  (void)ApplyDebugLocation::CreateEmpty(CGF);
1923  CGF.EmitBlock(ExitBB);
1924  }
1925  RealTySize %= TySize;
1926  }
1927  ++Idx;
1928  }
1929 
1930  CGF.FinishFunction();
1931  return Fn;
1932 }
1933 
1934 /// Emit a helper that reduces data across two OpenMP threads (lanes)
1935 /// in the same warp. It uses shuffle instructions to copy over data from
1936 /// a remote lane's stack. The reduction algorithm performed is specified
1937 /// by the fourth parameter.
1938 ///
1939 /// Algorithm Versions.
1940 /// Full Warp Reduce (argument value 0):
1941 /// This algorithm assumes that all 32 lanes are active and gathers
1942 /// data from these 32 lanes, producing a single resultant value.
1943 /// Contiguous Partial Warp Reduce (argument value 1):
1944 /// This algorithm assumes that only a *contiguous* subset of lanes
1945 /// are active. This happens for the last warp in a parallel region
1946 /// when the user-specified num_threads is not an integer multiple of
1947 /// 32. This contiguous subset always starts with the zeroth lane.
1948 /// Partial Warp Reduce (argument value 2):
1949 /// This algorithm gathers data from any number of lanes at any position.
1950 /// All reduced values are stored in the lowest possible lane. The set
1951 /// of problems every algorithm addresses is a superset of those
1952 /// addressable by algorithms with a lower version number. Overhead
1953 /// increases as algorithm version increases.
1954 ///
1955 /// Terminology
1956 /// Reduce element:
1957 /// Reduce element refers to an individual data field of primitive
1958 /// type that is combined and reduced across threads.
1959 /// Reduce list:
1960 /// Reduce list refers to a collection of local, thread-private
1961 /// reduce elements.
1962 /// Remote Reduce list:
1963 /// Remote Reduce list refers to a collection of remote (relative to
1964 /// the current thread) reduce elements.
1965 ///
1966 /// We distinguish between three states of threads that are important to
1967 /// the implementation of this function.
1968 /// Alive threads:
1969 /// Threads in a warp executing the SIMT instruction, as distinguished from
1970 /// threads that are inactive due to divergent control flow.
1971 /// Active threads:
1972 /// The minimal set of threads that has to be alive upon entry to this
1973 /// function. The computation is correct iff active threads are alive.
1974 /// Some threads are alive but they are not active because they do not
1975 /// contribute to the computation in any useful manner. Turning them off
1976 /// may introduce control flow overheads without any tangible benefits.
1977 /// Effective threads:
1978 /// In order to comply with the argument requirements of the shuffle
1979 /// function, we must keep all lanes holding data alive. But at most
1980 /// half of them perform value aggregation; we refer to this half of
1981 /// threads as effective. The other half is simply handing off their
1982 /// data.
1983 ///
1984 /// Procedure
1985 /// Value shuffle:
1986 /// In this step active threads transfer data from higher lane positions
1987 /// in the warp to lower lane positions, creating Remote Reduce list.
1988 /// Value aggregation:
1989 /// In this step, effective threads combine their thread local Reduce list
1990 /// with Remote Reduce list and store the result in the thread local
1991 /// Reduce list.
1992 /// Value copy:
1993 /// In this step, we deal with the assumption made by algorithm 2
1994 /// (i.e. contiguity assumption). When we have an odd number of lanes
1995 /// active, say 2k+1, only k threads will be effective and therefore k
1996 /// new values will be produced. However, the Reduce list owned by the
1997 /// (2k+1)th thread is ignored in the value aggregation. Therefore
1998 /// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
1999 /// that the contiguity assumption still holds.
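/// Condensed into C, the predicates emitted below are (sketch):
///
///   bool do_reduce = AlgoVer == 0 ||
///                    (AlgoVer == 1 && LaneId < RemoteLaneOffset) ||
///                    (AlgoVer == 2 && (LaneId & 1) == 0 &&
///                     RemoteLaneOffset > 0);
///   if (do_reduce)
///     reduce_function(LocalReduceList, RemoteReduceList);
///   if (AlgoVer == 1 && LaneId >= RemoteLaneOffset) // restore contiguity
///     copy(RemoteReduceList, LocalReduceList);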
2000 static llvm::Function *emitShuffleAndReduceFunction(
2001  CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2002  QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc) {
2003  ASTContext &C = CGM.getContext();
2004 
2005  // Thread local Reduce list used to host the values of data to be reduced.
2006  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2007  C.VoidPtrTy, ImplicitParamKind::Other);
2008  // Current lane id; could be logical.
2009  ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,
2010  ImplicitParamKind::Other);
2011  // Offset of the remote source lane relative to the current lane.
2012  ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2013  C.ShortTy, ImplicitParamKind::Other);
2014  // Algorithm version. This is expected to be known at compile time.
2015  ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2016  C.ShortTy, ImplicitParamKind::Other);
2017  FunctionArgList Args;
2018  Args.push_back(&ReduceListArg);
2019  Args.push_back(&LaneIDArg);
2020  Args.push_back(&RemoteLaneOffsetArg);
2021  Args.push_back(&AlgoVerArg);
2022 
2023  const CGFunctionInfo &CGFI =
2024  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2025  auto *Fn = llvm::Function::Create(
2026  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2027  "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
2028  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2029  Fn->setDoesNotRecurse();
2030 
2031  CodeGenFunction CGF(CGM);
2032  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2033 
2034  CGBuilderTy &Bld = CGF.Builder;
2035 
2036  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2037  llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
2038  Address LocalReduceList(
2039  Bld.CreatePointerBitCastOrAddrSpaceCast(
2040  CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2041  C.VoidPtrTy, SourceLocation()),
2042  ElemTy->getPointerTo()),
2043  ElemTy, CGF.getPointerAlign());
2044 
2045  Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
2046  llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
2047  AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2048 
2049  Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
2050  llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
2051  AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2052 
2053  Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
2054  llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
2055  AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2056 
2057  // Create a local thread-private variable to host the Reduce list
2058  // from a remote lane.
2059  Address RemoteReduceList =
2060  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");
2061 
2062  // This loop iterates through the list of reduce elements and copies,
2063  // element by element, from a remote lane in the warp to RemoteReduceList,
2064  // hosted on the thread's stack.
2065  emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
2066  LocalReduceList, RemoteReduceList,
2067  {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
2068  /*ScratchpadIndex=*/nullptr,
2069  /*ScratchpadWidth=*/nullptr});
2070 
2071  // The actions to be performed on the Remote Reduce list are dependent
2072  // on the algorithm version.
2073  //
2074  // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
2075  // LaneId % 2 == 0 && Offset > 0):
2076  // do the reduction value aggregation
2077  //
2078  // The thread local variable Reduce list is mutated in place to host the
2079  // reduced data, which is the aggregated value produced from local and
2080  // remote lanes.
2081  //
2082  // Note that AlgoVer is expected to be a constant integer known at compile
2083  // time.
2084  // When AlgoVer==0, the first conjunction evaluates to true, making
2085  // the entire predicate true at compile time.
2086  // When AlgoVer==1, only the second part of the second conjunction needs
2087  // to be evaluated at runtime; the other conjunctions fold to false
2088  // at compile time.
2089  // When AlgoVer==2, only the second part of the third conjunction needs
2090  // to be evaluated at runtime; the other conjunctions fold to false
2091  // at compile time.
2092  llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal);
2093 
2094  llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
2095  llvm::Value *CondAlgo1 = Bld.CreateAnd(
2096  Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));
2097 
2098  llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
2099  llvm::Value *CondAlgo2 = Bld.CreateAnd(
2100  Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1))));
2101  CondAlgo2 = Bld.CreateAnd(
2102  CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));
2103 
2104  llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
2105  CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);
2106 
2107  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
2108  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
2109  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
2110  Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
2111 
2112  CGF.EmitBlock(ThenBB);
2113  // reduce_function(LocalReduceList, RemoteReduceList)
2114  llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2115  LocalReduceList.emitRawPointer(CGF), CGF.VoidPtrTy);
2116  llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2117  RemoteReduceList.emitRawPointer(CGF), CGF.VoidPtrTy);
2118  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2119  CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
2120  Bld.CreateBr(MergeBB);
2121 
2122  CGF.EmitBlock(ElseBB);
2123  Bld.CreateBr(MergeBB);
2124 
2125  CGF.EmitBlock(MergeBB);
2126 
2127  // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
2128  // Reduce list.
2129  Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
2130  llvm::Value *CondCopy = Bld.CreateAnd(
2131  Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));
2132 
2133  llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
2134  llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
2135  llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
2136  Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
2137 
2138  CGF.EmitBlock(CpyThenBB);
2139  emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
2140  RemoteReduceList, LocalReduceList);
2141  Bld.CreateBr(CpyMergeBB);
2142 
2143  CGF.EmitBlock(CpyElseBB);
2144  Bld.CreateBr(CpyMergeBB);
2145 
2146  CGF.EmitBlock(CpyMergeBB);
2147 
2148  CGF.FinishFunction();
2149  return Fn;
2150 }
2151 
2152 /// This function emits a helper that copies all the reduction variables from
2153 /// the team into the provided global buffer for the reduction variables.
2154 ///
2155 /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
2156 /// For all data entries D in reduce_data:
2157 /// Copy local D to buffer.D[Idx]
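/// In C terms the emitted helper behaves like (editor's sketch; 'Rec' and
/// the D_i/T_i field and type names are placeholders for the record built
/// from the reduction variables):
///
///   void list_to_global_copy(Rec *buffer, int Idx, void **reduce_data) {
///     buffer[Idx].D_0 = *(T_0 *)reduce_data[0];
///     ...
///     buffer[Idx].D_n = *(T_n *)reduce_data[n];
///   }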
2158 static llvm::Value *emitListToGlobalCopyFunction(
2159  CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2160  QualType ReductionArrayTy, SourceLocation Loc,
2161  const RecordDecl *TeamReductionRec,
2162  const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2163  &VarFieldMap) {
2164  ASTContext &C = CGM.getContext();
2165 
2166  // Buffer: global reduction buffer.
2167  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2168  C.VoidPtrTy, ImplicitParamKind::Other);
2169  // Idx: index of the buffer.
2170  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2171  ImplicitParamKind::Other);
2172  // ReduceList: thread local Reduce list.
2173  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2174  C.VoidPtrTy, ImplicitParamKind::Other);
2175  FunctionArgList Args;
2176  Args.push_back(&BufferArg);
2177  Args.push_back(&IdxArg);
2178  Args.push_back(&ReduceListArg);
2179 
2180  const CGFunctionInfo &CGFI =
2181  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2182  auto *Fn = llvm::Function::Create(
2183  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2184  "_omp_reduction_list_to_global_copy_func", &CGM.getModule());
2185  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2186  Fn->setDoesNotRecurse();
2187  CodeGenFunction CGF(CGM);
2188  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2189 
2190  CGBuilderTy &Bld = CGF.Builder;
2191 
2192  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2193  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2194  llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
2195  Address LocalReduceList(
2196  Bld.CreatePointerBitCastOrAddrSpaceCast(
2197  CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2198  C.VoidPtrTy, Loc),
2199  ElemTy->getPointerTo()),
2200  ElemTy, CGF.getPointerAlign());
2201  QualType StaticTy = C.getRecordType(TeamReductionRec);
2202  llvm::Type *LLVMReductionsBufferTy =
2203  CGM.getTypes().ConvertTypeForMem(StaticTy);
2204  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2205  CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2206  LLVMReductionsBufferTy->getPointerTo());
2207  llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2208  /*Volatile=*/false, C.IntTy,
2209  Loc)};
2210  unsigned Idx = 0;
2211  for (const Expr *Private : Privates) {
2212  // Reduce element = LocalReduceList[i]
2213  Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
2214  llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
2215  ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
2216  // elemptr = ((CopyType*)(elemptrptr)) + I
2217  ElemTy = CGF.ConvertTypeForMem(Private->getType());
2218  ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2219  ElemPtrPtr, ElemTy->getPointerTo());
2220  Address ElemPtr =
2221  Address(ElemPtrPtr, ElemTy, C.getTypeAlignInChars(Private->getType()));
2222  const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
2223  // Global = Buffer.VD[Idx];
2224  const FieldDecl *FD = VarFieldMap.lookup(VD);
2225  llvm::Value *BufferPtr =
2226  Bld.CreateInBoundsGEP(LLVMReductionsBufferTy, BufferArrPtr, Idxs);
2227  LValue GlobLVal = CGF.EmitLValueForField(
2228  CGF.MakeNaturalAlignRawAddrLValue(BufferPtr, StaticTy), FD);
2229  Address GlobAddr = GlobLVal.getAddress();
2230  GlobLVal.setAddress(Address(GlobAddr.emitRawPointer(CGF),
2231  CGF.ConvertTypeForMem(Private->getType()),
2232  GlobAddr.getAlignment()));
2233  switch (CGF.getEvaluationKind(Private->getType())) {
2234  case TEK_Scalar: {
2235  llvm::Value *V = CGF.EmitLoadOfScalar(
2236  ElemPtr, /*Volatile=*/false, Private->getType(), Loc,
2237  LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
2238  CGF.EmitStoreOfScalar(V, GlobLVal);
2239  break;
2240  }
2241  case TEK_Complex: {
2242  CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(
2243  CGF.MakeAddrLValue(ElemPtr, Private->getType()), Loc);
2244  CGF.EmitStoreOfComplex(V, GlobLVal, /*isInit=*/false);
2245  break;
2246  }
2247  case TEK_Aggregate:
2248  CGF.EmitAggregateCopy(GlobLVal,
2249  CGF.MakeAddrLValue(ElemPtr, Private->getType()),
2250  Private->getType(), AggValueSlot::DoesNotOverlap);
2251  break;
2252  }
2253  ++Idx;
2254  }
2255 
2256  CGF.FinishFunction();
2257  return Fn;
2258 }
2259 
2260 /// This function emits a helper that reduces all the reduction variables from
2261 /// the team into the provided global buffer for the reduction variables.
2262 ///
2263 /// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
2264 /// void *GlobPtrs[];
2265 /// GlobPtrs[0] = (void*)&buffer.D0[Idx];
2266 /// ...
2267 /// GlobPtrs[N] = (void*)&buffer.DN[Idx];
2268 /// reduce_function(GlobPtrs, reduce_data);
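/// For the running foo/bar example this boils down to (sketch):
///
///   void *GlobPtrs[2] = {&buffer[Idx].foo, &buffer[Idx].bar};
///   reduce_function(GlobPtrs, reduce_data); // the buffer slot accumulates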
2269 static llvm::Value *emitListToGlobalReduceFunction(
2270  CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2271  QualType ReductionArrayTy, SourceLocation Loc,
2272  const RecordDecl *TeamReductionRec,
2273  const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2274  &VarFieldMap,
2275  llvm::Function *ReduceFn) {
2276  ASTContext &C = CGM.getContext();
2277 
2278  // Buffer: global reduction buffer.
2279  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2280  C.VoidPtrTy, ImplicitParamKind::Other);
2281  // Idx: index of the buffer.
2282  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2283  ImplicitParamKind::Other);
2284  // ReduceList: thread local Reduce list.
2285  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2286  C.VoidPtrTy, ImplicitParamKind::Other);
2287  FunctionArgList Args;
2288  Args.push_back(&BufferArg);
2289  Args.push_back(&IdxArg);
2290  Args.push_back(&ReduceListArg);
2291 
2292  const CGFunctionInfo &CGFI =
2293  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2294  auto *Fn = llvm::Function::Create(
2295  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2296  "_omp_reduction_list_to_global_reduce_func", &CGM.getModule());
2297  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2298  Fn->setDoesNotRecurse();
2299  CodeGenFunction CGF(CGM);
2300  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2301 
2302  CGBuilderTy &Bld = CGF.Builder;
2303 
2304  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2305  QualType StaticTy = C.getRecordType(TeamReductionRec);
2306  llvm::Type *LLVMReductionsBufferTy =
2307  CGM.getTypes().ConvertTypeForMem(StaticTy);
2308  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2309  CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2310  LLVMReductionsBufferTy->getPointerTo());
2311 
2312  // 1. Build a list of reduction variables.
2313  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2314  RawAddress ReductionList =
2315  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
2316  auto IPriv = Privates.begin();
2317  llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2318  /*Volatile=*/false, C.IntTy,
2319  Loc)};
2320  unsigned Idx = 0;
2321  for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
2322  Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2323  // Global = Buffer.VD[Idx];
2324  const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
2325  const FieldDecl *FD = VarFieldMap.lookup(VD);
2326  llvm::Value *BufferPtr =
2327  Bld.CreateInBoundsGEP(LLVMReductionsBufferTy, BufferArrPtr, Idxs);
2328  LValue GlobLVal = CGF.EmitLValueForField(
2329  CGF.MakeNaturalAlignRawAddrLValue(BufferPtr, StaticTy), FD);
2330  Address GlobAddr = GlobLVal.getAddress();
2331  CGF.EmitStoreOfScalar(GlobAddr.emitRawPointer(CGF), Elem,
2332  /*Volatile=*/false, C.VoidPtrTy);
2333  if ((*IPriv)->getType()->isVariablyModifiedType()) {
2334  // Store array size.
2335  ++Idx;
2336  Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2337  llvm::Value *Size = CGF.Builder.CreateIntCast(
2338  CGF.getVLASize(
2339  CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
2340  .NumElts,
2341  CGF.SizeTy, /*isSigned=*/false);
2342  CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
2343  Elem);
2344  }
2345  }
2346 
2347  // Call reduce_function(GlobalReduceList, ReduceList)
2348  llvm::Value *GlobalReduceList = ReductionList.getPointer();
2349  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2350  llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
2351  AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
2352  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2353  CGF, Loc, ReduceFn, {GlobalReduceList, ReducedPtr});
2354  CGF.FinishFunction();
2355  return Fn;
2356 }
2357 
2358 /// This function emits a helper that copies all the reduction variables from
2359 /// the provided global buffer back into the thread-local reduce list.
2360 ///
2361 /// void global_to_list_copy_func(void *buffer, int Idx, void *reduce_data)
2362 /// For all data entries D in reduce_data:
2363 /// Copy buffer.D[Idx] to local D;
2364 static llvm::Value *emitGlobalToListCopyFunction(
2365  CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2366  QualType ReductionArrayTy, SourceLocation Loc,
2367  const RecordDecl *TeamReductionRec,
2368  const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2369  &VarFieldMap) {
2370  ASTContext &C = CGM.getContext();
2371 
2372  // Buffer: global reduction buffer.
2373  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2374  C.VoidPtrTy, ImplicitParamKind::Other);
2375  // Idx: index of the buffer.
2376  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2377  ImplicitParamKind::Other);
2378  // ReduceList: thread local Reduce list.
2379  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2380  C.VoidPtrTy, ImplicitParamKind::Other);
2381  FunctionArgList Args;
2382  Args.push_back(&BufferArg);
2383  Args.push_back(&IdxArg);
2384  Args.push_back(&ReduceListArg);
2385 
2386  const CGFunctionInfo &CGFI =
2387  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2388  auto *Fn = llvm::Function::Create(
2389  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2390  "_omp_reduction_global_to_list_copy_func", &CGM.getModule());
2391  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2392  Fn->setDoesNotRecurse();
2393  CodeGenFunction CGF(CGM);
2394  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2395 
2396  CGBuilderTy &Bld = CGF.Builder;
2397 
2398  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2399  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2400  llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
2401  Address LocalReduceList(
2402  Bld.CreatePointerBitCastOrAddrSpaceCast(
2403  CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2404  C.VoidPtrTy, Loc),
2405  ElemTy->getPointerTo()),
2406  ElemTy, CGF.getPointerAlign());
2407  QualType StaticTy = C.getRecordType(TeamReductionRec);
2408  llvm::Type *LLVMReductionsBufferTy =
2409  CGM.getTypes().ConvertTypeForMem(StaticTy);
2410  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2411  CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2412  LLVMReductionsBufferTy->getPointerTo());
2413 
2414  llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2415  /*Volatile=*/false, C.IntTy,
2416  Loc)};
2417  unsigned Idx = 0;
2418  for (const Expr *Private : Privates) {
2419  // Reduce element = LocalReduceList[i]
2420  Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
2421  llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
2422  ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
2423  // elemptr = ((CopyType*)(elemptrptr)) + I
2424  ElemTy = CGF.ConvertTypeForMem(Private->getType());
2425  ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2426  ElemPtrPtr, ElemTy->getPointerTo());
2427  Address ElemPtr =
2428  Address(ElemPtrPtr, ElemTy, C.getTypeAlignInChars(Private->getType()));
2429  const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
2430  // Global = Buffer.VD[Idx];
2431  const FieldDecl *FD = VarFieldMap.lookup(VD);
2432  llvm::Value *BufferPtr =
2433  Bld.CreateInBoundsGEP(LLVMReductionsBufferTy, BufferArrPtr, Idxs);
2434  LValue GlobLVal = CGF.EmitLValueForField(
2435  CGF.MakeNaturalAlignRawAddrLValue(BufferPtr, StaticTy), FD);
2436  Address GlobAddr = GlobLVal.getAddress();
2437  GlobLVal.setAddress(Address(GlobAddr.emitRawPointer(CGF),
2438  CGF.ConvertTypeForMem(Private->getType()),
2439  GlobAddr.getAlignment()));
2440  switch (CGF.getEvaluationKind(Private->getType())) {
2441  case TEK_Scalar: {
2442  llvm::Value *V = CGF.EmitLoadOfScalar(GlobLVal, Loc);
2443  CGF.EmitStoreOfScalar(V, ElemPtr, /*Volatile=*/false, Private->getType(),
2444  LValueBaseInfo(AlignmentSource::Type),
2445  TBAAAccessInfo());
2446  break;
2447  }
2448  case TEK_Complex: {
2449  CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(GlobLVal, Loc);
2450  CGF.EmitStoreOfComplex(V, CGF.MakeAddrLValue(ElemPtr, Private->getType()),
2451  /*isInit=*/false);
2452  break;
2453  }
2454  case TEK_Aggregate:
2455  CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()),
2456  GlobLVal, Private->getType(),
2457  AggValueSlot::DoesNotOverlap);
2458  break;
2459  }
2460  ++Idx;
2461  }
2462 
2463  CGF.FinishFunction();
2464  return Fn;
2465 }
2466 
2467 /// This function emits a helper that reduces the reduction variables in the
2468 /// provided global buffer into the thread-local reduce list.
2469 ///
2470 /// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
2471 /// void *GlobPtrs[];
2472 /// GlobPtrs[0] = (void*)&buffer.D0[Idx];
2473 /// ...
2474 /// GlobPtrs[N] = (void*)&buffer.DN[Idx];
2475 /// reduce_function(reduce_data, GlobPtrs);
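/// This is the mirror image of list_to_global_reduce_func: the thread-local
/// reduce list is now the accumulating operand (sketch):
///
///   void *GlobPtrs[2] = {&buffer[Idx].foo, &buffer[Idx].bar};
///   reduce_function(reduce_data, GlobPtrs); // the local copy accumulates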
2476 static llvm::Value *emitGlobalToListReduceFunction(
2477  CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2478  QualType ReductionArrayTy, SourceLocation Loc,
2479  const RecordDecl *TeamReductionRec,
2480  const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2481  &VarFieldMap,
2482  llvm::Function *ReduceFn) {
2483  ASTContext &C = CGM.getContext();
2484 
2485  // Buffer: global reduction buffer.
2486  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2487  C.VoidPtrTy, ImplicitParamKind::Other);
2488  // Idx: index of the buffer.
2489  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2490  ImplicitParamKind::Other);
2491  // ReduceList: thread local Reduce list.
2492  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2493  C.VoidPtrTy, ImplicitParamKind::Other);
2494  FunctionArgList Args;
2495  Args.push_back(&BufferArg);
2496  Args.push_back(&IdxArg);
2497  Args.push_back(&ReduceListArg);
2498 
2499  const CGFunctionInfo &CGFI =
2500  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2501  auto *Fn = llvm::Function::Create(
2502  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2503  "_omp_reduction_global_to_list_reduce_func", &CGM.getModule());
2504  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2505  Fn->setDoesNotRecurse();
2506  CodeGenFunction CGF(CGM);
2507  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2508 
2509  CGBuilderTy &Bld = CGF.Builder;
2510 
2511  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2512  QualType StaticTy = C.getRecordType(TeamReductionRec);
2513  llvm::Type *LLVMReductionsBufferTy =
2514  CGM.getTypes().ConvertTypeForMem(StaticTy);
2515  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2516  CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2517  LLVMReductionsBufferTy->getPointerTo());
2518 
2519  // 1. Build a list of reduction variables.
2520  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2521  Address ReductionList =
2522  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
2523  auto IPriv = Privates.begin();
2524  llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2525  /*Volatile=*/false, C.IntTy,
2526  Loc)};
2527  unsigned Idx = 0;
2528  for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
2529  Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2530  // Global = Buffer.VD[Idx];
2531  const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
2532  const FieldDecl *FD = VarFieldMap.lookup(VD);
2533  llvm::Value *BufferPtr =
2534  Bld.CreateInBoundsGEP(LLVMReductionsBufferTy, BufferArrPtr, Idxs);
2535  LValue GlobLVal = CGF.EmitLValueForField(
2536  CGF.MakeNaturalAlignRawAddrLValue(BufferPtr, StaticTy), FD);
2537  Address GlobAddr = GlobLVal.getAddress();
2538  CGF.EmitStoreOfScalar(GlobAddr.emitRawPointer(CGF), Elem,
2539  /*Volatile=*/false, C.VoidPtrTy);
2540  if ((*IPriv)->getType()->isVariablyModifiedType()) {
2541  // Store array size.
2542  ++Idx;
2543  Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2544  llvm::Value *Size = CGF.Builder.CreateIntCast(
2545  CGF.getVLASize(
2546  CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
2547  .NumElts,
2548  CGF.SizeTy, /*isSigned=*/false);
2549  CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
2550  Elem);
2551  }
2552  }
2553 
2554  // Call reduce_function(ReduceList, GlobalReduceList)
2555  llvm::Value *GlobalReduceList = ReductionList.emitRawPointer(CGF);
2556  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2557  llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
2558  AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
2559  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2560  CGF, Loc, ReduceFn, {ReducedPtr, GlobalReduceList});
2561  CGF.FinishFunction();
2562  return Fn;
2563 }
2564 
2565 ///
2566 /// Design of OpenMP reductions on the GPU
2567 ///
2568 /// Consider a typical OpenMP program with one or more reduction
2569 /// clauses:
2570 ///
2571 /// float foo;
2572 /// double bar;
2573 /// #pragma omp target teams distribute parallel for \
2574 /// reduction(+:foo) reduction(*:bar)
2575 /// for (int i = 0; i < N; i++) {
2576 /// foo += A[i]; bar *= B[i];
2577 /// }
2578 ///
2579 /// where 'foo' and 'bar' are reduced across all OpenMP threads in
2580 /// all teams. In our OpenMP implementation on the NVPTX device an
2581 /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
2582 /// within a team are mapped to CUDA threads within a threadblock.
2583 /// Our goal is to efficiently aggregate values across all OpenMP
2584 /// threads such that:
2585 ///
2586 /// - the compiler and runtime are logically concise, and
2587 /// - the reduction is performed efficiently in a hierarchical
2588 /// manner as follows: within OpenMP threads in the same warp,
2589 /// across warps in a threadblock, and finally across teams on
2590 /// the NVPTX device.
2591 ///
2592 /// Introduction to Decoupling
2593 ///
2594 /// We would like to decouple the compiler and the runtime so that the
2595 /// latter is ignorant of the reduction variables (number, data types)
2596 /// and the reduction operators. This allows a simpler interface
2597 /// and implementation while still attaining good performance.
2598 ///
2599 /// Pseudocode for the aforementioned OpenMP program generated by the
2600 /// compiler is as follows:
2601 ///
2602 /// 1. Create private copies of reduction variables on each OpenMP
2603 /// thread: 'foo_private', 'bar_private'
2604 /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
2605 /// to it and writes the result in 'foo_private' and 'bar_private'
2606 /// respectively.
2607 /// 3. Call the OpenMP runtime on the GPU to reduce within a team
2608 /// and store the result on the team master:
2609 ///
2610 /// __kmpc_nvptx_parallel_reduce_nowait_v2(...,
2611 /// reduceData, shuffleReduceFn, interWarpCpyFn)
2612 ///
2613 /// where:
2614 /// struct ReduceData {
2615 /// double *foo;
2616 /// double *bar;
2617 /// } reduceData
2618 /// reduceData.foo = &foo_private
2619 /// reduceData.bar = &bar_private
2620 ///
2621 /// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
2622 /// auxiliary functions generated by the compiler that operate on
2623 /// variables of type 'ReduceData'. They help the runtime perform
2624 /// algorithmic steps in a data-agnostic manner.
2625 ///
2626 /// 'shuffleReduceFn' is a pointer to a function that reduces data
2627 /// of type 'ReduceData' across two OpenMP threads (lanes) in the
2628 /// same warp. It takes the following arguments as input:
2629 ///
2630 /// a. variable of type 'ReduceData' on the calling lane,
2631 /// b. its lane_id,
2632 /// c. an offset relative to the current lane_id to generate a
2633 /// remote_lane_id. The remote lane contains the second
2634 /// variable of type 'ReduceData' that is to be reduced.
2635 /// d. an algorithm version parameter determining which reduction
2636 /// algorithm to use.
2637 ///
2638 /// 'shuffleReduceFn' retrieves data from the remote lane using
2639 /// efficient GPU shuffle intrinsics and reduces, using the
2640 /// algorithm specified by the 4th parameter, the two operands
2641 /// element-wise. The result is written to the first operand.
2642 ///
2643 /// Different reduction algorithms are implemented in different
2644 /// runtime functions, all calling 'shuffleReduceFn' to perform
2645 /// the essential reduction step. Therefore, based on the 4th
2646 /// parameter, this function behaves slightly differently to
2647 /// cooperate with the runtime to ensure correctness under
2648 /// different circumstances.
2649 ///
2650 /// 'InterWarpCpyFn' is a pointer to a function that transfers
2651 /// reduced variables across warps. It tunnels, through CUDA
2652 /// shared memory, the thread-private data of type 'ReduceData'
2653 /// from lane 0 of each warp to a lane in the first warp.
2654 /// 4. Call the OpenMP runtime on the GPU to reduce across teams.
2655 /// The last team writes the global reduced value to memory.
2656 ///
2657 /// ret = __kmpc_nvptx_teams_reduce_nowait(...,
2658 /// reduceData, shuffleReduceFn, interWarpCpyFn,
2659 /// scratchpadCopyFn, loadAndReduceFn)
2660 ///
2661 /// 'scratchpadCopyFn' is a helper that stores reduced
2662 /// data from the team master to a scratchpad array in
2663 /// global memory.
2664 ///
2665 /// 'loadAndReduceFn' is a helper that loads data from
2666 /// the scratchpad array and reduces it with the input
2667 /// operand.
2668 ///
2669 /// These compiler generated functions hide address
2670 /// calculation and alignment information from the runtime.
2671 /// 5. if ret == 1:
2672 /// The team master of the last team stores the reduced
2673 /// result to the globals in memory.
2674 /// foo += reduceData.foo; bar *= reduceData.bar
2675 ///
2676 ///
2677 /// Warp Reduction Algorithms
2678 ///
2679 /// On the warp level, we have three algorithms implemented in the
2680 /// OpenMP runtime depending on the number of active lanes:
2681 ///
2682 /// Full Warp Reduction
2683 ///
2684 /// The reduce algorithm within a warp where all lanes are active
2685 /// is implemented in the runtime as follows:
2686 ///
2687 /// full_warp_reduce(void *reduce_data,
2688 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
2689 /// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
2690 /// ShuffleReduceFn(reduce_data, 0, offset, 0);
2691 /// }
2692 ///
2693 /// The algorithm completes in log(2, WARPSIZE) steps.
2694 ///
2695 /// 'ShuffleReduceFn' is called here with lane_id set to 0 because lane_id
2696 /// is not used; we therefore save instructions by not retrieving it
2697 /// from the corresponding special registers. The 4th parameter, which
2698 /// represents the version of the algorithm being used, is set to 0 to
2699 /// signify full warp reduction.
2700 ///
2701 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2702 ///
2703 /// #reduce_elem refers to an element in the local lane's data structure
2704 /// #remote_elem is retrieved from a remote lane
2705 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2706 /// reduce_elem = reduce_elem REDUCE_OP remote_elem;
2707 ///
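/// On NVPTX, shuffle_down in these sketches corresponds to the warp shuffle
/// intrinsic; per 32-bit element the step is roughly (CUDA sketch, assuming
/// all lanes are active):
///
///   int remote = __shfl_down_sync(0xffffffffu, elem, offset);
///   elem = elem REDUCE_OP remote;
///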
2708 /// Contiguous Partial Warp Reduction
2709 ///
2710 /// This reduce algorithm is used within a warp where only the first
2711 /// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
2712 /// number of OpenMP threads in a parallel region is not a multiple of
2713 /// WARPSIZE. The algorithm is implemented in the runtime as follows:
2714 ///
2715 /// void
2716 /// contiguous_partial_reduce(void *reduce_data,
2717 /// kmp_ShuffleReductFctPtr ShuffleReduceFn,
2718 /// int size, int lane_id) {
2719 /// int curr_size;
2720 /// int offset;
2721 /// curr_size = size;
2722 /// offset = curr_size/2;
2723 /// while (offset > 0) {
2724 /// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
2725 /// curr_size = (curr_size+1)/2;
2726 /// offset = curr_size/2;
2727 /// }
2728 /// }
2729 ///
2730 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2731 ///
2732 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2733 /// if (lane_id < offset)
2734 /// reduce_elem = reduce_elem REDUCE_OP remote_elem
2735 /// else
2736 /// reduce_elem = remote_elem
2737 ///
2738 /// This algorithm assumes that the data to be reduced are located in a
2739 /// contiguous subset of lanes starting from the first. When there is
2740 /// an odd number of active lanes, the data in the last lane is not
2741 /// aggregated with any other lane's data but is instead copied over.
2742 ///
2743 /// Dispersed Partial Warp Reduction
2744 ///
2745 /// This algorithm is used within a warp when any discontiguous subset of
2746 /// lanes are active. It is used to implement the reduction operation
2747 /// across lanes in an OpenMP simd region or in a nested parallel region.
2748 ///
2749 /// void
2750 /// dispersed_partial_reduce(void *reduce_data,
2751 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
2752 /// int size, remote_id;
2753 /// int logical_lane_id = number_of_active_lanes_before_me() * 2;
2754 /// do {
2755 /// remote_id = next_active_lane_id_right_after_me();
2756 ///     # the above function returns 0 if no active lane
2757 /// # is present right after the current lane.
2758 /// size = number_of_active_lanes_in_this_warp();
2759 /// logical_lane_id /= 2;
2760 /// ShuffleReduceFn(reduce_data, logical_lane_id,
2761 /// remote_id-1-threadIdx.x, 2);
2762 /// } while (logical_lane_id % 2 == 0 && size > 1);
2763 /// }
2764 ///
2765 /// There is no assumption made about the initial state of the reduction.
2766 /// Any number of lanes (>=1) could be active at any position. The reduction
2767 /// result is returned in the first active lane.
2768 ///
2769 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2770 ///
2771 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2772 /// if (lane_id % 2 == 0 && offset > 0)
2773 /// reduce_elem = reduce_elem REDUCE_OP remote_elem
2774 /// else
2775 /// reduce_elem = remote_elem
2776 ///
2777 ///
2778 /// Intra-Team Reduction
2779 ///
2780 /// This function, as implemented in the runtime call
2781 /// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
2782 /// threads in a team. It first reduces within a warp using the
2783 /// aforementioned algorithms. We then proceed to gather all such
2784 /// reduced values at the first warp.
2785 ///
2786 /// The runtime makes use of the function 'InterWarpCpyFn', which copies
2787 /// data from each of the "warp masters" (the zeroth lane of each warp, where
2788 /// warp-reduced data is held) to the zeroth warp. This step reduces (in
2789 /// a mathematical sense) the problem of reduction across warp masters in
2790 /// a block to the problem of warp reduction.
2791 ///
2792 ///
2793 /// Inter-Team Reduction
2794 ///
2795 /// Once a team has reduced its data to a single value, it is stored in
2796 /// a global scratchpad array. Since each team has a distinct slot, this
2797 /// can be done without locking.
2798 ///
2799 /// The last team to write to the scratchpad array proceeds to reduce the
2800 /// scratchpad array. One or more workers in the last team use the helper
2801 /// 'loadAndReduceFn' to load and reduce values from the array, i.e.,
2802 /// the k'th worker reduces every k'th element.
2803 ///
2804 /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
2805 /// reduce across workers and compute a globally reduced value.
2806 ///
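/// Putting the pieces together for the foo/bar example, the code emitted by
/// emitReduction below is shaped roughly like this (editor's sketch; names
/// abbreviated, parallel-reduction case only):
///
///   void *RedList[2] = {&foo_private, &bar_private};
///   int ret = __kmpc_nvptx_parallel_reduce_nowait_v2(
///       loc, reduce_data_size, RedList, shuffleReduceFn, interWarpCpyFn);
///   if (ret == 1) {
///     foo += foo_private;
///     bar *= bar_private;
///   }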
2807 void CGOpenMPRuntimeGPU::emitReduction(
2808  CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
2809  ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
2810  ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
2811  if (!CGF.HaveInsertPoint())
2812  return;
2813 
2814  bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
2815 #ifndef NDEBUG
2816  bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
2817 #endif
2818 
2819  if (Options.SimpleReduction) {
2820  assert(!TeamsReduction && !ParallelReduction &&
2821  "Invalid reduction selection in emitReduction.");
2822  CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
2823  ReductionOps, Options);
2824  return;
2825  }
2826 
2827  assert((TeamsReduction || ParallelReduction) &&
2828  "Invalid reduction selection in emitReduction.");
2829 
2830  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
2831  llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
2832  int Cnt = 0;
2833  for (const Expr *DRE : Privates) {
2834  PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
2835  ++Cnt;
2836  }
2837 
2838  ASTContext &C = CGM.getContext();
2839  const RecordDecl *ReductionRec = ::buildRecordForGlobalizedVars(
2840  CGM.getContext(), PrivatesReductions, std::nullopt, VarFieldMap, 1);
2841 
2842  // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
2843  // RedList, shuffle_reduce_func, interwarp_copy_func);
2844  // or
2845  // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
2846  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2847 
2848  llvm::Value *Res;
2849  // 1. Build a list of reduction variables.
2850  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2851  auto Size = RHSExprs.size();
2852  for (const Expr *E : Privates) {
2853  if (E->getType()->isVariablyModifiedType())
2854  // Reserve place for array size.
2855  ++Size;
2856  }
2857  llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
2858  QualType ReductionArrayTy = C.getConstantArrayType(
2859  C.VoidPtrTy, ArraySize, nullptr, ArraySizeModifier::Normal,
2860  /*IndexTypeQuals=*/0);
2861  Address ReductionList =
2862  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
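  // E.g. (sketch) for 'reduction(+ : a) reduction(* : b)' where 'b' is a
  // VLA of n elements, the list built below is {&a_rhs, &b_rhs, (void *)n};
  // the extra slot carries the VLA element count.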
2863  auto IPriv = Privates.begin();
2864  unsigned Idx = 0;
2865  for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
2866  Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2867  CGF.Builder.CreateStore(
2868  CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2869  CGF.EmitLValue(RHSExprs[I]).getPointer(CGF), CGF.VoidPtrTy),
2870  Elem);
2871  if ((*IPriv)->getType()->isVariablyModifiedType()) {
2872  // Store array size.
2873  ++Idx;
2874  Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2875  llvm::Value *Size = CGF.Builder.CreateIntCast(
2876  CGF.getVLASize(
2877  CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
2878  .NumElts,
2879  CGF.SizeTy, /*isSigned=*/false);
2880  CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
2881  Elem);
2882  }
2883  }
2884 
2885  llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2886  ReductionList.emitRawPointer(CGF), CGF.VoidPtrTy);
2887  llvm::Function *ReductionFn = emitReductionFunction(
2888  CGF.CurFn->getName(), Loc, CGF.ConvertTypeForMem(ReductionArrayTy),
2889  Privates, LHSExprs, RHSExprs, ReductionOps);
2890  llvm::Value *ReductionDataSize =
2891  CGF.getTypeSize(C.getRecordType(ReductionRec));
2892  ReductionDataSize =
2893  CGF.Builder.CreateSExtOrTrunc(ReductionDataSize, CGF.Int64Ty);
2894  llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
2895  CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
2896  llvm::Value *InterWarpCopyFn =
2897  emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
2898 
2899  if (ParallelReduction) {
2900  llvm::Value *Args[] = {RTLoc, ReductionDataSize, RL, ShuffleAndReduceFn,
2901  InterWarpCopyFn};
2902 
2903  Res = CGF.EmitRuntimeCall(
2904  OMPBuilder.getOrCreateRuntimeFunction(
2905  CGM.getModule(), OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2),
2906  Args);
2907  } else {
2908  assert(TeamsReduction && "expected teams reduction.");
2909  TeamsReductions.push_back(ReductionRec);
2910  auto *KernelTeamsReductionPtr = CGF.EmitRuntimeCall(
2911  OMPBuilder.getOrCreateRuntimeFunction(
2912  CGM.getModule(), OMPRTL___kmpc_reduction_get_fixed_buffer),
2913  {}, "_openmp_teams_reductions_buffer_$_$ptr");
2914  llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
2915  CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap);
2916  llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
2917  CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap,
2918  ReductionFn);
2919  llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction(
2920  CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap);
2921  llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction(
2922  CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap,
2923  ReductionFn);
2924 
2925  llvm::Value *Args[] = {
2926  RTLoc,
2927  KernelTeamsReductionPtr,
2928  CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
2929  ReductionDataSize,
2930  RL,
2931  ShuffleAndReduceFn,
2932  InterWarpCopyFn,
2933  GlobalToBufferCpyFn,
2934  GlobalToBufferRedFn,
2935  BufferToGlobalCpyFn,
2936  BufferToGlobalRedFn};
2937 
2938  Res = CGF.EmitRuntimeCall(
2939  OMPBuilder.getOrCreateRuntimeFunction(
2940  CGM.getModule(), OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2),
2941  Args);
2942  }
2943 
2944  // 5. Build if (res == 1)
2945  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".omp.reduction.done");
2946  llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".omp.reduction.then");
2947  llvm::Value *Cond = CGF.Builder.CreateICmpEQ(
2948  Res, llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1));
2949  CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB);
2950 
2951  // 6. Build then branch: where we have reduced values in the master
2952  // thread in each team.
2953  // __kmpc_end_reduce{_nowait}(<gtid>);
2954  // break;
2955  CGF.EmitBlock(ThenBB);
2956 
2957  // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
2958  auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
2959  this](CodeGenFunction &CGF, PrePostActionTy &Action) {
2960  auto IPriv = Privates.begin();
2961  auto ILHS = LHSExprs.begin();
2962  auto IRHS = RHSExprs.begin();
2963  for (const Expr *E : ReductionOps) {
2964  emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
2965  cast<DeclRefExpr>(*IRHS));
2966  ++IPriv;
2967  ++ILHS;
2968  ++IRHS;
2969  }
2970  };
2971  RegionCodeGenTy RCG(CodeGen);
2972  RCG(CGF);
2973  // There is no need to emit line number for unconditional branch.
2974  (void)ApplyDebugLocation::CreateEmpty(CGF);
2975  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
2976 }
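//
// Illustration (a hedged sketch with hypothetical names, not the exact
// emitted IR): for a directive such as
//
//   #pragma omp target parallel for reduction(+ : sum)
//
// the ParallelReduction branch above emits, roughly,
//
//   void *RedList[1] = {&sum_priv};
//   int Res = __kmpc_nvptx_parallel_reduce_nowait_v2(
//       &Loc, /*ReductionDataSize=*/sizeof(sum), RedList, ShuffleAndReduceFn,
//       InterWarpCopyFn);
//   if (Res == 1)
//     sum += sum_priv;  // combiner emitted by the CodeGen lambda above
//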
2977 
2978 const VarDecl *
2979 CGOpenMPRuntimeGPU::translateParameter(const FieldDecl *FD,
2980  const VarDecl *NativeParam) const {
2981  if (!NativeParam->getType()->isReferenceType())
2982  return NativeParam;
2983  QualType ArgType = NativeParam->getType();
2984  QualifierCollector QC;
2985  const Type *NonQualTy = QC.strip(ArgType);
2986  QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
2987  if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
2988  if (Attr->getCaptureKind() == OMPC_map) {
2989  PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
2990  LangAS::opencl_global);
2991  }
2992  }
2993  ArgType = CGM.getContext().getPointerType(PointeeTy);
2994  QC.addRestrict();
2995  enum { NVPTX_local_addr = 5 };
2996  QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
2997  ArgType = QC.apply(CGM.getContext(), ArgType);
2998  if (isa<ImplicitParamDecl>(NativeParam))
2999  return ImplicitParamDecl::Create(
3000  CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
3001  NativeParam->getIdentifier(), ArgType, ImplicitParamKind::Other);
3002  return ParmVarDecl::Create(
3003  CGM.getContext(),
3004  const_cast<DeclContext *>(NativeParam->getDeclContext()),
3005  NativeParam->getBeginLoc(), NativeParam->getLocation(),
3006  NativeParam->getIdentifier(), ArgType,
3007  /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
3008 }
3009 
3010 Address
3011 CGOpenMPRuntimeGPU::getParameterAddress(CodeGenFunction &CGF,
3012  const VarDecl *NativeParam,
3013  const VarDecl *TargetParam) const {
3014  assert(NativeParam != TargetParam &&
3015  NativeParam->getType()->isReferenceType() &&
3016  "Native arg must not be the same as target arg.");
3017  Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
3018  QualType NativeParamType = NativeParam->getType();
3019  QualifierCollector QC;
3020  const Type *NonQualTy = QC.strip(NativeParamType);
3021  QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
3022  unsigned NativePointeeAddrSpace =
3023  CGF.getTypes().getTargetAddressSpace(NativePointeeTy);
3024  QualType TargetTy = TargetParam->getType();
3025  llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(LocalAddr, /*Volatile=*/false,
3026  TargetTy, SourceLocation());
3027  // Cast to native address space.
3028  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3029  TargetAddr,
3030  llvm::PointerType::get(CGF.getLLVMContext(), NativePointeeAddrSpace));
3031  Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
3032  CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
3033  NativeParamType);
3034  return NativeParamAddr;
3035 }
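//
// Illustration (hedged sketch, hypothetical parameter): a native parameter
// 'int &X' whose field carries an OMPC_map capture kind is translated above
// into a target parameter of roughly type 'int * __restrict' qualified with
// the NVPTX local address space; getParameterAddress then loads that target
// pointer, addrspacecasts it to the native pointee address space, and stores
// it into a fresh temporary that stands in for the native reference.
//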
3036 
3037 void CGOpenMPRuntimeGPU::emitOutlinedFunctionCall(
3038  CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
3039  ArrayRef<llvm::Value *> Args) const {
3040  SmallVector<llvm::Value *, 4> TargetArgs;
3041  TargetArgs.reserve(Args.size());
3042  auto *FnType = OutlinedFn.getFunctionType();
3043  for (unsigned I = 0, E = Args.size(); I < E; ++I) {
3044  if (FnType->isVarArg() && FnType->getNumParams() <= I) {
3045  TargetArgs.append(std::next(Args.begin(), I), Args.end());
3046  break;
3047  }
3048  llvm::Type *TargetType = FnType->getParamType(I);
3049  llvm::Value *NativeArg = Args[I];
3050  if (!TargetType->isPointerTy()) {
3051  TargetArgs.emplace_back(NativeArg);
3052  continue;
3053  }
3054  TargetArgs.emplace_back(
3055  CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(NativeArg, TargetType));
3056  }
3057  CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
3058 }
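//
// In effect: pointer arguments are addrspacecast to the outlined function's
// parameter types (e.g. an allocation in a non-generic address space passed
// where a generic pointer is expected), non-pointer arguments pass through
// unchanged, and arguments beyond the formal parameters of a vararg callee
// are appended verbatim.
//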
3059 
3060 /// Emit function which wraps the outlined parallel region
3061 /// and controls the arguments which are passed to this function.
3062 /// The wrapper ensures that the outlined function is called
3063 /// with the correct arguments when data is shared.
3064 llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
3065  llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
3066  ASTContext &Ctx = CGM.getContext();
3067  const auto &CS = *D.getCapturedStmt(OMPD_parallel);
3068 
3069  // Create a function that takes as argument the source thread.
3070  FunctionArgList WrapperArgs;
3071  QualType Int16QTy =
3072  Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
3073  QualType Int32QTy =
3074  Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
3075  ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
3076  /*Id=*/nullptr, Int16QTy,
3077  ImplicitParamKind::Other);
3078  ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
3079  /*Id=*/nullptr, Int32QTy,
3080  ImplicitParamKind::Other);
3081  WrapperArgs.emplace_back(&ParallelLevelArg);
3082  WrapperArgs.emplace_back(&WrapperArg);
3083 
3084  const CGFunctionInfo &CGFI =
3085  CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);
3086 
3087  auto *Fn = llvm::Function::Create(
3088  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3089  Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());
3090 
3091  // Ensure we do not inline the function. This is trivially true for the ones
3092 // passed to __kmpc_fork_call but the ones called in serialized regions
3093 // could be inlined. This is not perfect but it is closer to the invariant
3094  // we want, namely, every data environment starts with a new function.
3095  // TODO: We should pass the if condition to the runtime function and do the
3096  // handling there. Much cleaner code.
3097  Fn->addFnAttr(llvm::Attribute::NoInline);
3098 
3099  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3100  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
3101  Fn->setDoesNotRecurse();
3102 
3103  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
3104  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
3105  D.getBeginLoc(), D.getBeginLoc());
3106 
3107  const auto *RD = CS.getCapturedRecordDecl();
3108  auto CurField = RD->field_begin();
3109 
3110  Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
3111  /*Name=*/".zero.addr");
3112  CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
3113  // Get the array of arguments.
3114  SmallVector<llvm::Value *, 8> Args;
3115 
3116  Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).emitRawPointer(CGF));
3117  Args.emplace_back(ZeroAddr.emitRawPointer(CGF));
3118 
3119  CGBuilderTy &Bld = CGF.Builder;
3120  auto CI = CS.capture_begin();
3121 
3122  // Use global memory for data sharing.
3123  // Handle passing of global args to workers.
3124  RawAddress GlobalArgs =
3125  CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
3126  llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
3127  llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
3128  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
3129  CGM.getModule(), OMPRTL___kmpc_get_shared_variables),
3130  DataSharingArgs);
3131 
3132  // Retrieve the shared variables from the list of references returned
3133  // by the runtime. Pass the variables to the outlined function.
3134  Address SharedArgListAddress = Address::invalid();
3135  if (CS.capture_size() > 0 ||
3136  isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
3137  SharedArgListAddress = CGF.EmitLoadOfPointer(
3138  GlobalArgs, CGF.getContext()
3139  .getPointerType(CGF.getContext().VoidPtrTy)
3140  .castAs<PointerType>());
3141  }
3142  unsigned Idx = 0;
3143  if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
3144  Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
3145  Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3146  Src, CGF.SizeTy->getPointerTo(), CGF.SizeTy);
3147  llvm::Value *LB = CGF.EmitLoadOfScalar(
3148  TypedAddress,
3149  /*Volatile=*/false,
3150  CGF.getContext().getSizeType(),
3151  cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
3152  Args.emplace_back(LB);
3153  ++Idx;
3154  Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
3155  TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3156  Src, CGF.SizeTy->getPointerTo(), CGF.SizeTy);
3157  llvm::Value *UB = CGF.EmitLoadOfScalar(
3158  TypedAddress,
3159  /*Volatile=*/false,
3160  CGF.getContext().getSizeType(),
3161  cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
3162  Args.emplace_back(UB);
3163  ++Idx;
3164  }
3165  if (CS.capture_size() > 0) {
3166  ASTContext &CGFContext = CGF.getContext();
3167  for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
3168  QualType ElemTy = CurField->getType();
3169  Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx);
3170  Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3171  Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)),
3172  CGF.ConvertTypeForMem(ElemTy));
3173  llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
3174  /*Volatile=*/false,
3175  CGFContext.getPointerType(ElemTy),
3176  CI->getLocation());
3177  if (CI->capturesVariableByCopy() &&
3178  !CI->getCapturedVar()->getType()->isAnyPointerType()) {
3179  Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
3180  CI->getLocation());
3181  }
3182  Args.emplace_back(Arg);
3183  }
3184  }
3185 
3186  emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
3187  CGF.FinishFunction();
3188  return Fn;
3189 }
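//
// Illustration (hedged sketch, hypothetical names): for an outlined region
// with two shared captures, the wrapper emitted above behaves like
//
//   static void outlined_fn_wrapper(uint16_t ParallelLevel, uint32_t TID) {
//     int32_t Zero = 0;
//     void **GlobalArgs;
//     __kmpc_get_shared_variables(&GlobalArgs);
//     // loop bounds first (if loop-bound sharing), then each capture
//     outlined_fn(&TID, &Zero, GlobalArgs[0], GlobalArgs[1]);
//   }
//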
3190 
3191 void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF,
3192  const Decl *D) {
3193  if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
3194  return;
3195 
3196  assert(D && "Expected function or captured|block decl.");
3197  assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
3198  "Function is registered already.");
3199  assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&
3200  "Team is set but not processed.");
3201  const Stmt *Body = nullptr;
3202  bool NeedToDelayGlobalization = false;
3203  if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
3204  Body = FD->getBody();
3205  } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
3206  Body = BD->getBody();
3207  } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
3208  Body = CD->getBody();
3209  NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
3210  if (NeedToDelayGlobalization &&
3211  getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
3212  return;
3213  }
3214  if (!Body)
3215  return;
3216  CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
3217  VarChecker.Visit(Body);
3218  const RecordDecl *GlobalizedVarsRecord =
3219  VarChecker.getGlobalizedRecord(IsInTTDRegion);
3220  TeamAndReductions.first = nullptr;
3221  TeamAndReductions.second.clear();
3222  ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
3223  VarChecker.getEscapedVariableLengthDecls();
3224  ArrayRef<const ValueDecl *> DelayedVariableLengthDecls =
3225  VarChecker.getDelayedVariableLengthDecls();
3226  if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty() &&
3227  DelayedVariableLengthDecls.empty())
3228  return;
3229  auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
3230  I->getSecond().MappedParams =
3231  std::make_unique<CodeGenFunction::OMPMapVars>();
3232  I->getSecond().EscapedParameters.insert(
3233  VarChecker.getEscapedParameters().begin(),
3234  VarChecker.getEscapedParameters().end());
3235  I->getSecond().EscapedVariableLengthDecls.append(
3236  EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
3237  I->getSecond().DelayedVariableLengthDecls.append(
3238  DelayedVariableLengthDecls.begin(), DelayedVariableLengthDecls.end());
3239  DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
3240  for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
3241  assert(VD->isCanonicalDecl() && "Expected canonical declaration");
3242  Data.insert(std::make_pair(VD, MappedVarData()));
3243  }
3244  if (!NeedToDelayGlobalization) {
3245  emitGenericVarsProlog(CGF, D->getBeginLoc());
3246  struct GlobalizationScope final : EHScopeStack::Cleanup {
3247  GlobalizationScope() = default;
3248 
3249  void Emit(CodeGenFunction &CGF, Flags flags) override {
3250  static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
3251  .emitGenericVarsEpilog(CGF);
3252  }
3253  };
3254  CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
3255  }
3256 }
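//
// The prolog above only records which declarations escape; the storage
// change itself is emitted by emitGenericVarsProlog and the epilog cleanup
// pushed above. Hedged sketch (hypothetical variable, generic mode):
//
//   int X;                          // escapes into a parallel region
//   #pragma omp parallel shared(X)
//
// is lowered so X lives in runtime-managed shared storage:
//
//   void *XPtr = __kmpc_alloc_shared(sizeof(int));   // prolog
//   ... uses of X go through XPtr ...
//   __kmpc_free_shared(XPtr, sizeof(int));           // epilog cleanup
//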
3257 
3258 Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF,
3259  const VarDecl *VD) {
3260  if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
3261  const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
3262  auto AS = LangAS::Default;
3263  switch (A->getAllocatorType()) {
3264  // Use the default allocator here as by default local vars are
3265  // threadlocal.
3266  case OMPAllocateDeclAttr::OMPNullMemAlloc:
3267  case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
3268  case OMPAllocateDeclAttr::OMPThreadMemAlloc:
3269  case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
3270  case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
3271  // Follow the user decision - use default allocation.
3272  return Address::invalid();
3273  case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
3274  // TODO: implement support for user-defined allocators.
3275  return Address::invalid();
3276  case OMPAllocateDeclAttr::OMPConstMemAlloc:
3277  AS = LangAS::cuda_constant;
3278  break;
3279  case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
3280  AS = LangAS::cuda_shared;
3281  break;
3282  case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
3283  case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
3284  break;
3285  }
3286  llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
3287  auto *GV = new llvm::GlobalVariable(
3288  CGM.getModule(), VarTy, /*isConstant=*/false,
3289  llvm::GlobalValue::InternalLinkage, llvm::PoisonValue::get(VarTy),
3290  VD->getName(),
3291  /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
3292  CGM.getContext().getTargetAddressSpace(AS));
3293  CharUnits Align = CGM.getContext().getDeclAlign(VD);
3294  GV->setAlignment(Align.getAsAlign());
3295  return Address(
3296  CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3297  GV, VarTy->getPointerTo(CGM.getContext().getTargetAddressSpace(
3298  VD->getType().getAddressSpace()))),
3299  VarTy, Align);
3300  }
3301 
3302  if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
3303  return Address::invalid();
3304 
3305  VD = VD->getCanonicalDecl();
3306  auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
3307  if (I == FunctionGlobalizedDecls.end())
3308  return Address::invalid();
3309  auto VDI = I->getSecond().LocalVarData.find(VD);
3310  if (VDI != I->getSecond().LocalVarData.end())
3311  return VDI->second.PrivateAddr;
3312  if (VD->hasAttrs()) {
3313  for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
3314  E(VD->attr_end());
3315  IT != E; ++IT) {
3316  auto VDI = I->getSecond().LocalVarData.find(
3317  cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
3318  ->getCanonicalDecl());
3319  if (VDI != I->getSecond().LocalVarData.end())
3320  return VDI->second.PrivateAddr;
3321  }
3322  }
3323 
3324  return Address::invalid();
3325 }
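//
// Allocator-to-storage mapping above, illustrated (hypothetical locals):
//
//   int A;   // #pragma omp allocate(A) allocator(omp_const_mem_alloc)
//            //   -> internal global in LangAS::cuda_constant
//   int B;   // #pragma omp allocate(B) allocator(omp_pteam_mem_alloc)
//            //   -> internal global in LangAS::cuda_shared
//   int C;   // #pragma omp allocate(C) allocator(omp_thread_mem_alloc)
//            //   -> Address::invalid(): default thread-local allocation
//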
3326 
3327 void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) {
3328  FunctionGlobalizedDecls.erase(CGF.CurFn);
3329  CGOpenMPRuntime::functionFinished(CGF);
3330 }
3331 
3332 void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(
3333  CodeGenFunction &CGF, const OMPLoopDirective &S,
3334  OpenMPDistScheduleClauseKind &ScheduleKind,
3335  llvm::Value *&Chunk) const {
3336  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
3337  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
3338  ScheduleKind = OMPC_DIST_SCHEDULE_static;
3339  Chunk = CGF.EmitScalarConversion(
3340  RT.getGPUNumThreads(CGF),
3341  CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
3342  S.getIterationVariable()->getType(), S.getBeginLoc());
3343  return;
3344  }
3345  CGOpenMPRuntime::getDefaultDistScheduleAndChunk(
3346  CGF, S, ScheduleKind, Chunk);
3347 }
3348 
3349 void CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk(
3350  CodeGenFunction &CGF, const OMPLoopDirective &S,
3351  OpenMPScheduleClauseKind &ScheduleKind,
3352  const Expr *&ChunkExpr) const {
3353  ScheduleKind = OMPC_SCHEDULE_static;
3354  // Chunk size is 1 in this case.
3355  llvm::APInt ChunkSize(32, 1);
3356  ChunkExpr = IntegerLiteral::Create(CGF.getContext(), ChunkSize,
3357  CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
3358  SourceLocation());
3359 }
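//
// Taken together with getDefaultDistScheduleAndChunk above: in SPMD mode a
// 'distribute parallel for' without explicit schedule clauses is treated
// roughly as if written
//
//   #pragma omp ... dist_schedule(static, <threads-per-block>) \
//                   schedule(static, 1)
//
// so each GPU thread handles one iteration of each distribute chunk.
//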
3360 
3361 void CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas(
3362  CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
3363  assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
3364  " Expected target-based directive.");
3365  const CapturedStmt *CS = D.getCapturedStmt(OMPD_target);
3366  for (const CapturedStmt::Capture &C : CS->captures()) {
3367  // Capture variables captured by reference in lambdas for target-based
3368  // directives.
3369  if (!C.capturesVariable())
3370  continue;
3371  const VarDecl *VD = C.getCapturedVar();
3372  const auto *RD = VD->getType()
3373  .getCanonicalType()
3374  .getNonReferenceType()
3375  ->getAsCXXRecordDecl();
3376  if (!RD || !RD->isLambda())
3377  continue;
3378  Address VDAddr = CGF.GetAddrOfLocalVar(VD);
3379  LValue VDLVal;
3380  if (VD->getType().getCanonicalType()->isReferenceType())
3381  VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType());
3382  else
3383  VDLVal = CGF.MakeAddrLValue(
3384  VDAddr, VD->getType().getCanonicalType().getNonReferenceType());
3385  llvm::DenseMap<const ValueDecl *, FieldDecl *> Captures;
3386  FieldDecl *ThisCapture = nullptr;
3387  RD->getCaptureFields(Captures, ThisCapture);
3388  if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
3389  LValue ThisLVal =
3390  CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture);
3391  llvm::Value *CXXThis = CGF.LoadCXXThis();
3392  CGF.EmitStoreOfScalar(CXXThis, ThisLVal);
3393  }
3394  for (const LambdaCapture &LC : RD->captures()) {
3395  if (LC.getCaptureKind() != LCK_ByRef)
3396  continue;
3397  const ValueDecl *VD = LC.getCapturedVar();
3398  // FIXME: For now VD is always a VarDecl because OpenMP does not support
3399  // capturing structured bindings in lambdas yet.
3400  if (!CS->capturesVariable(cast<VarDecl>(VD)))
3401  continue;
3402  auto It = Captures.find(VD);
3403  assert(It != Captures.end() && "Found lambda capture without field.");
3404  LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second);
3405  Address VDAddr = CGF.GetAddrOfLocalVar(cast<VarDecl>(VD));
3406  if (VD->getType().getCanonicalType()->isReferenceType())
3407  VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,
3408  VD->getType().getCanonicalType())
3409  .getAddress();
3410  CGF.EmitStoreOfScalar(VDAddr.emitRawPointer(CGF), VarLVal);
3411  }
3412  }
3413 }
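//
// Hedged sketch (hypothetical lambda): for
//
//   int X; auto L = [&X]() { X = 1; };
//   #pragma omp target
//   L();
//
// the by-reference capture field of L on the device is re-seeded above with
// the target-side address of X (and a captured 'this' with the target
// 'this'), so the lambda body dereferences device, not host, pointers.
//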
3414 
3415 bool CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
3416  LangAS &AS) {
3417  if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())
3418  return false;
3419  const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
3420  switch(A->getAllocatorType()) {
3421  case OMPAllocateDeclAttr::OMPNullMemAlloc:
3422  case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
3423  // Not supported, fallback to the default mem space.
3424  case OMPAllocateDeclAttr::OMPThreadMemAlloc:
3425  case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
3426  case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
3427  case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
3428  case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
3429  AS = LangAS::Default;
3430  return true;
3431  case OMPAllocateDeclAttr::OMPConstMemAlloc:
3432  AS = LangAS::cuda_constant;
3433  return true;
3434  case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
3435  AS = LangAS::cuda_shared;
3436  return true;
3437  case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
3438  llvm_unreachable("Expected predefined allocator for the variables with the "
3439  "static storage.");
3440  }
3441  return false;
3442 }
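//
// Same allocator mapping as getAddressOfLocalVariable, queried here for
// variables with static storage; e.g. a file-scope
//
//   int G;   // #pragma omp allocate(G) allocator(omp_const_mem_alloc)
//
// reports AS = LangAS::cuda_constant and returns true, while the default,
// thread, and large-capacity allocators report LangAS::Default.
//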
3443 
3444 // Get current CudaArch and ignore any unknown values
3445 static CudaArch getCudaArch(CodeGenModule &CGM) {
3446  if (!CGM.getTarget().hasFeature("ptx"))
3447  return CudaArch::UNKNOWN;
3448  for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) {
3449  if (Feature.getValue()) {
3450  CudaArch Arch = StringToCudaArch(Feature.getKey());
3451  if (Arch != CudaArch::UNKNOWN)
3452  return Arch;
3453  }
3454  }
3455  return CudaArch::UNKNOWN;
3456 }
3457 
3458 /// Check to see if target architecture supports unified addressing which is
3459 /// a restriction for OpenMP requires clause "unified_shared_memory".
3460 void CGOpenMPRuntimeGPU::processRequiresDirective(
3461  const OMPRequiresDecl *D) {
3462  for (const OMPClause *Clause : D->clauselists()) {
3463  if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
3464  CudaArch Arch = getCudaArch(CGM);
3465  switch (Arch) {
3466  case CudaArch::SM_20:
3467  case CudaArch::SM_21:
3468  case CudaArch::SM_30:
3469  case CudaArch::SM_32_:
3470  case CudaArch::SM_35:
3471  case CudaArch::SM_37:
3472  case CudaArch::SM_50:
3473  case CudaArch::SM_52:
3474  case CudaArch::SM_53: {
3475  SmallString<256> Buffer;
3476  llvm::raw_svector_ostream Out(Buffer);
3477  Out << "Target architecture " << CudaArchToString(Arch)
3478  << " does not support unified addressing";
3479  CGM.Error(Clause->getBeginLoc(), Out.str());
3480  return;
3481  }
3482  case CudaArch::SM_60:
3483  case CudaArch::SM_61:
3484  case CudaArch::SM_62:
3485  case CudaArch::SM_70:
3486  case CudaArch::SM_72:
3487  case CudaArch::SM_75:
3488  case CudaArch::SM_80:
3489  case CudaArch::SM_86:
3490  case CudaArch::SM_87:
3491  case CudaArch::SM_89:
3492  case CudaArch::SM_90:
3493  case CudaArch::SM_90a:
3494  case CudaArch::GFX600:
3495  case CudaArch::GFX601:
3496  case CudaArch::GFX602:
3497  case CudaArch::GFX700:
3498  case CudaArch::GFX701:
3499  case CudaArch::GFX702:
3500  case CudaArch::GFX703:
3501  case CudaArch::GFX704:
3502  case CudaArch::GFX705:
3503  case CudaArch::GFX801:
3504  case CudaArch::GFX802:
3505  case CudaArch::GFX803:
3506  case CudaArch::GFX805:
3507  case CudaArch::GFX810:
3508  case CudaArch::GFX900:
3509  case CudaArch::GFX902:
3510  case CudaArch::GFX904:
3511  case CudaArch::GFX906:
3512  case CudaArch::GFX908:
3513  case CudaArch::GFX909:
3514  case CudaArch::GFX90a:
3515  case CudaArch::GFX90c:
3516  case CudaArch::GFX940:
3517  case CudaArch::GFX941:
3518  case CudaArch::GFX942:
3519  case CudaArch::GFX1010:
3520  case CudaArch::GFX1011:
3521  case CudaArch::GFX1012:
3522  case CudaArch::GFX1013:
3523  case CudaArch::GFX1030:
3524  case CudaArch::GFX1031:
3525  case CudaArch::GFX1032:
3526  case CudaArch::GFX1033:
3527  case CudaArch::GFX1034:
3528  case CudaArch::GFX1035:
3529  case CudaArch::GFX1036:
3530  case CudaArch::GFX1100:
3531  case CudaArch::GFX1101:
3532  case CudaArch::GFX1102:
3533  case CudaArch::GFX1103:
3534  case CudaArch::GFX1150:
3535  case CudaArch::GFX1151:
3536  case CudaArch::GFX1200:
3537  case CudaArch::GFX1201:
3538  case CudaArch::Generic:
3539  case CudaArch::UNUSED:
3540  case CudaArch::UNKNOWN:
3541  break;
3542  case CudaArch::LAST:
3543  llvm_unreachable("Unexpected Cuda arch.");
3544  }
3545  }
3546  }
3547  CGOpenMPRuntime::processRequiresDirective(D);
3548 }
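//
// Example: a translation unit containing
//
//   #pragma omp requires unified_shared_memory
//
// compiled for sm_35 (e.g. --offload-arch=sm_35) reaches the SM_35 case
// above and gets "Target architecture sm_35 does not support unified
// addressing"; sm_60 and newer, and the listed GFX architectures, pass.
//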
3549 
3550 llvm::Value *CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction &CGF) {
3551  CGBuilderTy &Bld = CGF.Builder;
3552  llvm::Module *M = &CGF.CGM.getModule();
3553  const char *LocSize = "__kmpc_get_hardware_num_threads_in_block";
3554  llvm::Function *F = M->getFunction(LocSize);
3555  if (!F) {
3556  F = llvm::Function::Create(
3557  llvm::FunctionType::get(CGF.Int32Ty, std::nullopt, false),
3558  llvm::GlobalVariable::ExternalLinkage, LocSize, &CGF.CGM.getModule());
3559  }
3560  return Bld.CreateCall(F, std::nullopt, "nvptx_num_threads");
3561 }
3562 
3563 llvm::Value *CGOpenMPRuntimeGPU::getGPUThreadID(CodeGenFunction &CGF) {
3564  ArrayRef<llvm::Value *> Args{};
3565  return CGF.EmitRuntimeCall(
3566  OMPBuilder.getOrCreateRuntimeFunction(
3567  CGM.getModule(), OMPRTL___kmpc_get_hardware_thread_id_in_block),
3568  Args);
3569 }
3570 
3571 llvm::Value *CGOpenMPRuntimeGPU::getGPUWarpSize(CodeGenFunction &CGF) {
3572  ArrayRef<llvm::Value *> Args{};
3573  return CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
3574  CGM.getModule(), OMPRTL___kmpc_get_warp_size),
3575  Args);
3576 }
#define V(N, I)
Definition: ASTContext.h:3299
static char ID
Definition: Arena.cpp:183
static CudaArch getCudaArch(CodeGenModule &CGM)
static llvm::Value * emitGlobalToListCopyFunction(CodeGenModule &CGM, ArrayRef< const Expr * > Privates, QualType ReductionArrayTy, SourceLocation Loc, const RecordDecl *TeamReductionRec, const llvm::SmallDenseMap< const ValueDecl *, const FieldDecl * > &VarFieldMap)
This function emits a helper that copies all the reduction variables from the team into the provided ...
static llvm::Value * emitInterWarpCopyFunction(CodeGenModule &CGM, ArrayRef< const Expr * > Privates, QualType ReductionArrayTy, SourceLocation Loc)
This function emits a helper that gathers Reduce lists from the first lane of every active warp to la...
static void getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D, llvm::SmallVectorImpl< const ValueDecl * > &Vars)
Get list of reduction variables from the teams ... directives.
static void emitReductionListCopy(CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy, ArrayRef< const Expr * > Privates, Address SrcBase, Address DestBase, CopyOptionsTy CopyOptions={nullptr, nullptr, nullptr})
Emit instructions to copy a Reduce list, which contains partially aggregated values,...
static llvm::Value * getNVPTXLaneID(CodeGenFunction &CGF)
Get the id of the current lane in the Warp.
static void getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D, llvm::SmallVectorImpl< const ValueDecl * > &Vars)
Get list of lastprivate variables from the teams distribute ...
static llvm::Value * emitListToGlobalReduceFunction(CodeGenModule &CGM, ArrayRef< const Expr * > Privates, QualType ReductionArrayTy, SourceLocation Loc, const RecordDecl *TeamReductionRec, const llvm::SmallDenseMap< const ValueDecl *, const FieldDecl * > &VarFieldMap, llvm::Function *ReduceFn)
This function emits a helper that reduces all the reduction variables from the team into the provided...
static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr, Address DestAddr, QualType ElemType, llvm::Value *Offset, SourceLocation Loc)
static bool hasNestedSPMDDirective(ASTContext &Ctx, const OMPExecutableDirective &D)
Check for inner (nested) SPMD construct, if any.
static llvm::Value * getNVPTXWarpID(CodeGenFunction &CGF)
Get the id of the warp in the block.
static bool supportsSPMDExecutionMode(ASTContext &Ctx, const OMPExecutableDirective &D)
static llvm::Value * createRuntimeShuffleFunction(CodeGenFunction &CGF, llvm::Value *Elem, QualType ElemType, llvm::Value *Offset, SourceLocation Loc)
This function creates calls to one of two shuffle functions to copy variables between lanes in a warp...
static llvm::Value * castValueToType(CodeGenFunction &CGF, llvm::Value *Val, QualType ValTy, QualType CastTy, SourceLocation Loc)
Cast value to the specified type.
static llvm::Value * emitGlobalToListReduceFunction(CodeGenModule &CGM, ArrayRef< const Expr * > Privates, QualType ReductionArrayTy, SourceLocation Loc, const RecordDecl *TeamReductionRec, const llvm::SmallDenseMap< const ValueDecl *, const FieldDecl * > &VarFieldMap, llvm::Function *ReduceFn)
This function emits a helper that reduces all the reduction variables from the team into the provided...
static llvm::Function * emitShuffleAndReduceFunction(CodeGenModule &CGM, ArrayRef< const Expr * > Privates, QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc)
Emit a helper that reduces data across two OpenMP threads (lanes) in the same warp.
static llvm::Value * emitListToGlobalCopyFunction(CodeGenModule &CGM, ArrayRef< const Expr * > Privates, QualType ReductionArrayTy, SourceLocation Loc, const RecordDecl *TeamReductionRec, const llvm::SmallDenseMap< const ValueDecl *, const FieldDecl * > &VarFieldMap)
This function emits a helper that copies all the reduction variables from the team into the provided ...
This file defines OpenMP nodes for declarative directives.
unsigned Offset
Definition: Format.cpp:2978
This file defines OpenMP AST classes for clauses.
SourceLocation Loc
Definition: SemaObjC.cpp:755
static std::pair< ValueDecl *, bool > getPrivateItem(Sema &S, Expr *&RefExpr, SourceLocation &ELoc, SourceRange &ERange, bool AllowArraySection=false, StringRef DiagType="")
const char * Data
This file defines OpenMP AST classes for executable directives and clauses.
Holds long-lived AST nodes (such as types and decls) that can be referred to throughout the semantic ...
Definition: ASTContext.h:185
QualType getPointerType(QualType T) const
Return the uniqued reference to the type for a pointer to the specified type.
CanQualType VoidPtrTy
Definition: ASTContext.h:1121
QualType getUIntPtrType() const
Return a type compatible with "uintptr_t" (C99 7.18.1.4), as defined by the target.
QualType getIntTypeForBitwidth(unsigned DestWidth, unsigned Signed) const
getIntTypeForBitwidth - sets integer QualTy according to specified details: bitwidth,...
const VariableArrayType * getAsVariableArrayType(QualType T) const
Definition: ASTContext.h:2785
CanQualType getSizeType() const
Return the unique type for "size_t" (C99 7.17), defined in <stddef.h>.
CharUnits getDeclAlign(const Decl *D, bool ForAlignof=false) const
Return a conservative estimate of the alignment of the specified decl D.
int64_t toBits(CharUnits CharSize) const
Convert a size in characters to a size in bits.
CharUnits getTypeSizeInChars(QualType T) const
Return the size of the specified (complete) type T, in characters.
CanQualType VoidTy
Definition: ASTContext.h:1094
const TargetInfo & getTargetInfo() const
Definition: ASTContext.h:760
QualType getAddrSpaceQualType(QualType T, LangAS AddressSpace) const
Return the uniqued reference to the type for an address space qualified type with the specified type ...
unsigned getTargetAddressSpace(LangAS AS) const
Attr - This represents one attribute.
Definition: Attr.h:46
A class which contains all the information about a particular captured value.
Definition: Decl.h:4503
ArrayRef< Capture > captures() const
Definition: Decl.h:4624
BlockExpr - Adaptor class for mixing a BlockDecl with expressions.
Definition: Expr.h:6214
const BlockDecl * getBlockDecl() const
Definition: Expr.h:6226
CallExpr - Represents a function call (C99 6.5.2.2, C++ [expr.call]).
Definition: Expr.h:2872
arg_range arguments()
Definition: Expr.h:3111
Expr * getCallee()
Definition: Expr.h:3022
Describes the capture of either a variable, or 'this', or variable-length array type.
Definition: Stmt.h:3770
This captures a statement into a function.
Definition: Stmt.h:3757
CapturedDecl * getCapturedDecl()
Retrieve the outlined function declaration.
Definition: Stmt.cpp:1407
bool capturesVariable(const VarDecl *Var) const
True if this variable has been captured.
Definition: Stmt.cpp:1431
capture_range captures()
Definition: Stmt.h:3895
Stmt * getCapturedStmt()
Retrieve the statement being captured.
Definition: Stmt.h:3861
CastKind getCastKind() const
Definition: Expr.h:3579
Expr * getSubExpr()
Definition: Expr.h:3585
CharUnits - This is an opaque type for sizes expressed in character units.
Definition: CharUnits.h:38
bool isZero() const
isZero - Test whether the quantity equals zero.
Definition: CharUnits.h:122
llvm::Align getAsAlign() const
getAsAlign - Returns Quantity as a valid llvm::Align, Beware llvm::Align assumes power of two 8-bit b...
Definition: CharUnits.h:189
QuantityType getQuantity() const
getQuantity - Get the raw integer representation of this quantity.
Definition: CharUnits.h:185
static CharUnits fromQuantity(QuantityType Quantity)
fromQuantity - Construct a CharUnits quantity from a raw integer type.
Definition: CharUnits.h:63
Like RawAddress, an abstract representation of an aligned address, but the pointer contained in this ...
Definition: Address.h:111
static Address invalid()
Definition: Address.h:153
llvm::Value * emitRawPointer(CodeGenFunction &CGF) const
Return the pointer contained in this class after authenticating it and adding offset to it if necessa...
Definition: Address.h:220
CharUnits getAlignment() const
Definition: Address.h:166
llvm::Type * getElementType() const
Return the type of the values stored in this address.
Definition: Address.h:184
Address withElementType(llvm::Type *ElemTy) const
Return address with different element type, but same pointer and alignment.
Definition: Address.h:241
llvm::PointerType * getType() const
Return the type of the pointer value.
Definition: Address.h:176
static ApplyDebugLocation CreateEmpty(CodeGenFunction &CGF)
Set the IRBuilder to not attach debug locations.
Definition: CGDebugInfo.h:886
Address CreateGEP(CodeGenFunction &CGF, Address Addr, llvm::Value *Index, const llvm::Twine &Name="")
Definition: CGBuilder.h:292
Address CreatePointerBitCastOrAddrSpaceCast(Address Addr, llvm::Type *Ty, llvm::Type *ElementTy, const llvm::Twine &Name="")
Definition: CGBuilder.h:203
llvm::StoreInst * CreateStore(llvm::Value *Val, Address Addr, bool IsVolatile=false)
Definition: CGBuilder.h:136
Address CreateConstArrayGEP(Address Addr, uint64_t Index, const llvm::Twine &Name="")
Given addr = [n x T]* ...
Definition: CGBuilder.h:241
llvm::Value * CreateIsNull(Address Addr, const Twine &Name="")
Definition: CGBuilder.h:355
Address CreateConstGEP(Address Addr, uint64_t Index, const llvm::Twine &Name="")
Given addr = T* ...
Definition: CGBuilder.h:278
Address CreateConstInBoundsGEP(Address Addr, uint64_t Index, const llvm::Twine &Name="")
Given addr = T* ...
Definition: CGBuilder.h:261
Address CreateInBoundsGEP(Address Addr, ArrayRef< llvm::Value * > IdxList, llvm::Type *ElementType, CharUnits Align, const Twine &Name="")
Definition: CGBuilder.h:345
CGFunctionInfo - Class to encapsulate the information about a function definition.
void emitParallelCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::Function *OutlinedFn, ArrayRef< llvm::Value * > CapturedVars, const Expr *IfCond, llvm::Value *NumThreads) override
Emits code for parallel or serial call of the OutlinedFn with variables captured in a record which ad...
llvm::Function * emitTeamsOutlinedFunction(CodeGenFunction &CGF, const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) override
Emits inlined function for the specified OpenMP teams.
void emitProcBindClause(CodeGenFunction &CGF, llvm::omp::ProcBindKind ProcBind, SourceLocation Loc) override
Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid, int proc_bind) to generat...
void emitReduction(CodeGenFunction &CGF, SourceLocation Loc, ArrayRef< const Expr * > Privates, ArrayRef< const Expr * > LHSExprs, ArrayRef< const Expr * > RHSExprs, ArrayRef< const Expr * > ReductionOps, ReductionOptionsTy Options) override
Emit a code for reduction clause.
DataSharingMode
Target codegen is specialized based on two data-sharing modes: CUDA, in which the local variables are...
@ DS_CUDA
CUDA data sharing mode.
@ DS_Generic
Generic data-sharing mode.
void getDefaultDistScheduleAndChunk(CodeGenFunction &CGF, const OMPLoopDirective &S, OpenMPDistScheduleClauseKind &ScheduleKind, llvm::Value *&Chunk) const override
Choose a default value for the dist_schedule clause.
Address getAddressOfLocalVariable(CodeGenFunction &CGF, const VarDecl *VD) override
Gets the OpenMP-specific address of the local variable.
void emitFunctionProlog(CodeGenFunction &CGF, const Decl *D) override
Emits OpenMP-specific function prolog.
void getDefaultScheduleAndChunk(CodeGenFunction &CGF, const OMPLoopDirective &S, OpenMPScheduleClauseKind &ScheduleKind, const Expr *&ChunkExpr) const override
Choose a default value for the schedule clause.
void emitNumTeamsClause(CodeGenFunction &CGF, const Expr *NumTeams, const Expr *ThreadLimit, SourceLocation Loc) override
This function ought to emit, in the general case, a call to.
void emitCriticalRegion(CodeGenFunction &CGF, StringRef CriticalName, const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc, const Expr *Hint=nullptr) override
Emits a critical region.
void emitTeamsCall(CodeGenFunction &CGF, const OMPExecutableDirective &D, SourceLocation Loc, llvm::Function *OutlinedFn, ArrayRef< llvm::Value * > CapturedVars) override
Emits code for teams call of the OutlinedFn with variables captured in a record which address is stor...
bool hasAllocateAttributeForGlobalVar(const VarDecl *VD, LangAS &AS) override
Checks if the variable has associated OMPAllocateDeclAttr attribute with the predefined allocator and...
void getKmpcFreeShared(CodeGenFunction &CGF, const std::pair< llvm::Value *, llvm::Value * > &AddrSizePair) override
Get call to __kmpc_free_shared.
llvm::Function * emitParallelOutlinedFunction(CodeGenFunction &CGF, const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) override
Emits inlined function for the specified OpenMP parallel.
void functionFinished(CodeGenFunction &CGF) override
Cleans up references to the objects in finished function.
llvm::Value * getGPUThreadID(CodeGenFunction &CGF)
Get the id of the current thread on the GPU.
llvm::Value * getGPUWarpSize(CodeGenFunction &CGF)
Get the GPU warp size.
void processRequiresDirective(const OMPRequiresDecl *D) override
Perform check on requires decl to ensure that target architecture supports unified addressing.
void emitOutlinedFunctionCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn, ArrayRef< llvm::Value * > Args=std::nullopt) const override
Emits call of the outlined function with the provided arguments, translating these arguments to corre...
bool isDelayedVariableLengthDecl(CodeGenFunction &CGF, const VarDecl *VD) const override
Declare generalized virtual functions which need to be defined by all specializations of OpenMPGPURun...
Address getParameterAddress(CodeGenFunction &CGF, const VarDecl *NativeParam, const VarDecl *TargetParam) const override
Gets the address of the native argument basing on the address of the target-specific parameter.
ExecutionMode
Defines the execution mode.
@ EM_Unknown
Unknown execution mode (orphaned directive).
@ EM_SPMD
SPMD execution mode (all threads are worker threads).
void emitBarrierCall(CodeGenFunction &CGF, SourceLocation Loc, OpenMPDirectiveKind Kind, bool EmitChecks=true, bool ForceSimpleCall=false) override
Emit an implicit/explicit barrier for OpenMP threads.
llvm::Value * getGPUNumThreads(CodeGenFunction &CGF)
Get the maximum number of threads in a block of the GPU.
const VarDecl * translateParameter(const FieldDecl *FD, const VarDecl *NativeParam) const override
Translates the native parameter of outlined function if this is required for target.
std::pair< llvm::Value *, llvm::Value * > getKmpcAllocShared(CodeGenFunction &CGF, const VarDecl *VD) override
Get call to __kmpc_alloc_shared.
bool isGPU() const override
Returns true if the current target is a GPU.
void emitNumThreadsClause(CodeGenFunction &CGF, llvm::Value *NumThreads, SourceLocation Loc) override
Emits call to void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_threads)...
void adjustTargetSpecificDataForLambdas(CodeGenFunction &CGF, const OMPExecutableDirective &D) const override
Adjust some parameters for the target-based directives, like addresses of the variables captured by r...
virtual Address emitThreadIDAddress(CodeGenFunction &CGF, SourceLocation Loc)
Emits address of the word in a memory where current thread id is stored.
static const Stmt * getSingleCompoundChild(ASTContext &Ctx, const Stmt *Body)
Checks if the Body is the CompoundStmt and returns its child statement iff there is only one that is ...
llvm::Value * emitUpdateLocation(CodeGenFunction &CGF, SourceLocation Loc, unsigned Flags=0, bool EmitLoc=false)
Emits object of ident_t type with info for source location.
llvm::OpenMPIRBuilder & getOMPBuilder()
virtual void functionFinished(CodeGenFunction &CGF)
Cleans up references to the objects in finished function.
virtual llvm::Function * emitTeamsOutlinedFunction(CodeGenFunction &CGF, const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen)
Emits outlined function for the specified OpenMP teams directive D.
llvm::OpenMPIRBuilder OMPBuilder
An OpenMP-IR-Builder instance.
bool hasRequiresUnifiedSharedMemory() const
Return whether the unified_shared_memory has been specified.
virtual void processRequiresDirective(const OMPRequiresDecl *D)
Perform check on requires decl to ensure that target architecture supports unified addressing.
llvm::Value * getThreadID(CodeGenFunction &CGF, SourceLocation Loc)
Gets thread id value for the current thread.
void clearLocThreadIdInsertPt(CodeGenFunction &CGF)
virtual void emitBarrierCall(CodeGenFunction &CGF, SourceLocation Loc, OpenMPDirectiveKind Kind, bool EmitChecks=true, bool ForceSimpleCall=false)
Emit an implicit/explicit barrier for OpenMP threads.
static unsigned getDefaultFlagsForBarriers(OpenMPDirectiveKind Kind)
Returns default flags for the barriers depending on the directive, for which this barier is going to ...
virtual llvm::Function * emitParallelOutlinedFunction(CodeGenFunction &CGF, const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen)
Emits outlined function for the specified OpenMP parallel directive D.
virtual void getDefaultDistScheduleAndChunk(CodeGenFunction &CGF, const OMPLoopDirective &S, OpenMPDistScheduleClauseKind &ScheduleKind, llvm::Value *&Chunk) const
Choose default schedule type and chunk value for the dist_schedule clause.
llvm::Type * getIdentTyPointerTy()
Returns pointer to ident_t type.
void emitSingleReductionCombiner(CodeGenFunction &CGF, const Expr *ReductionOp, const Expr *PrivateRef, const DeclRefExpr *LHS, const DeclRefExpr *RHS)
Emits single reduction combiner.
virtual void emitCriticalRegion(CodeGenFunction &CGF, StringRef CriticalName, const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc, const Expr *Hint=nullptr)
Emits a critical region.
virtual void emitReduction(CodeGenFunction &CGF, SourceLocation Loc, ArrayRef< const Expr * > Privates, ArrayRef< const Expr * > LHSExprs, ArrayRef< const Expr * > RHSExprs, ArrayRef< const Expr * > ReductionOps, ReductionOptionsTy Options)
Emit a code for reduction clause.
virtual void emitOutlinedFunctionCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn, ArrayRef< llvm::Value * > Args=std::nullopt) const
Emits call of the outlined function with the provided arguments, translating these arguments to corre...
llvm::Function * emitReductionFunction(StringRef ReducerName, SourceLocation Loc, llvm::Type *ArgsElemType, ArrayRef< const Expr * > Privates, ArrayRef< const Expr * > LHSExprs, ArrayRef< const Expr * > RHSExprs, ArrayRef< const Expr * > ReductionOps)
Emits reduction function.
The scope used to remap some variables as private in the OpenMP loop body (or other captured region e...
CodeGenFunction - This class organizes the per-function state that is used while generating LLVM code...
LValue EmitLoadOfReferenceLValue(LValue RefLVal)
Definition: CGExpr.cpp:2788
LValue EmitLValueForField(LValue Base, const FieldDecl *Field)
Definition: CGExpr.cpp:4833
llvm::Type * ConvertType(QualType T)
CGCapturedStmtInfo * CapturedStmtInfo
llvm::Value * LoadCXXThis()
LoadCXXThis - Load the value of 'this'.
ComplexPairTy EmitLoadOfComplex(LValue src, SourceLocation loc)
EmitLoadOfComplex - Load a complex number from the specified l-value.
llvm::BasicBlock * createBasicBlock(const Twine &name="", llvm::Function *parent=nullptr, llvm::BasicBlock *before=nullptr)
createBasicBlock - Create an LLVM basic block.
llvm::LLVMContext & getLLVMContext()
Address EmitLoadOfPointer(Address Ptr, const PointerType *PtrTy, LValueBaseInfo *BaseInfo=nullptr, TBAAAccessInfo *TBAAInfo=nullptr)
Load a pointer with type PtrTy stored at address Ptr.
Definition: CGExpr.cpp:2797
LValue MakeNaturalAlignPointeeRawAddrLValue(llvm::Value *V, QualType T)
Same as MakeNaturalAlignPointeeAddrLValue except that the pointer is known to be unsigned.
void EmitAggregateCopy(LValue Dest, LValue Src, QualType EltTy, AggValueSlot::Overlap_t MayOverlap, bool isVolatile=false)
EmitAggregateCopy - Emit an aggregate copy.
Definition: CGExprAgg.cpp:2093
RawAddress CreateDefaultAlignTempAlloca(llvm::Type *Ty, const Twine &Name="tmp")
CreateDefaultAlignedTempAlloca - This creates an alloca with the default ABI alignment of the given L...
Definition: CGExpr.cpp:135
void StartFunction(GlobalDecl GD, QualType RetTy, llvm::Function *Fn, const CGFunctionInfo &FnInfo, const FunctionArgList &Args, SourceLocation Loc=SourceLocation(), SourceLocation StartLoc=SourceLocation())
Emit code for the start of a function.
llvm::Value * EvaluateExprAsBool(const Expr *E)
EvaluateExprAsBool - Perform the usual unary conversions on the specified expression and compare the ...
Definition: CGExpr.cpp:184
bool HaveInsertPoint() const
HaveInsertPoint - True if an insertion point is defined.
llvm::Value * getTypeSize(QualType Ty)
Returns calculated size of the specified type.
LValue EmitLValueForFieldInitialization(LValue Base, const FieldDecl *Field)
EmitLValueForFieldInitialization - Like EmitLValueForField, except that if the Field is a reference,...
Definition: CGExpr.cpp:5014
llvm::CallInst * EmitRuntimeCall(llvm::FunctionCallee callee, const Twine &name="")
const TargetInfo & getTarget() const
VlaSizePair getVLASize(const VariableArrayType *vla)
Returns an LLVM value that corresponds to the size, in non-variably-sized elements,...
llvm::Value * EmitLoadOfScalar(Address Addr, bool Volatile, QualType Ty, SourceLocation Loc, AlignmentSource Source=AlignmentSource::Type, bool isNontemporal=false)
EmitLoadOfScalar - Load a scalar value from an address, taking care to appropriately convert from the...
void EmitStoreOfComplex(ComplexPairTy V, LValue dest, bool isInit)
EmitStoreOfComplex - Store a complex number into the specified l-value.
llvm::Type * ConvertTypeForMem(QualType T)
static TypeEvaluationKind getEvaluationKind(QualType T)
getEvaluationKind - Return the TypeEvaluationKind of QualType T.
void EmitBranch(llvm::BasicBlock *Block)
EmitBranch - Emit a branch to the specified basic block from the current insert block,...
Definition: CGStmt.cpp:598
RawAddress CreateMemTemp(QualType T, const Twine &Name="tmp", RawAddress *Alloca=nullptr)
CreateMemTemp - Create a temporary memory object of the given type, with appropriate alignmen and cas...
Definition: CGExpr.cpp:147
LValue MakeAddrLValue(Address Addr, QualType T, AlignmentSource Source=AlignmentSource::Type)
void FinishFunction(SourceLocation EndLoc=SourceLocation())
FinishFunction - Complete IR generation of the current function.
Address GetAddrOfLocalVar(const VarDecl *VD)
GetAddrOfLocalVar - Return the address of a local variable.
std::pair< llvm::Value *, llvm::Value * > ComplexPairTy
LValue EmitLValue(const Expr *E, KnownNonNull_t IsKnownNonNull=NotKnownNonNull)
EmitLValue - Emit code to compute a designator that specifies the location of the expression.
Definition: CGExpr.cpp:1503
CodeGenTypes & getTypes() const
llvm::Value * EmitScalarConversion(llvm::Value *Src, QualType SrcTy, QualType DstTy, SourceLocation Loc)
Emit a conversion from the specified type to the specified destination type, both of which are LLVM s...
void EmitStoreOfScalar(llvm::Value *Value, Address Addr, bool Volatile, QualType Ty, AlignmentSource Source=AlignmentSource::Type, bool isInit=false, bool isNontemporal=false)
EmitStoreOfScalar - Store a scalar value to an address, taking care to appropriately convert from the...
void EmitBlock(llvm::BasicBlock *BB, bool IsFinished=false)
EmitBlock - Emit the given block.
Definition: CGStmt.cpp:578
LValue MakeNaturalAlignRawAddrLValue(llvm::Value *V, QualType T)
This class organizes the cross-function state that is used while generating LLVM code.
CGOpenMPRuntime & getOpenMPRuntime()
Return a reference to the configured OpenMP runtime.
const TargetInfo & getTarget() const
void SetInternalFunctionAttributes(GlobalDecl GD, llvm::Function *F, const CGFunctionInfo &FI)
Set the attributes on the LLVM function for the given decl and function info.
void addCompilerUsedGlobal(llvm::GlobalValue *GV)
Add a global to a list to be added to the llvm.compiler.used metadata.
llvm::Module & getModule() const
const LangOptions & getLangOpts() const
llvm::LLVMContext & getLLVMContext()
void Error(SourceLocation loc, StringRef error)
Emit a general error that something can't be done.
ASTContext & getContext() const
llvm::FunctionType * GetFunctionType(const CGFunctionInfo &Info)
GetFunctionType - Get the LLVM function type for.
Definition: CGCall.cpp:1641
const CGFunctionInfo & arrangeBuiltinFunctionDeclaration(QualType resultType, const FunctionArgList &args)
A builtin function is a freestanding function using the default C conventions.
Definition: CGCall.cpp:682
unsigned getTargetAddressSpace(QualType T) const
llvm::Type * ConvertTypeForMem(QualType T, bool ForBitField=false)
ConvertTypeForMem - Convert type T into a llvm::Type.
Information for lazily generating a cleanup.
Definition: EHScopeStack.h:141
FunctionArgList - Type for representing both the decl and type of parameters to a function.
Definition: CGCall.h:351
LValue - This represents an lvalue references.
Definition: CGValue.h:181
Address getAddress() const
Definition: CGValue.h:370
llvm::Value * getPointer(CodeGenFunction &CGF) const
Definition: CGValue.h:361
void setAddress(Address address)
Definition: CGValue.h:372
A basic class for pre|post-action for advanced codegen sequence for OpenMP region.
An abstract representation of an aligned address.
Definition: Address.h:41
llvm::Value * getPointer() const
Definition: Address.h:65
Class provides a way to call simple version of codegen for OpenMP region, or an advanced with possibl...
void setAction(PrePostActionTy &Action) const
ConstStmtVisitor - This class implements a simple visitor for Stmt subclasses.
Definition: StmtVisitor.h:195
DeclContext - This is used only as base class of specific decl types that can act as declaration cont...
Definition: DeclBase.h:1436
void addDecl(Decl *D)
Add the declaration D into this context.
Definition: DeclBase.cpp:1716
A reference to a declared variable, function, enum, etc.
Definition: Expr.h:1260
ValueDecl * getDecl()
Definition: Expr.h:1328
DeclStmt - Adaptor class for mixing declarations with statements and expressions.
Definition: Stmt.h:1497
Decl - This represents one declaration (or definition), e.g.
Definition: DeclBase.h:86
bool hasAttrs() const
Definition: DeclBase.h:524
attr_iterator attr_end() const
Definition: DeclBase.h:548
AttrVec & getAttrs()
Definition: DeclBase.h:530
bool isCanonicalDecl() const
Whether this particular Decl is a canonical one.
Definition: DeclBase.h:974
attr_iterator attr_begin() const
Definition: DeclBase.h:545
virtual Decl * getCanonicalDecl()
Retrieves the "canonical" declaration of the given declaration.
Definition: DeclBase.h:968
SourceLocation getLocation() const
Definition: DeclBase.h:445
SourceLocation getBeginLoc() const LLVM_READONLY
Definition: DeclBase.h:437
bool hasAttr() const
Definition: DeclBase.h:583
T * getAttr() const
Definition: DeclBase.h:579
DeclContext * getDeclContext()
Definition: DeclBase.h:454
SourceLocation getBeginLoc() const LLVM_READONLY
Definition: Decl.h:823
This represents one expression.
Definition: Expr.h:110
Expr * IgnoreParenImpCasts() LLVM_READONLY
Skip past any parentheses and implicit casts which might surround this expression until reaching a fi...
Definition: Expr.cpp:3111
Expr * IgnoreParens() LLVM_READONLY
Skip past any parentheses which might surround this expression until reaching a fixed point.
Definition: Expr.cpp:3107
bool isLValue() const
isLValue - True if this expression is an "l-value" according to the rules of the current language.
Definition: Expr.h:277
QualType getType() const
Definition: Expr.h:142
Represents a member of a struct/union/class.
Definition: Decl.h:3060
static FieldDecl * Create(const ASTContext &C, DeclContext *DC, SourceLocation StartLoc, SourceLocation IdLoc, const IdentifierInfo *Id, QualType T, TypeSourceInfo *TInfo, Expr *BW, bool Mutable, InClassInitStyle InitStyle)
Definition: Decl.cpp:4549
GlobalDecl - represents a global declaration.
Definition: GlobalDecl.h:56
ImplicitCastExpr - Allows us to explicitly represent implicit type conversions, which have no direct ...
Definition: Expr.h:3707
static ImplicitParamDecl * Create(ASTContext &C, DeclContext *DC, SourceLocation IdLoc, IdentifierInfo *Id, QualType T, ImplicitParamKind ParamKind)
Create implicit parameter.
Definition: Decl.cpp:5383
static IntegerLiteral * Create(const ASTContext &C, const llvm::APInt &V, QualType type, SourceLocation l)
Returns a new integer literal with value 'V' and type 'type'.
Definition: Expr.cpp:1032
Describes the capture of a variable or of this, or of a C++1y init-capture.
Definition: LambdaCapture.h:25
A C++ lambda expression, which produces a function object (of unspecified type) that can be invoked l...
Definition: ExprCXX.h:1950
bool isInitCapture(const LambdaCapture *Capture) const
Determine whether one of this lambda's captures is an init-capture.
Definition: ExprCXX.cpp:1290
capture_range captures() const
Retrieve this lambda's captures.
Definition: ExprCXX.cpp:1303
std::string OMPHostIRFile
Name of the IR file that contains the result of the OpenMP target host code generation.
Definition: LangOptions.h:560
StringRef getName() const
Get the name of identifier for this declaration as a StringRef.
Definition: Decl.h:276
IdentifierInfo * getIdentifier() const
Get the identifier that names this declaration, if there is one.
Definition: Decl.h:270
This is a basic class for representing single OpenMP clause.
Definition: OpenMPClause.h:55
This is a basic class for representing single OpenMP executable directive.
Definition: StmtOpenMP.h:266
bool hasAssociatedStmt() const
Returns true if directive has associated statement.
Definition: StmtOpenMP.h:531
OpenMPDirectiveKind getDirectiveKind() const
Definition: StmtOpenMP.h:569
CapturedStmt * getInnermostCapturedStmt()
Get innermost captured statement for the construct.
Definition: StmtOpenMP.h:556
SourceLocation getBeginLoc() const
Returns starting location of directive kind.
Definition: StmtOpenMP.h:502
ArrayRef< OMPClause * > clauses() const
Definition: StmtOpenMP.h:586
static llvm::iterator_range< specific_clause_iterator< SpecificClause > > getClausesOfKind(ArrayRef< OMPClause * > Clauses)
Definition: StmtOpenMP.h:459
const CapturedStmt * getCapturedStmt(OpenMPDirectiveKind RegionKind) const
Returns the captured statement associated with the component region within the (combined) directive.
Definition: StmtOpenMP.h:547
static const SpecificClause * getSingleClause(ArrayRef< OMPClause * > Clauses)
Gets a single clause of the specified kind associated with the current directive iff there is only one clause of this kind.
Definition: StmtOpenMP.h:477
const Stmt * getAssociatedStmt() const
Returns statement associated with the directive.
Definition: StmtOpenMP.h:534
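A minimal sketch of how these clause accessors combine in practice; the directive reference D and the helper name are illustrative assumptions:

#include "clang/AST/OpenMPClause.h"
#include "clang/AST/StmtOpenMP.h"

using namespace clang;

// Hypothetical: find the condition of the first 'if' clause, if any.
static const Expr *findIfCondition(const OMPExecutableDirective &D) {
  // getClausesOfKind filters clauses() by the static clause type.
  for (const OMPIfClause *C : D.getClausesOfKind<OMPIfClause>())
    return C->getCondition();
  return nullptr; // no 'if' clause attached to this directive
}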
This represents clause 'lastprivate' in the '#pragma omp ...' directives.
This is a common base class for loop directives ('omp simd', 'omp for', 'omp for simd' etc.). It is responsible for the loop code generation.
Definition: StmtOpenMP.h:1018
This represents clause 'reduction' in the '#pragma omp ...' directives.
This represents '#pragma omp requires...' directive.
Definition: DeclOpenMP.h:417
clauselist_range clauselists()
Definition: DeclOpenMP.h:442
This represents 'ompx_bare' clause in the '#pragma omp target teams ...' directive.
static ParmVarDecl * Create(ASTContext &C, DeclContext *DC, SourceLocation StartLoc, SourceLocation IdLoc, const IdentifierInfo *Id, QualType T, TypeSourceInfo *TInfo, StorageClass S, Expr *DefArg)
Definition: Decl.cpp:2919
PointerType - C99 6.7.5.1 - Pointer Declarators.
Definition: Type.h:3151
A (possibly-)qualified type.
Definition: Type.h:940
LangAS getAddressSpace() const
Return the address space of this type.
Definition: Type.h:7497
QualType getNonReferenceType() const
If Type is a reference type (e.g., const int&), returns the type that the reference refers to ("const int").
Definition: Type.h:7572
QualType getCanonicalType() const
Definition: Type.h:7423
A qualifier set is used to build a set of qualifiers.
Definition: Type.h:7311
const Type * strip(QualType type)
Collect any qualifiers on the given type and return an unqualified type.
Definition: Type.h:7318
QualType apply(const ASTContext &Context, QualType QT) const
Apply the collected qualifiers to the given type.
Definition: Type.cpp:4311
void addRestrict()
Definition: Type.h:466
void addAddressSpace(LangAS space, bool AllowDefaultAddrSpace=false)
Definition: Type.h:583
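A minimal sketch of the strip/apply pattern described by these QualifierCollector entries; the helper name and parameters are assumptions for illustration:

#include "clang/AST/ASTContext.h"

using namespace clang;

// Hypothetical: rebuild Ty with an extra address-space qualifier while
// preserving every qualifier it already carries.
static QualType withAddressSpace(ASTContext &Ctx, QualType Ty, LangAS AS) {
  QualifierCollector Quals;
  const Type *Bare = Quals.strip(Ty); // collect qualifiers, keep bare type
  Quals.addAddressSpace(AS);          // extend the collected set
  return Quals.apply(Ctx, Bare);      // reapply everything onto the bare type
}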
Represents a struct/union/class.
Definition: Decl.h:4171
virtual void completeDefinition()
Note that the definition of this type is now complete.
Definition: Decl.cpp:5085
Encodes a location in the source.
Stmt - This represents one statement.
Definition: Stmt.h:84
child_range children()
Definition: Stmt.cpp:287
Stmt * IgnoreContainers(bool IgnoreCaptured=false)
Skip no-op (attributed, compound) container stmts and skip captured stmt at the top, if IgnoreCaptured is true.
Definition: Stmt.cpp:197
void startDefinition()
Starts the definition of this tag declaration.
Definition: Decl.cpp:4741
unsigned getNewAlign() const
Return the largest alignment for which a suitably-sized allocation with '::operator new(size_t)' is guaranteed to produce a correctly-aligned pointer.
Definition: TargetInfo.h:742
virtual const llvm::omp::GV & getGridValue() const
Definition: TargetInfo.h:1638
virtual bool hasFeature(StringRef Feature) const
Determine whether the given target has the given feature.
Definition: TargetInfo.h:1472
TargetOptions & getTargetOpts() const
Retrieve the target options.
Definition: TargetInfo.h:312
llvm::StringMap< bool > FeatureMap
The map of which features have been enabled or disabled based on the command line.
Definition: TargetOptions.h:62
The base class of the type hierarchy.
Definition: Type.h:1813
CXXRecordDecl * getAsCXXRecordDecl() const
Retrieves the CXXRecordDecl that this type refers to, either because the type is a RecordType or because it is the injected-class-name type of a class template or class template partial specialization.
Definition: Type.cpp:1881
bool isIntegerType() const
isIntegerType() does not include complex integers (a GCC extension).
Definition: Type.h:7979
const T * castAs() const
Member-template castAs<specific type>.
Definition: Type.h:8227
bool isReferenceType() const
Definition: Type.h:7636
QualType getPointeeType() const
If this is a pointer, ObjC object pointer, or block pointer, this returns the respective pointee.
Definition: Type.cpp:705
bool isLValueReferenceType() const
Definition: Type.h:7640
bool hasSignedIntegerRepresentation() const
Determine whether this type has a signed integer representation of some sort, e.g., it is a signed integer type or a vector.
Definition: Type.cpp:2185
bool isVariablyModifiedType() const
Whether this type is a variably-modified type (C99 6.7.5).
Definition: Type.h:2679
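A minimal sketch combining the Type queries above; the helper name and its purpose are hypothetical:

#include "clang/AST/Type.h"

using namespace clang;

// Hypothetical: the value type reached through a reference or pointer.
static QualType valueTypeBehind(QualType Ty) {
  if (Ty->isReferenceType())        // e.g. const int& -> const int
    Ty = Ty.getNonReferenceType();
  QualType Pointee = Ty->getPointeeType();
  return Pointee.isNull() ? Ty : Pointee; // follow one pointer level
}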
UnaryOperator - This represents the unary-expression's (except sizeof and alignof),...
Definition: Expr.h:2235
Opcode getOpcode() const
Definition: Expr.h:2275
Expr * getSubExpr() const
Definition: Expr.h:2280
Represent the declaration of a variable (in which case it is an lvalue) a function (in which case it ...
Definition: Decl.h:707
QualType getType() const
Definition: Decl.h:718
bool isInitCapture() const
Whether this variable is the implicit variable for a lambda init-capture.
Definition: Decl.cpp:5375
Represents a variable declaration or definition.
Definition: Decl.h:919
VarDecl * getCanonicalDecl() override
Retrieves the "canonical" declaration of the given declaration.
Definition: Decl.cpp:2258
bool isInitCapture() const
Whether this variable is the implicit variable for a lambda init-capture.
Definition: Decl.h:1559
specific_attr_iterator - Iterates over a subrange of an AttrVec, only providing attributes that are o...
Definition: AttrIterator.h:33
@ Type
The l-value was considered opaque, so the alignment was determined from a type.
@ Decl
The l-value was an access to a declared entity or something equivalently strong, like the address of an object with static storage duration.
DirectiveKind
Represents the kind of preprocessor directive or a module declaration that is tracked by the scanner in its token output.
llvm::APInt APInt
Definition: Integral.h:29
The JSON file list parser is used to communicate input to InstallAPI.
@ Private
'private' clause, allowed on 'parallel', 'serial', 'loop', 'parallel loop', and 'serial loop' constructs.
CudaArch
Definition: Cuda.h:54
llvm::omp::Directive OpenMPDirectiveKind
OpenMP directives.
Definition: OpenMPKinds.h:24
@ ICIS_NoInit
No in-class initializer.
Definition: Specifiers.h:269
bool isOpenMPDistributeDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a distribute directive.
@ LCK_ByRef
Capturing by reference.
Definition: Lambda.h:37
CudaArch StringToCudaArch(llvm::StringRef S)
Definition: Cuda.cpp:169
@ CR_OpenMP
Definition: CapturedStmt.h:19
bool isOpenMPParallelDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a parallel-kind directive.
bool isOpenMPPrivate(OpenMPClauseKind Kind)
Checks if the specified clause is one of the private clauses like 'private', 'firstprivate', 'lastprivate' etc.
@ SC_None
Definition: Specifiers.h:247
OpenMPDistScheduleClauseKind
OpenMP attributes for 'dist_schedule' clause.
Definition: OpenMPKinds.h:103
bool isOpenMPTargetExecutionDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a target code offload directive.
bool isOpenMPTeamsDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a teams-kind directive.
@ Union
The "union" keyword.
bool isOpenMPLoopBoundSharingDirective(OpenMPDirectiveKind Kind)
Checks if the specified directive kind is one of the composite or combined directives that need loop bound sharing.
LangAS
Defines the address space values used by the address space qualifier of QualType.
Definition: AddressSpaces.h:25
void getOpenMPCaptureRegions(llvm::SmallVectorImpl< OpenMPDirectiveKind > &CaptureRegions, OpenMPDirectiveKind DKind)
Return the captured regions of an OpenMP directive.
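A minimal sketch of how the directive-kind predicates pair with getOpenMPCaptureRegions; the helper is an illustration, not this file's logic:

#include "clang/Basic/OpenMPKinds.h"
#include "llvm/ADT/SmallVector.h"

using namespace clang;

// Hypothetical: does this target directive contain a teams or parallel
// component among its capture regions?
static bool offloadsMultiThreadedWork(OpenMPDirectiveKind DKind) {
  if (!isOpenMPTargetExecutionDirective(DKind))
    return false; // only target directives offload work at all
  llvm::SmallVector<OpenMPDirectiveKind, 4> Regions;
  getOpenMPCaptureRegions(Regions, DKind); // one entry per component region
  for (OpenMPDirectiveKind R : Regions)
    if (isOpenMPTeamsDirective(R) || isOpenMPParallelDirective(R))
      return true;
  return false;
}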
LangAS getLangASFromTargetAS(unsigned TargetAS)
Definition: AddressSpaces.h:86
@ CXXThis
Parameter for C++ 'this' argument.
@ Other
Other implicit parameter.
const char * CudaArchToString(CudaArch A)
Definition: Cuda.cpp:151
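A minimal sketch of the CudaArch round-trip these two helpers provide, in the spirit of the runtime's minimum-architecture checks; the predicate name and its use are illustrative assumptions:

#include "clang/Basic/Cuda.h"
#include "llvm/ADT/StringRef.h"

using namespace clang;

// Hypothetical: parse a target CPU name such as "sm_70" and compare it
// against a minimum architecture; the scoped enum's order follows the
// SM version, so relational comparison is meaningful.
static bool isAtLeast(llvm::StringRef CPU, CudaArch Min) {
  CudaArch Arch = StringToCudaArch(CPU); // unknown names -> CudaArch::UNKNOWN
  return Arch != CudaArch::UNKNOWN && Arch >= Min;
}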
OpenMPScheduleClauseKind
OpenMP attributes for 'schedule' clause.
Definition: OpenMPKinds.h:30
@ AS_public
Definition: Specifiers.h:121
unsigned long uint64_t
llvm::Value * ScratchpadIndex
llvm::Value * ScratchpadWidth
llvm::Value * RemoteLaneOffset
llvm::IntegerType * Int8Ty
i8, i16, i32, and i64
llvm::IntegerType * IntTy
int