clang  19.0.0git
CGOpenMPRuntimeGPU.cpp
Go to the documentation of this file.
1 //===---- CGOpenMPRuntimeGPU.cpp - Interface to OpenMP GPU Runtimes ----===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This provides a generalized class for OpenMP runtime code generation
10 // specialized by GPU targets NVPTX and AMDGCN.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "CGOpenMPRuntimeGPU.h"
15 #include "CodeGenFunction.h"
16 #include "clang/AST/Attr.h"
17 #include "clang/AST/DeclOpenMP.h"
18 #include "clang/AST/OpenMPClause.h"
19 #include "clang/AST/StmtOpenMP.h"
20 #include "clang/AST/StmtVisitor.h"
21 #include "clang/Basic/Cuda.h"
22 #include "llvm/ADT/SmallPtrSet.h"
23 #include "llvm/Frontend/OpenMP/OMPGridValues.h"
24 #include "llvm/Support/MathExtras.h"
25 
26 using namespace clang;
27 using namespace CodeGen;
28 using namespace llvm::omp;
29 
30 namespace {
31 /// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
32 class NVPTXActionTy final : public PrePostActionTy {
33  llvm::FunctionCallee EnterCallee = nullptr;
34  ArrayRef<llvm::Value *> EnterArgs;
35  llvm::FunctionCallee ExitCallee = nullptr;
36  ArrayRef<llvm::Value *> ExitArgs;
37  bool Conditional = false;
38  llvm::BasicBlock *ContBlock = nullptr;
39 
40 public:
41  NVPTXActionTy(llvm::FunctionCallee EnterCallee,
42  ArrayRef<llvm::Value *> EnterArgs,
43  llvm::FunctionCallee ExitCallee,
44  ArrayRef<llvm::Value *> ExitArgs, bool Conditional = false)
45  : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
46  ExitArgs(ExitArgs), Conditional(Conditional) {}
47  void Enter(CodeGenFunction &CGF) override {
48  llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
49  if (Conditional) {
50  llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
51  auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
52  ContBlock = CGF.createBasicBlock("omp_if.end");
53  // Generate the branch (If-stmt)
54  CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
55  CGF.EmitBlock(ThenBlock);
56  }
57  }
58  void Done(CodeGenFunction &CGF) {
59  // Emit the rest of blocks/branches
60  CGF.EmitBranch(ContBlock);
61  CGF.EmitBlock(ContBlock, true);
62  }
63  void Exit(CodeGenFunction &CGF) override {
64  CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
65  }
66 };
67 
68 /// A class to track the execution mode when codegening directives within
69 /// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
70 /// to the target region and used by containing directives such as 'parallel'
71 /// to emit optimized code.
72 class ExecutionRuntimeModesRAII {
73 private:
74  CGOpenMPRuntimeGPU::ExecutionMode SavedExecMode =
77 
78 public:
79  ExecutionRuntimeModesRAII(CGOpenMPRuntimeGPU::ExecutionMode &ExecMode,
81  : ExecMode(ExecMode) {
82  SavedExecMode = ExecMode;
83  ExecMode = EntryMode;
84  }
85  ~ExecutionRuntimeModesRAII() { ExecMode = SavedExecMode; }
86 };
87 
88 static const ValueDecl *getPrivateItem(const Expr *RefExpr) {
89  RefExpr = RefExpr->IgnoreParens();
90  if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(RefExpr)) {
91  const Expr *Base = ASE->getBase()->IgnoreParenImpCasts();
92  while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
93  Base = TempASE->getBase()->IgnoreParenImpCasts();
94  RefExpr = Base;
95  } else if (auto *OASE = dyn_cast<ArraySectionExpr>(RefExpr)) {
96  const Expr *Base = OASE->getBase()->IgnoreParenImpCasts();
97  while (const auto *TempOASE = dyn_cast<ArraySectionExpr>(Base))
98  Base = TempOASE->getBase()->IgnoreParenImpCasts();
99  while (const auto *TempASE = dyn_cast<ArraySubscriptExpr>(Base))
100  Base = TempASE->getBase()->IgnoreParenImpCasts();
101  RefExpr = Base;
102  }
103  RefExpr = RefExpr->IgnoreParenImpCasts();
104  if (const auto *DE = dyn_cast<DeclRefExpr>(RefExpr))
105  return cast<ValueDecl>(DE->getDecl()->getCanonicalDecl());
106  const auto *ME = cast<MemberExpr>(RefExpr);
107  return cast<ValueDecl>(ME->getMemberDecl()->getCanonicalDecl());
108 }
109 
110 static RecordDecl *buildRecordForGlobalizedVars(
111  ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
112  ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
113  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
114  &MappedDeclsFields,
115  int BufSize) {
116  using VarsDataTy = std::pair<CharUnits /*Align*/, const ValueDecl *>;
117  if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
118  return nullptr;
119  SmallVector<VarsDataTy, 4> GlobalizedVars;
120  for (const ValueDecl *D : EscapedDecls)
121  GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
122  for (const ValueDecl *D : EscapedDeclsForTeams)
123  GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
124 
125  // Build struct _globalized_locals_ty {
126  // /* globalized vars */[WarSize] align (decl_align)
127  // /* globalized vars */ for EscapedDeclsForTeams
128  // };
129  RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
130  GlobalizedRD->startDefinition();
132  EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
133  for (const auto &Pair : GlobalizedVars) {
134  const ValueDecl *VD = Pair.second;
135  QualType Type = VD->getType();
137  Type = C.getPointerType(Type.getNonReferenceType());
138  else
139  Type = Type.getNonReferenceType();
141  FieldDecl *Field;
142  if (SingleEscaped.count(VD)) {
144  C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
145  C.getTrivialTypeSourceInfo(Type, SourceLocation()),
146  /*BW=*/nullptr, /*Mutable=*/false,
147  /*InitStyle=*/ICIS_NoInit);
148  Field->setAccess(AS_public);
149  if (VD->hasAttrs()) {
150  for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
151  E(VD->getAttrs().end());
152  I != E; ++I)
153  Field->addAttr(*I);
154  }
155  } else {
156  if (BufSize > 1) {
157  llvm::APInt ArraySize(32, BufSize);
158  Type = C.getConstantArrayType(Type, ArraySize, nullptr,
160  }
162  C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
163  C.getTrivialTypeSourceInfo(Type, SourceLocation()),
164  /*BW=*/nullptr, /*Mutable=*/false,
165  /*InitStyle=*/ICIS_NoInit);
166  Field->setAccess(AS_public);
167  llvm::APInt Align(32, Pair.first.getQuantity());
168  Field->addAttr(AlignedAttr::CreateImplicit(
169  C, /*IsAlignmentExpr=*/true,
170  IntegerLiteral::Create(C, Align,
171  C.getIntTypeForBitwidth(32, /*Signed=*/0),
172  SourceLocation()),
173  {}, AlignedAttr::GNU_aligned));
174  }
175  GlobalizedRD->addDecl(Field);
176  MappedDeclsFields.try_emplace(VD, Field);
177  }
178  GlobalizedRD->completeDefinition();
179  return GlobalizedRD;
180 }
181 
182 /// Get the list of variables that can escape their declaration context.
183 class CheckVarsEscapingDeclContext final
184  : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
185  CodeGenFunction &CGF;
186  llvm::SetVector<const ValueDecl *> EscapedDecls;
187  llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
188  llvm::SetVector<const ValueDecl *> DelayedVariableLengthDecls;
189  llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
190  RecordDecl *GlobalizedRD = nullptr;
191  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
192  bool AllEscaped = false;
193  bool IsForCombinedParallelRegion = false;
194 
195  void markAsEscaped(const ValueDecl *VD) {
196  // Do not globalize declare target variables.
197  if (!isa<VarDecl>(VD) ||
198  OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
199  return;
200  VD = cast<ValueDecl>(VD->getCanonicalDecl());
201  // Use user-specified allocation.
202  if (VD->hasAttrs() && VD->hasAttr<OMPAllocateDeclAttr>())
203  return;
204  // Variables captured by value must be globalized.
205  bool IsCaptured = false;
206  if (auto *CSI = CGF.CapturedStmtInfo) {
207  if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
208  // Check if need to capture the variable that was already captured by
209  // value in the outer region.
210  IsCaptured = true;
211  if (!IsForCombinedParallelRegion) {
212  if (!FD->hasAttrs())
213  return;
214  const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
215  if (!Attr)
216  return;
217  if (((Attr->getCaptureKind() != OMPC_map) &&
218  !isOpenMPPrivate(Attr->getCaptureKind())) ||
219  ((Attr->getCaptureKind() == OMPC_map) &&
220  !FD->getType()->isAnyPointerType()))
221  return;
222  }
223  if (!FD->getType()->isReferenceType()) {
224  assert(!VD->getType()->isVariablyModifiedType() &&
225  "Parameter captured by value with variably modified type");
226  EscapedParameters.insert(VD);
227  } else if (!IsForCombinedParallelRegion) {
228  return;
229  }
230  }
231  }
232  if ((!CGF.CapturedStmtInfo ||
233  (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
234  VD->getType()->isReferenceType())
235  // Do not globalize variables with reference type.
236  return;
237  if (VD->getType()->isVariablyModifiedType()) {
238  // If not captured at the target region level then mark the escaped
239  // variable as delayed.
240  if (IsCaptured)
241  EscapedVariableLengthDecls.insert(VD);
242  else
243  DelayedVariableLengthDecls.insert(VD);
244  } else
245  EscapedDecls.insert(VD);
246  }
247 
248  void VisitValueDecl(const ValueDecl *VD) {
249  if (VD->getType()->isLValueReferenceType())
250  markAsEscaped(VD);
251  if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
252  if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
253  const bool SavedAllEscaped = AllEscaped;
254  AllEscaped = VD->getType()->isLValueReferenceType();
255  Visit(VarD->getInit());
256  AllEscaped = SavedAllEscaped;
257  }
258  }
259  }
260  void VisitOpenMPCapturedStmt(const CapturedStmt *S,
261  ArrayRef<OMPClause *> Clauses,
262  bool IsCombinedParallelRegion) {
263  if (!S)
264  return;
265  for (const CapturedStmt::Capture &C : S->captures()) {
266  if (C.capturesVariable() && !C.capturesVariableByCopy()) {
267  const ValueDecl *VD = C.getCapturedVar();
268  bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
269  if (IsCombinedParallelRegion) {
270  // Check if the variable is privatized in the combined construct and
271  // those private copies must be shared in the inner parallel
272  // directive.
273  IsForCombinedParallelRegion = false;
274  for (const OMPClause *C : Clauses) {
275  if (!isOpenMPPrivate(C->getClauseKind()) ||
276  C->getClauseKind() == OMPC_reduction ||
277  C->getClauseKind() == OMPC_linear ||
278  C->getClauseKind() == OMPC_private)
279  continue;
281  if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C))
282  Vars = PC->getVarRefs();
283  else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C))
284  Vars = PC->getVarRefs();
285  else
286  llvm_unreachable("Unexpected clause.");
287  for (const auto *E : Vars) {
288  const Decl *D =
289  cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();
290  if (D == VD->getCanonicalDecl()) {
291  IsForCombinedParallelRegion = true;
292  break;
293  }
294  }
295  if (IsForCombinedParallelRegion)
296  break;
297  }
298  }
299  markAsEscaped(VD);
300  if (isa<OMPCapturedExprDecl>(VD))
301  VisitValueDecl(VD);
302  IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
303  }
304  }
305  }
306 
307  void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
308  assert(!GlobalizedRD &&
309  "Record for globalized variables is built already.");
310  ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
311  unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
312  if (IsInTTDRegion)
313  EscapedDeclsForTeams = EscapedDecls.getArrayRef();
314  else
315  EscapedDeclsForParallel = EscapedDecls.getArrayRef();
316  GlobalizedRD = ::buildRecordForGlobalizedVars(
317  CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
318  MappedDeclsFields, WarpSize);
319  }
320 
321 public:
322  CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
323  ArrayRef<const ValueDecl *> TeamsReductions)
324  : CGF(CGF), EscapedDecls(TeamsReductions.begin(), TeamsReductions.end()) {
325  }
326  virtual ~CheckVarsEscapingDeclContext() = default;
327  void VisitDeclStmt(const DeclStmt *S) {
328  if (!S)
329  return;
330  for (const Decl *D : S->decls())
331  if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
332  VisitValueDecl(VD);
333  }
334  void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
335  if (!D)
336  return;
337  if (!D->hasAssociatedStmt())
338  return;
339  if (const auto *S =
340  dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
341  // Do not analyze directives that do not actually require capturing,
342  // like `omp for` or `omp simd` directives.
344  getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
345  if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
346  VisitStmt(S->getCapturedStmt());
347  return;
348  }
349  VisitOpenMPCapturedStmt(
350  S, D->clauses(),
351  CaptureRegions.back() == OMPD_parallel &&
353  }
354  }
355  void VisitCapturedStmt(const CapturedStmt *S) {
356  if (!S)
357  return;
358  for (const CapturedStmt::Capture &C : S->captures()) {
359  if (C.capturesVariable() && !C.capturesVariableByCopy()) {
360  const ValueDecl *VD = C.getCapturedVar();
361  markAsEscaped(VD);
362  if (isa<OMPCapturedExprDecl>(VD))
363  VisitValueDecl(VD);
364  }
365  }
366  }
367  void VisitLambdaExpr(const LambdaExpr *E) {
368  if (!E)
369  return;
370  for (const LambdaCapture &C : E->captures()) {
371  if (C.capturesVariable()) {
372  if (C.getCaptureKind() == LCK_ByRef) {
373  const ValueDecl *VD = C.getCapturedVar();
374  markAsEscaped(VD);
375  if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
376  VisitValueDecl(VD);
377  }
378  }
379  }
380  }
381  void VisitBlockExpr(const BlockExpr *E) {
382  if (!E)
383  return;
384  for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
385  if (C.isByRef()) {
386  const VarDecl *VD = C.getVariable();
387  markAsEscaped(VD);
388  if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
389  VisitValueDecl(VD);
390  }
391  }
392  }
393  void VisitCallExpr(const CallExpr *E) {
394  if (!E)
395  return;
396  for (const Expr *Arg : E->arguments()) {
397  if (!Arg)
398  continue;
399  if (Arg->isLValue()) {
400  const bool SavedAllEscaped = AllEscaped;
401  AllEscaped = true;
402  Visit(Arg);
403  AllEscaped = SavedAllEscaped;
404  } else {
405  Visit(Arg);
406  }
407  }
408  Visit(E->getCallee());
409  }
410  void VisitDeclRefExpr(const DeclRefExpr *E) {
411  if (!E)
412  return;
413  const ValueDecl *VD = E->getDecl();
414  if (AllEscaped)
415  markAsEscaped(VD);
416  if (isa<OMPCapturedExprDecl>(VD))
417  VisitValueDecl(VD);
418  else if (VD->isInitCapture())
419  VisitValueDecl(VD);
420  }
421  void VisitUnaryOperator(const UnaryOperator *E) {
422  if (!E)
423  return;
424  if (E->getOpcode() == UO_AddrOf) {
425  const bool SavedAllEscaped = AllEscaped;
426  AllEscaped = true;
427  Visit(E->getSubExpr());
428  AllEscaped = SavedAllEscaped;
429  } else {
430  Visit(E->getSubExpr());
431  }
432  }
433  void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
434  if (!E)
435  return;
436  if (E->getCastKind() == CK_ArrayToPointerDecay) {
437  const bool SavedAllEscaped = AllEscaped;
438  AllEscaped = true;
439  Visit(E->getSubExpr());
440  AllEscaped = SavedAllEscaped;
441  } else {
442  Visit(E->getSubExpr());
443  }
444  }
445  void VisitExpr(const Expr *E) {
446  if (!E)
447  return;
448  bool SavedAllEscaped = AllEscaped;
449  if (!E->isLValue())
450  AllEscaped = false;
451  for (const Stmt *Child : E->children())
452  if (Child)
453  Visit(Child);
454  AllEscaped = SavedAllEscaped;
455  }
456  void VisitStmt(const Stmt *S) {
457  if (!S)
458  return;
459  for (const Stmt *Child : S->children())
460  if (Child)
461  Visit(Child);
462  }
463 
464  /// Returns the record that handles all the escaped local variables and used
465  /// instead of their original storage.
466  const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
467  if (!GlobalizedRD)
468  buildRecordForGlobalizedVars(IsInTTDRegion);
469  return GlobalizedRD;
470  }
471 
472  /// Returns the field in the globalized record for the escaped variable.
473  const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
474  assert(GlobalizedRD &&
475  "Record for globalized variables must be generated already.");
476  return MappedDeclsFields.lookup(VD);
477  }
478 
479  /// Returns the list of the escaped local variables/parameters.
480  ArrayRef<const ValueDecl *> getEscapedDecls() const {
481  return EscapedDecls.getArrayRef();
482  }
483 
484  /// Checks if the escaped local variable is actually a parameter passed by
485  /// value.
486  const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
487  return EscapedParameters;
488  }
489 
490  /// Returns the list of the escaped variables with the variably modified
491  /// types.
492  ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
493  return EscapedVariableLengthDecls.getArrayRef();
494  }
495 
496  /// Returns the list of the delayed variables with the variably modified
497  /// types.
498  ArrayRef<const ValueDecl *> getDelayedVariableLengthDecls() const {
499  return DelayedVariableLengthDecls.getArrayRef();
500  }
501 };
502 } // anonymous namespace
503 
504 /// Get the id of the warp in the block.
505 /// We assume that the warp size is 32, which is always the case
506 /// on the NVPTX device, to generate more efficient code.
507 static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
508  CGBuilderTy &Bld = CGF.Builder;
509  unsigned LaneIDBits =
510  llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
511  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
512  return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id");
513 }
514 
515 /// Get the id of the current lane in the Warp.
516 /// We assume that the warp size is 32, which is always the case
517 /// on the NVPTX device, to generate more efficient code.
518 static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
519  CGBuilderTy &Bld = CGF.Builder;
520  unsigned LaneIDBits =
521  llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
522  assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
523  unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
524  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
525  return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask),
526  "nvptx_lane_id");
527 }
528 
530 CGOpenMPRuntimeGPU::getExecutionMode() const {
531  return CurrentExecutionMode;
532 }
533 
535 CGOpenMPRuntimeGPU::getDataSharingMode() const {
536  return CurrentDataSharingMode;
537 }
538 
539 /// Check for inner (nested) SPMD construct, if any
541  const OMPExecutableDirective &D) {
542  const auto *CS = D.getInnermostCapturedStmt();
543  const auto *Body =
544  CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
545  const Stmt *ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
546 
547  if (const auto *NestedDir =
548  dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
549  OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
550  switch (D.getDirectiveKind()) {
551  case OMPD_target:
552  if (isOpenMPParallelDirective(DKind))
553  return true;
554  if (DKind == OMPD_teams) {
555  Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
556  /*IgnoreCaptured=*/true);
557  if (!Body)
558  return false;
559  ChildStmt = CGOpenMPRuntime::getSingleCompoundChild(Ctx, Body);
560  if (const auto *NND =
561  dyn_cast_or_null<OMPExecutableDirective>(ChildStmt)) {
562  DKind = NND->getDirectiveKind();
563  if (isOpenMPParallelDirective(DKind))
564  return true;
565  }
566  }
567  return false;
568  case OMPD_target_teams:
569  return isOpenMPParallelDirective(DKind);
570  case OMPD_target_simd:
571  case OMPD_target_parallel:
572  case OMPD_target_parallel_for:
573  case OMPD_target_parallel_for_simd:
574  case OMPD_target_teams_distribute:
575  case OMPD_target_teams_distribute_simd:
576  case OMPD_target_teams_distribute_parallel_for:
577  case OMPD_target_teams_distribute_parallel_for_simd:
578  case OMPD_parallel:
579  case OMPD_for:
580  case OMPD_parallel_for:
581  case OMPD_parallel_master:
582  case OMPD_parallel_sections:
583  case OMPD_for_simd:
584  case OMPD_parallel_for_simd:
585  case OMPD_cancel:
586  case OMPD_cancellation_point:
587  case OMPD_ordered:
588  case OMPD_threadprivate:
589  case OMPD_allocate:
590  case OMPD_task:
591  case OMPD_simd:
592  case OMPD_sections:
593  case OMPD_section:
594  case OMPD_single:
595  case OMPD_master:
596  case OMPD_critical:
597  case OMPD_taskyield:
598  case OMPD_barrier:
599  case OMPD_taskwait:
600  case OMPD_taskgroup:
601  case OMPD_atomic:
602  case OMPD_flush:
603  case OMPD_depobj:
604  case OMPD_scan:
605  case OMPD_teams:
606  case OMPD_target_data:
607  case OMPD_target_exit_data:
608  case OMPD_target_enter_data:
609  case OMPD_distribute:
610  case OMPD_distribute_simd:
611  case OMPD_distribute_parallel_for:
612  case OMPD_distribute_parallel_for_simd:
613  case OMPD_teams_distribute:
614  case OMPD_teams_distribute_simd:
615  case OMPD_teams_distribute_parallel_for:
616  case OMPD_teams_distribute_parallel_for_simd:
617  case OMPD_target_update:
618  case OMPD_declare_simd:
619  case OMPD_declare_variant:
620  case OMPD_begin_declare_variant:
621  case OMPD_end_declare_variant:
622  case OMPD_declare_target:
623  case OMPD_end_declare_target:
624  case OMPD_declare_reduction:
625  case OMPD_declare_mapper:
626  case OMPD_taskloop:
627  case OMPD_taskloop_simd:
628  case OMPD_master_taskloop:
629  case OMPD_master_taskloop_simd:
630  case OMPD_parallel_master_taskloop:
631  case OMPD_parallel_master_taskloop_simd:
632  case OMPD_requires:
633  case OMPD_unknown:
634  default:
635  llvm_unreachable("Unexpected directive.");
636  }
637  }
638 
639  return false;
640 }
641 
643  const OMPExecutableDirective &D) {
645  switch (DirectiveKind) {
646  case OMPD_target:
647  case OMPD_target_teams:
648  return hasNestedSPMDDirective(Ctx, D);
649  case OMPD_target_parallel_loop:
650  case OMPD_target_parallel:
651  case OMPD_target_parallel_for:
652  case OMPD_target_parallel_for_simd:
653  case OMPD_target_teams_distribute_parallel_for:
654  case OMPD_target_teams_distribute_parallel_for_simd:
655  case OMPD_target_simd:
656  case OMPD_target_teams_distribute_simd:
657  return true;
658  case OMPD_target_teams_distribute:
659  return false;
660  case OMPD_target_teams_loop:
661  // Whether this is true or not depends on how the directive will
662  // eventually be emitted.
663  if (auto *TTLD = dyn_cast<OMPTargetTeamsGenericLoopDirective>(&D))
664  return TTLD->canBeParallelFor();
665  return false;
666  case OMPD_parallel:
667  case OMPD_for:
668  case OMPD_parallel_for:
669  case OMPD_parallel_master:
670  case OMPD_parallel_sections:
671  case OMPD_for_simd:
672  case OMPD_parallel_for_simd:
673  case OMPD_cancel:
674  case OMPD_cancellation_point:
675  case OMPD_ordered:
676  case OMPD_threadprivate:
677  case OMPD_allocate:
678  case OMPD_task:
679  case OMPD_simd:
680  case OMPD_sections:
681  case OMPD_section:
682  case OMPD_single:
683  case OMPD_master:
684  case OMPD_critical:
685  case OMPD_taskyield:
686  case OMPD_barrier:
687  case OMPD_taskwait:
688  case OMPD_taskgroup:
689  case OMPD_atomic:
690  case OMPD_flush:
691  case OMPD_depobj:
692  case OMPD_scan:
693  case OMPD_teams:
694  case OMPD_target_data:
695  case OMPD_target_exit_data:
696  case OMPD_target_enter_data:
697  case OMPD_distribute:
698  case OMPD_distribute_simd:
699  case OMPD_distribute_parallel_for:
700  case OMPD_distribute_parallel_for_simd:
701  case OMPD_teams_distribute:
702  case OMPD_teams_distribute_simd:
703  case OMPD_teams_distribute_parallel_for:
704  case OMPD_teams_distribute_parallel_for_simd:
705  case OMPD_target_update:
706  case OMPD_declare_simd:
707  case OMPD_declare_variant:
708  case OMPD_begin_declare_variant:
709  case OMPD_end_declare_variant:
710  case OMPD_declare_target:
711  case OMPD_end_declare_target:
712  case OMPD_declare_reduction:
713  case OMPD_declare_mapper:
714  case OMPD_taskloop:
715  case OMPD_taskloop_simd:
716  case OMPD_master_taskloop:
717  case OMPD_master_taskloop_simd:
718  case OMPD_parallel_master_taskloop:
719  case OMPD_parallel_master_taskloop_simd:
720  case OMPD_requires:
721  case OMPD_unknown:
722  default:
723  break;
724  }
725  llvm_unreachable(
726  "Unknown programming model for OpenMP directive on NVPTX target.");
727 }
728 
729 void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
730  StringRef ParentName,
731  llvm::Function *&OutlinedFn,
732  llvm::Constant *&OutlinedFnID,
733  bool IsOffloadEntry,
734  const RegionCodeGenTy &CodeGen) {
735  ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_NonSPMD);
736  EntryFunctionState EST;
737  WrapperFunctionsMap.clear();
738 
739  [[maybe_unused]] bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
740  assert(!IsBareKernel && "bare kernel should not be at generic mode");
741 
742  // Emit target region as a standalone region.
743  class NVPTXPrePostActionTy : public PrePostActionTy {
744  CGOpenMPRuntimeGPU::EntryFunctionState &EST;
745  const OMPExecutableDirective &D;
746 
747  public:
748  NVPTXPrePostActionTy(CGOpenMPRuntimeGPU::EntryFunctionState &EST,
749  const OMPExecutableDirective &D)
750  : EST(EST), D(D) {}
751  void Enter(CodeGenFunction &CGF) override {
752  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
753  RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ false);
754  // Skip target region initialization.
755  RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
756  }
757  void Exit(CodeGenFunction &CGF) override {
758  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
759  RT.clearLocThreadIdInsertPt(CGF);
760  RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ false);
761  }
762  } Action(EST, D);
763  CodeGen.setAction(Action);
764  IsInTTDRegion = true;
765  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
766  IsOffloadEntry, CodeGen);
767  IsInTTDRegion = false;
768 }
769 
770 void CGOpenMPRuntimeGPU::emitKernelInit(const OMPExecutableDirective &D,
771  CodeGenFunction &CGF,
772  EntryFunctionState &EST, bool IsSPMD) {
773  int32_t MinThreadsVal = 1, MaxThreadsVal = -1, MinTeamsVal = 1,
774  MaxTeamsVal = -1;
775  computeMinAndMaxThreadsAndTeams(D, CGF, MinThreadsVal, MaxThreadsVal,
776  MinTeamsVal, MaxTeamsVal);
777 
778  CGBuilderTy &Bld = CGF.Builder;
779  Bld.restoreIP(OMPBuilder.createTargetInit(
780  Bld, IsSPMD, MinThreadsVal, MaxThreadsVal, MinTeamsVal, MaxTeamsVal));
781  if (!IsSPMD)
782  emitGenericVarsProlog(CGF, EST.Loc);
783 }
784 
785 void CGOpenMPRuntimeGPU::emitKernelDeinit(CodeGenFunction &CGF,
786  EntryFunctionState &EST,
787  bool IsSPMD) {
788  if (!IsSPMD)
789  emitGenericVarsEpilog(CGF);
790 
791  // This is temporary until we remove the fixed sized buffer.
792  ASTContext &C = CGM.getContext();
793  RecordDecl *StaticRD = C.buildImplicitRecord(
794  "_openmp_teams_reduction_type_$_", RecordDecl::TagKind::Union);
795  StaticRD->startDefinition();
796  for (const RecordDecl *TeamReductionRec : TeamsReductions) {
797  QualType RecTy = C.getRecordType(TeamReductionRec);
798  auto *Field = FieldDecl::Create(
799  C, StaticRD, SourceLocation(), SourceLocation(), nullptr, RecTy,
800  C.getTrivialTypeSourceInfo(RecTy, SourceLocation()),
801  /*BW=*/nullptr, /*Mutable=*/false,
802  /*InitStyle=*/ICIS_NoInit);
803  Field->setAccess(AS_public);
804  StaticRD->addDecl(Field);
805  }
806  StaticRD->completeDefinition();
807  QualType StaticTy = C.getRecordType(StaticRD);
808  llvm::Type *LLVMReductionsBufferTy =
809  CGM.getTypes().ConvertTypeForMem(StaticTy);
810  const auto &DL = CGM.getModule().getDataLayout();
811  uint64_t ReductionDataSize =
812  TeamsReductions.empty()
813  ? 0
814  : DL.getTypeAllocSize(LLVMReductionsBufferTy).getFixedValue();
815  CGBuilderTy &Bld = CGF.Builder;
816  OMPBuilder.createTargetDeinit(Bld, ReductionDataSize,
817  C.getLangOpts().OpenMPCUDAReductionBufNum);
818  TeamsReductions.clear();
819 }
820 
821 void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
822  StringRef ParentName,
823  llvm::Function *&OutlinedFn,
824  llvm::Constant *&OutlinedFnID,
825  bool IsOffloadEntry,
826  const RegionCodeGenTy &CodeGen) {
827  ExecutionRuntimeModesRAII ModeRAII(CurrentExecutionMode, EM_SPMD);
828  EntryFunctionState EST;
829 
830  bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
831 
832  // Emit target region as a standalone region.
833  class NVPTXPrePostActionTy : public PrePostActionTy {
834  CGOpenMPRuntimeGPU &RT;
835  CGOpenMPRuntimeGPU::EntryFunctionState &EST;
836  bool IsBareKernel;
837  DataSharingMode Mode;
838  const OMPExecutableDirective &D;
839 
840  public:
841  NVPTXPrePostActionTy(CGOpenMPRuntimeGPU &RT,
842  CGOpenMPRuntimeGPU::EntryFunctionState &EST,
843  bool IsBareKernel, const OMPExecutableDirective &D)
844  : RT(RT), EST(EST), IsBareKernel(IsBareKernel),
845  Mode(RT.CurrentDataSharingMode), D(D) {}
846  void Enter(CodeGenFunction &CGF) override {
847  if (IsBareKernel) {
848  RT.CurrentDataSharingMode = DataSharingMode::DS_CUDA;
849  return;
850  }
851  RT.emitKernelInit(D, CGF, EST, /* IsSPMD */ true);
852  // Skip target region initialization.
853  RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
854  }
855  void Exit(CodeGenFunction &CGF) override {
856  if (IsBareKernel) {
857  RT.CurrentDataSharingMode = Mode;
858  return;
859  }
860  RT.clearLocThreadIdInsertPt(CGF);
861  RT.emitKernelDeinit(CGF, EST, /* IsSPMD */ true);
862  }
863  } Action(*this, EST, IsBareKernel, D);
864  CodeGen.setAction(Action);
865  IsInTTDRegion = true;
866  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
867  IsOffloadEntry, CodeGen);
868  IsInTTDRegion = false;
869 }
870 
871 void CGOpenMPRuntimeGPU::emitTargetOutlinedFunction(
872  const OMPExecutableDirective &D, StringRef ParentName,
873  llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
874  bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
875  if (!IsOffloadEntry) // Nothing to do.
876  return;
877 
878  assert(!ParentName.empty() && "Invalid target region parent name!");
879 
880  bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
881  bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
882  if (Mode || IsBareKernel)
883  emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
884  CodeGen);
885  else
886  emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
887  CodeGen);
888 }
889 
891  : CGOpenMPRuntime(CGM) {
892  llvm::OpenMPIRBuilderConfig Config(
893  CGM.getLangOpts().OpenMPIsTargetDevice, isGPU(),
894  CGM.getLangOpts().OpenMPOffloadMandatory,
895  /*HasRequiresReverseOffload*/ false, /*HasRequiresUnifiedAddress*/ false,
896  hasRequiresUnifiedSharedMemory(), /*HasRequiresDynamicAllocators*/ false);
897  OMPBuilder.setConfig(Config);
898 
899  if (!CGM.getLangOpts().OpenMPIsTargetDevice)
900  llvm_unreachable("OpenMP can only handle device code.");
901 
902  if (CGM.getLangOpts().OpenMPCUDAMode)
903  CurrentDataSharingMode = CGOpenMPRuntimeGPU::DS_CUDA;
904 
905  llvm::OpenMPIRBuilder &OMPBuilder = getOMPBuilder();
906  if (CGM.getLangOpts().NoGPULib || CGM.getLangOpts().OMPHostIRFile.empty())
907  return;
908 
909  OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTargetDebug,
910  "__omp_rtl_debug_kind");
911  OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTeamSubscription,
912  "__omp_rtl_assume_teams_oversubscription");
913  OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPThreadSubscription,
914  "__omp_rtl_assume_threads_oversubscription");
915  OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoThreadState,
916  "__omp_rtl_assume_no_thread_state");
917  OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPNoNestedParallelism,
918  "__omp_rtl_assume_no_nested_parallelism");
919 }
920 
922  ProcBindKind ProcBind,
924  // Nothing to do.
925 }
926 
928  llvm::Value *NumThreads,
930  // Nothing to do.
931 }
932 
934  const Expr *NumTeams,
935  const Expr *ThreadLimit,
936  SourceLocation Loc) {}
937 
940  const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,
941  const RegionCodeGenTy &CodeGen) {
942  // Emit target region as a standalone region.
943  bool PrevIsInTTDRegion = IsInTTDRegion;
944  IsInTTDRegion = false;
945  auto *OutlinedFun =
947  CGF, D, ThreadIDVar, InnermostKind, CodeGen));
948  IsInTTDRegion = PrevIsInTTDRegion;
949  if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD) {
950  llvm::Function *WrapperFun =
951  createParallelDataSharingWrapper(OutlinedFun, D);
952  WrapperFunctionsMap[OutlinedFun] = WrapperFun;
953  }
954 
955  return OutlinedFun;
956 }
957 
958 /// Get list of lastprivate variables from the teams distribute ... or
959 /// teams {distribute ...} directives.
960 static void
964  "expected teams directive.");
965  const OMPExecutableDirective *Dir = &D;
968  Ctx,
970  /*IgnoreCaptured=*/true))) {
971  Dir = dyn_cast_or_null<OMPExecutableDirective>(S);
972  if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
973  Dir = nullptr;
974  }
975  }
976  if (!Dir)
977  return;
978  for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
979  for (const Expr *E : C->getVarRefs())
980  Vars.push_back(getPrivateItem(E));
981  }
982 }
983 
984 /// Get list of reduction variables from the teams ... directives.
985 static void
989  "expected teams directive.");
990  for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
991  for (const Expr *E : C->privates())
992  Vars.push_back(getPrivateItem(E));
993  }
994 }
995 
998  const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind,
999  const RegionCodeGenTy &CodeGen) {
1001 
1002  const RecordDecl *GlobalizedRD = nullptr;
1003  llvm::SmallVector<const ValueDecl *, 4> LastPrivatesReductions;
1004  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
1005  unsigned WarpSize = CGM.getTarget().getGridValue().GV_Warp_Size;
1006  // Globalize team reductions variable unconditionally in all modes.
1007  if (getExecutionMode() != CGOpenMPRuntimeGPU::EM_SPMD)
1008  getTeamsReductionVars(CGM.getContext(), D, LastPrivatesReductions);
1009  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
1010  getDistributeLastprivateVars(CGM.getContext(), D, LastPrivatesReductions);
1011  if (!LastPrivatesReductions.empty()) {
1012  GlobalizedRD = ::buildRecordForGlobalizedVars(
1013  CGM.getContext(), std::nullopt, LastPrivatesReductions,
1014  MappedDeclsFields, WarpSize);
1015  }
1016  } else if (!LastPrivatesReductions.empty()) {
1017  assert(!TeamAndReductions.first &&
1018  "Previous team declaration is not expected.");
1019  TeamAndReductions.first = D.getCapturedStmt(OMPD_teams)->getCapturedDecl();
1020  std::swap(TeamAndReductions.second, LastPrivatesReductions);
1021  }
1022 
1023  // Emit target region as a standalone region.
1024  class NVPTXPrePostActionTy : public PrePostActionTy {
1026  const RecordDecl *GlobalizedRD;
1027  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1028  &MappedDeclsFields;
1029 
1030  public:
1031  NVPTXPrePostActionTy(
1032  SourceLocation &Loc, const RecordDecl *GlobalizedRD,
1033  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1034  &MappedDeclsFields)
1035  : Loc(Loc), GlobalizedRD(GlobalizedRD),
1036  MappedDeclsFields(MappedDeclsFields) {}
1037  void Enter(CodeGenFunction &CGF) override {
1038  auto &Rt =
1039  static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1040  if (GlobalizedRD) {
1041  auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
1042  I->getSecond().MappedParams =
1043  std::make_unique<CodeGenFunction::OMPMapVars>();
1044  DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
1045  for (const auto &Pair : MappedDeclsFields) {
1046  assert(Pair.getFirst()->isCanonicalDecl() &&
1047  "Expected canonical declaration");
1048  Data.insert(std::make_pair(Pair.getFirst(), MappedVarData()));
1049  }
1050  }
1051  Rt.emitGenericVarsProlog(CGF, Loc);
1052  }
1053  void Exit(CodeGenFunction &CGF) override {
1054  static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
1055  .emitGenericVarsEpilog(CGF);
1056  }
1057  } Action(Loc, GlobalizedRD, MappedDeclsFields);
1058  CodeGen.setAction(Action);
1059  llvm::Function *OutlinedFun = CGOpenMPRuntime::emitTeamsOutlinedFunction(
1060  CGF, D, ThreadIDVar, InnermostKind, CodeGen);
1061 
1062  return OutlinedFun;
1063 }
1064 
1065 void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
1066  SourceLocation Loc) {
1067  if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
1068  return;
1069 
1070  CGBuilderTy &Bld = CGF.Builder;
1071 
1072  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1073  if (I == FunctionGlobalizedDecls.end())
1074  return;
1075 
1076  for (auto &Rec : I->getSecond().LocalVarData) {
1077  const auto *VD = cast<VarDecl>(Rec.first);
1078  bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
1079  QualType VarTy = VD->getType();
1080 
1081  // Get the local allocation of a firstprivate variable before sharing
1082  llvm::Value *ParValue;
1083  if (EscapedParam) {
1084  LValue ParLVal =
1085  CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
1086  ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
1087  }
1088 
1089  // Allocate space for the variable to be globalized
1090  llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
1091  llvm::CallBase *VoidPtr =
1092  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1093  CGM.getModule(), OMPRTL___kmpc_alloc_shared),
1094  AllocArgs, VD->getName());
1095  // FIXME: We should use the variables actual alignment as an argument.
1096  VoidPtr->addRetAttr(llvm::Attribute::get(
1097  CGM.getLLVMContext(), llvm::Attribute::Alignment,
1099 
1100  // Cast the void pointer and get the address of the globalized variable.
1101  llvm::PointerType *VarPtrTy = CGF.ConvertTypeForMem(VarTy)->getPointerTo();
1102  llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1103  VoidPtr, VarPtrTy, VD->getName() + "_on_stack");
1104  LValue VarAddr =
1105  CGF.MakeNaturalAlignPointeeRawAddrLValue(CastedVoidPtr, VarTy);
1106  Rec.second.PrivateAddr = VarAddr.getAddress();
1107  Rec.second.GlobalizedVal = VoidPtr;
1108 
1109  // Assign the local allocation to the newly globalized location.
1110  if (EscapedParam) {
1111  CGF.EmitStoreOfScalar(ParValue, VarAddr);
1112  I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress());
1113  }
1114  if (auto *DI = CGF.getDebugInfo())
1115  VoidPtr->setDebugLoc(DI->SourceLocToDebugLoc(VD->getLocation()));
1116  }
1117 
1118  for (const auto *ValueD : I->getSecond().EscapedVariableLengthDecls) {
1119  const auto *VD = cast<VarDecl>(ValueD);
1120  std::pair<llvm::Value *, llvm::Value *> AddrSizePair =
1121  getKmpcAllocShared(CGF, VD);
1122  I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(AddrSizePair);
1123  LValue Base = CGF.MakeAddrLValue(AddrSizePair.first, VD->getType(),
1124  CGM.getContext().getDeclAlign(VD),
1126  I->getSecond().MappedParams->setVarAddr(CGF, VD, Base.getAddress());
1127  }
1128  I->getSecond().MappedParams->apply(CGF);
1129 }
1130 
1132  const VarDecl *VD) const {
1133  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1134  if (I == FunctionGlobalizedDecls.end())
1135  return false;
1136 
1137  // Check variable declaration is delayed:
1138  return llvm::is_contained(I->getSecond().DelayedVariableLengthDecls, VD);
1139 }
1140 
1141 std::pair<llvm::Value *, llvm::Value *>
1143  const VarDecl *VD) {
1144  CGBuilderTy &Bld = CGF.Builder;
1145 
1146  // Compute size and alignment.
1147  llvm::Value *Size = CGF.getTypeSize(VD->getType());
1148  CharUnits Align = CGM.getContext().getDeclAlign(VD);
1149  Size = Bld.CreateNUWAdd(
1150  Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
1151  llvm::Value *AlignVal =
1152  llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
1153  Size = Bld.CreateUDiv(Size, AlignVal);
1154  Size = Bld.CreateNUWMul(Size, AlignVal);
1155 
1156  // Allocate space for this VLA object to be globalized.
1157  llvm::Value *AllocArgs[] = {Size};
1158  llvm::CallBase *VoidPtr =
1159  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1160  CGM.getModule(), OMPRTL___kmpc_alloc_shared),
1161  AllocArgs, VD->getName());
1162  VoidPtr->addRetAttr(llvm::Attribute::get(
1163  CGM.getLLVMContext(), llvm::Attribute::Alignment, Align.getQuantity()));
1164 
1165  return std::make_pair(VoidPtr, Size);
1166 }
1167 
1169  CodeGenFunction &CGF,
1170  const std::pair<llvm::Value *, llvm::Value *> &AddrSizePair) {
1171  // Deallocate the memory for each globalized VLA object
1172  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1173  CGM.getModule(), OMPRTL___kmpc_free_shared),
1174  {AddrSizePair.first, AddrSizePair.second});
1175 }
1176 
1177 void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF) {
1178  if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
1179  return;
1180 
1181  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1182  if (I != FunctionGlobalizedDecls.end()) {
1183  // Deallocate the memory for each globalized VLA object that was
1184  // globalized in the prolog (i.e. emitGenericVarsProlog).
1185  for (const auto &AddrSizePair :
1186  llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
1187  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1188  CGM.getModule(), OMPRTL___kmpc_free_shared),
1189  {AddrSizePair.first, AddrSizePair.second});
1190  }
1191  // Deallocate the memory for each globalized value
1192  for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) {
1193  const auto *VD = cast<VarDecl>(Rec.first);
1194  I->getSecond().MappedParams->restore(CGF);
1195 
1196  llvm::Value *FreeArgs[] = {Rec.second.GlobalizedVal,
1197  CGF.getTypeSize(VD->getType())};
1198  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1199  CGM.getModule(), OMPRTL___kmpc_free_shared),
1200  FreeArgs);
1201  }
1202  }
1203 }
1204 
1206  const OMPExecutableDirective &D,
1208  llvm::Function *OutlinedFn,
1209  ArrayRef<llvm::Value *> CapturedVars) {
1210  if (!CGF.HaveInsertPoint())
1211  return;
1212 
1213  bool IsBareKernel = D.getSingleClause<OMPXBareClause>();
1214 
1215  RawAddress ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
1216  /*Name=*/".zero.addr");
1217  CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
1218  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
1219  // We don't emit any thread id function call in bare kernel, but because the
1220  // outlined function has a pointer argument, we emit a nullptr here.
1221  if (IsBareKernel)
1222  OutlinedFnArgs.push_back(llvm::ConstantPointerNull::get(CGM.VoidPtrTy));
1223  else
1224  OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).emitRawPointer(CGF));
1225  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
1226  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
1227  emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
1228 }
1229 
1232  llvm::Function *OutlinedFn,
1233  ArrayRef<llvm::Value *> CapturedVars,
1234  const Expr *IfCond,
1235  llvm::Value *NumThreads) {
1236  if (!CGF.HaveInsertPoint())
1237  return;
1238 
1239  auto &&ParallelGen = [this, Loc, OutlinedFn, CapturedVars, IfCond,
1240  NumThreads](CodeGenFunction &CGF,
1241  PrePostActionTy &Action) {
1242  CGBuilderTy &Bld = CGF.Builder;
1243  llvm::Value *NumThreadsVal = NumThreads;
1244  llvm::Function *WFn = WrapperFunctionsMap[OutlinedFn];
1245  llvm::Value *ID = llvm::ConstantPointerNull::get(CGM.Int8PtrTy);
1246  if (WFn)
1247  ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
1248  llvm::Value *FnPtr = Bld.CreateBitOrPointerCast(OutlinedFn, CGM.Int8PtrTy);
1249 
1250  // Create a private scope that will globalize the arguments
1251  // passed from the outside of the target region.
1252  // TODO: Is that needed?
1253  CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);
1254 
1255  Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca(
1256  llvm::ArrayType::get(CGM.VoidPtrTy, CapturedVars.size()),
1257  "captured_vars_addrs");
1258  // There's something to share.
1259  if (!CapturedVars.empty()) {
1260  // Prepare for parallel region. Indicate the outlined function.
1261  ASTContext &Ctx = CGF.getContext();
1262  unsigned Idx = 0;
1263  for (llvm::Value *V : CapturedVars) {
1264  Address Dst = Bld.CreateConstArrayGEP(CapturedVarsAddrs, Idx);
1265  llvm::Value *PtrV;
1266  if (V->getType()->isIntegerTy())
1267  PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
1268  else
1270  CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
1271  Ctx.getPointerType(Ctx.VoidPtrTy));
1272  ++Idx;
1273  }
1274  }
1275 
1276  llvm::Value *IfCondVal = nullptr;
1277  if (IfCond)
1278  IfCondVal = Bld.CreateIntCast(CGF.EvaluateExprAsBool(IfCond), CGF.Int32Ty,
1279  /* isSigned */ false);
1280  else
1281  IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1);
1282 
1283  if (!NumThreadsVal)
1284  NumThreadsVal = llvm::ConstantInt::get(CGF.Int32Ty, -1);
1285  else
1286  NumThreadsVal = Bld.CreateZExtOrTrunc(NumThreadsVal, CGF.Int32Ty),
1287 
1288  assert(IfCondVal && "Expected a value");
1289  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
1290  llvm::Value *Args[] = {
1291  RTLoc,
1292  getThreadID(CGF, Loc),
1293  IfCondVal,
1294  NumThreadsVal,
1295  llvm::ConstantInt::get(CGF.Int32Ty, -1),
1296  FnPtr,
1297  ID,
1298  Bld.CreateBitOrPointerCast(CapturedVarsAddrs.emitRawPointer(CGF),
1299  CGF.VoidPtrPtrTy),
1300  llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
1301  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1302  CGM.getModule(), OMPRTL___kmpc_parallel_51),
1303  Args);
1304  };
1305 
1306  RegionCodeGenTy RCG(ParallelGen);
1307  RCG(CGF);
1308 }
1309 
1310 void CGOpenMPRuntimeGPU::syncCTAThreads(CodeGenFunction &CGF) {
1311  // Always emit simple barriers!
1312  if (!CGF.HaveInsertPoint())
1313  return;
1314  // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
1315  // This function does not use parameters, so we can emit just default values.
1316  llvm::Value *Args[] = {
1317  llvm::ConstantPointerNull::get(
1318  cast<llvm::PointerType>(getIdentTyPointerTy())),
1319  llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)};
1320  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1321  CGM.getModule(), OMPRTL___kmpc_barrier_simple_spmd),
1322  Args);
1323 }
1324 
1327  OpenMPDirectiveKind Kind, bool,
1328  bool) {
1329  // Always emit simple barriers!
1330  if (!CGF.HaveInsertPoint())
1331  return;
1332  // Build call __kmpc_cancel_barrier(loc, thread_id);
1333  unsigned Flags = getDefaultFlagsForBarriers(Kind);
1334  llvm::Value *Args[] = {emitUpdateLocation(CGF, Loc, Flags),
1335  getThreadID(CGF, Loc)};
1336 
1337  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1338  CGM.getModule(), OMPRTL___kmpc_barrier),
1339  Args);
1340 }
1341 
1343  CodeGenFunction &CGF, StringRef CriticalName,
1344  const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
1345  const Expr *Hint) {
1346  llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
1347  llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
1348  llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
1349  llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
1350  llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");
1351 
1352  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1353 
1354  // Get the mask of active threads in the warp.
1355  llvm::Value *Mask = CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1356  CGM.getModule(), OMPRTL___kmpc_warp_active_thread_mask));
1357  // Fetch team-local id of the thread.
1358  llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
1359 
1360  // Get the width of the team.
1361  llvm::Value *TeamWidth = RT.getGPUNumThreads(CGF);
1362 
1363  // Initialize the counter variable for the loop.
1364  QualType Int32Ty =
1365  CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
1366  Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
1367  LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
1368  CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
1369  /*isInit=*/true);
1370 
1371  // Block checks if loop counter exceeds upper bound.
1372  CGF.EmitBlock(LoopBB);
1373  llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
1374  llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);
1375  CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);
1376 
1377  // Block tests which single thread should execute region, and which threads
1378  // should go straight to synchronisation point.
1379  CGF.EmitBlock(TestBB);
1380  CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
1381  llvm::Value *CmpThreadToCounter =
1382  CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);
1383  CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);
1384 
1385  // Block emits the body of the critical region.
1386  CGF.EmitBlock(BodyBB);
1387 
1388  // Output the critical statement.
1389  CGOpenMPRuntime::emitCriticalRegion(CGF, CriticalName, CriticalOpGen, Loc,
1390  Hint);
1391 
1392  // After the body surrounded by the critical region, the single executing
1393  // thread will jump to the synchronisation point.
1394  // Block waits for all threads in current team to finish then increments the
1395  // counter variable and returns to the loop.
1396  CGF.EmitBlock(SyncBB);
1397  // Reconverge active threads in the warp.
1398  (void)CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1399  CGM.getModule(), OMPRTL___kmpc_syncwarp),
1400  Mask);
1401 
1402  llvm::Value *IncCounterVal =
1403  CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
1404  CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
1405  CGF.EmitBranch(LoopBB);
1406 
1407  // Block that is reached when all threads in the team complete the region.
1408  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
1409 }
1410 
1411 /// Cast value to the specified type.
1412 static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
1413  QualType ValTy, QualType CastTy,
1414  SourceLocation Loc) {
1415  assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
1416  "Cast type must sized.");
1417  assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
1418  "Val type must sized.");
1419  llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
1420  if (ValTy == CastTy)
1421  return Val;
1422  if (CGF.getContext().getTypeSizeInChars(ValTy) ==
1423  CGF.getContext().getTypeSizeInChars(CastTy))
1424  return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
1425  if (CastTy->isIntegerType() && ValTy->isIntegerType())
1426  return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
1427  CastTy->hasSignedIntegerRepresentation());
1428  Address CastItem = CGF.CreateMemTemp(CastTy);
1429  Address ValCastItem = CastItem.withElementType(Val->getType());
1430  CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy,
1432  TBAAAccessInfo());
1433  return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc,
1435  TBAAAccessInfo());
1436 }
1437 
1438 /// This function creates calls to one of two shuffle functions to copy
1439 /// variables between lanes in a warp.
1441  llvm::Value *Elem,
1442  QualType ElemType,
1443  llvm::Value *Offset,
1444  SourceLocation Loc) {
1445  CodeGenModule &CGM = CGF.CGM;
1446  CGBuilderTy &Bld = CGF.Builder;
1447  CGOpenMPRuntimeGPU &RT =
1448  *(static_cast<CGOpenMPRuntimeGPU *>(&CGM.getOpenMPRuntime()));
1449  llvm::OpenMPIRBuilder &OMPBuilder = RT.getOMPBuilder();
1450 
1451  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
1452  assert(Size.getQuantity() <= 8 &&
1453  "Unsupported bitwidth in shuffle instruction.");
1454 
1455  RuntimeFunction ShuffleFn = Size.getQuantity() <= 4
1456  ? OMPRTL___kmpc_shuffle_int32
1457  : OMPRTL___kmpc_shuffle_int64;
1458 
1459  // Cast all types to 32- or 64-bit values before calling shuffle routines.
1460  QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
1461  Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
1462  llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
1463  llvm::Value *WarpSize =
1464  Bld.CreateIntCast(RT.getGPUWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);
1465 
1466  llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
1467  OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), ShuffleFn),
1468  {ElemCast, Offset, WarpSize});
1469 
1470  return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
1471 }
1472 
1473 static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
1474  Address DestAddr, QualType ElemType,
1475  llvm::Value *Offset, SourceLocation Loc) {
1476  CGBuilderTy &Bld = CGF.Builder;
1477 
1478  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
1479  // Create the loop over the big sized data.
1480  // ptr = (void*)Elem;
1481  // ptrEnd = (void*) Elem + 1;
1482  // Step = 8;
1483  // while (ptr + Step < ptrEnd)
1484  // shuffle((int64_t)*ptr);
1485  // Step = 4;
1486  // while (ptr + Step < ptrEnd)
1487  // shuffle((int32_t)*ptr);
1488  // ...
1489  Address ElemPtr = DestAddr;
1490  Address Ptr = SrcAddr;
1492  Bld.CreateConstGEP(SrcAddr, 1), CGF.VoidPtrTy, CGF.Int8Ty);
1493  for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
1494  if (Size < CharUnits::fromQuantity(IntSize))
1495  continue;
1496  QualType IntType = CGF.getContext().getIntTypeForBitwidth(
1497  CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)),
1498  /*Signed=*/1);
1499  llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType);
1500  Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo(),
1501  IntTy);
1503  ElemPtr, IntTy->getPointerTo(), IntTy);
1504  if (Size.getQuantity() / IntSize > 1) {
1505  llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond");
1506  llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then");
1507  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit");
1508  llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
1509  CGF.EmitBlock(PreCondBB);
1510  llvm::PHINode *PhiSrc =
1511  Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2);
1512  PhiSrc->addIncoming(Ptr.emitRawPointer(CGF), CurrentBB);
1513  llvm::PHINode *PhiDest =
1514  Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2);
1515  PhiDest->addIncoming(ElemPtr.emitRawPointer(CGF), CurrentBB);
1516  Ptr = Address(PhiSrc, Ptr.getElementType(), Ptr.getAlignment());
1517  ElemPtr =
1518  Address(PhiDest, ElemPtr.getElementType(), ElemPtr.getAlignment());
1519  llvm::Value *PtrEndRaw = PtrEnd.emitRawPointer(CGF);
1520  llvm::Value *PtrRaw = Ptr.emitRawPointer(CGF);
1521  llvm::Value *PtrDiff = Bld.CreatePtrDiff(
1522  CGF.Int8Ty, PtrEndRaw,
1524  Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
1525  ThenBB, ExitBB);
1526  CGF.EmitBlock(ThenBB);
1527  llvm::Value *Res = createRuntimeShuffleFunction(
1528  CGF,
1529  CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
1531  TBAAAccessInfo()),
1532  IntType, Offset, Loc);
1533  CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
1535  TBAAAccessInfo());
1536  Address LocalPtr = Bld.CreateConstGEP(Ptr, 1);
1537  Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
1538  PhiSrc->addIncoming(LocalPtr.emitRawPointer(CGF), ThenBB);
1539  PhiDest->addIncoming(LocalElemPtr.emitRawPointer(CGF), ThenBB);
1540  CGF.EmitBranch(PreCondBB);
1541  CGF.EmitBlock(ExitBB);
1542  } else {
1543  llvm::Value *Res = createRuntimeShuffleFunction(
1544  CGF,
1545  CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
1547  TBAAAccessInfo()),
1548  IntType, Offset, Loc);
1549  CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
1551  TBAAAccessInfo());
1552  Ptr = Bld.CreateConstGEP(Ptr, 1);
1553  ElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
1554  }
1555  Size = Size % IntSize;
1556  }
1557 }
1558 
1559 namespace {
1560 enum CopyAction : unsigned {
1561  // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
1562  // the warp using shuffle instructions.
1563  RemoteLaneToThread,
1564  // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
1565  ThreadCopy,
1566 };
1567 } // namespace
1568 
1569 struct CopyOptionsTy {
1570  llvm::Value *RemoteLaneOffset;
1571  llvm::Value *ScratchpadIndex;
1572  llvm::Value *ScratchpadWidth;
1573 };
1574 
1575 /// Emit instructions to copy a Reduce list, which contains partially
1576 /// aggregated values, in the specified direction.
1577 static void emitReductionListCopy(
1578  CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
1579  ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
1580  CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {
1581 
1582  CodeGenModule &CGM = CGF.CGM;
1583  ASTContext &C = CGM.getContext();
1584  CGBuilderTy &Bld = CGF.Builder;
1585 
1586  llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
1587 
1588  // Iterate, element by element, through the source Reduce list and
1589  // make a copy.
1590  unsigned Idx = 0;
1591  for (const Expr *Private : Privates) {
1592  Address SrcElementAddr = Address::invalid();
1593  Address DestElementAddr = Address::invalid();
1594  Address DestElementPtrAddr = Address::invalid();
1595  // Should we shuffle in an element from a remote lane?
1596  bool ShuffleInElement = false;
1597  // Set to true to update the pointer in the dest Reduce list to a
1598  // newly created element.
1599  bool UpdateDestListPtr = false;
1600  QualType PrivatePtrType = C.getPointerType(Private->getType());
1601  llvm::Type *PrivateLlvmPtrType = CGF.ConvertType(PrivatePtrType);
1602 
1603  switch (Action) {
1604  case RemoteLaneToThread: {
1605  // Step 1.1: Get the address for the src element in the Reduce list.
1606  Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
1607  SrcElementAddr = CGF.EmitLoadOfPointer(
1608  SrcElementPtrAddr.withElementType(PrivateLlvmPtrType),
1609  PrivatePtrType->castAs<PointerType>());
1610 
1611  // Step 1.2: Create a temporary to store the element in the destination
1612  // Reduce list.
1613  DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
1614  DestElementAddr =
1615  CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
1616  ShuffleInElement = true;
1617  UpdateDestListPtr = true;
1618  break;
1619  }
1620  case ThreadCopy: {
1621  // Step 1.1: Get the address for the src element in the Reduce list.
1622  Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
1623  SrcElementAddr = CGF.EmitLoadOfPointer(
1624  SrcElementPtrAddr.withElementType(PrivateLlvmPtrType),
1625  PrivatePtrType->castAs<PointerType>());
1626 
1627  // Step 1.2: Get the address for dest element. The destination
1628  // element has already been created on the thread's stack.
1629  DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
1630  DestElementAddr = CGF.EmitLoadOfPointer(
1631  DestElementPtrAddr.withElementType(PrivateLlvmPtrType),
1632  PrivatePtrType->castAs<PointerType>());
1633  break;
1634  }
1635  }
1636 
1637  // Regardless of src and dest of copy, we emit the load of src
1638  // element as this is required in all directions
1639  SrcElementAddr = SrcElementAddr.withElementType(
1640  CGF.ConvertTypeForMem(Private->getType()));
1641  DestElementAddr =
1642  DestElementAddr.withElementType(SrcElementAddr.getElementType());
1643 
1644  // Now that all active lanes have read the element in the
1645  // Reduce list, shuffle over the value from the remote lane.
1646  if (ShuffleInElement) {
1647  shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
1648  RemoteLaneOffset, Private->getExprLoc());
1649  } else {
1650  switch (CGF.getEvaluationKind(Private->getType())) {
1651  case TEK_Scalar: {
1652  llvm::Value *Elem = CGF.EmitLoadOfScalar(
1653  SrcElementAddr, /*Volatile=*/false, Private->getType(),
1654  Private->getExprLoc(), LValueBaseInfo(AlignmentSource::Type),
1655  TBAAAccessInfo());
1656  // Store the source element value to the dest element address.
1657  CGF.EmitStoreOfScalar(
1658  Elem, DestElementAddr, /*Volatile=*/false, Private->getType(),
1659  LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
1660  break;
1661  }
1662  case TEK_Complex: {
1663  CodeGenFunction::ComplexPairTy Elem = CGF.EmitLoadOfComplex(
1664  CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
1665  Private->getExprLoc());
1666  CGF.EmitStoreOfComplex(
1667  Elem, CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
1668  /*isInit=*/false);
1669  break;
1670  }
1671  case TEK_Aggregate:
1672  CGF.EmitAggregateCopy(
1673  CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
1674  CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
1675  Private->getType(), AggValueSlot::DoesNotOverlap);
1676  break;
1677  }
1678  }
1679 
1680  // Step 3.1: Modify reference in dest Reduce list as needed.
1681  // Modifying the reference in Reduce list to point to the newly
1682  // created element. The element is live in the current function
1683  // scope and that of functions it invokes (i.e., reduce_function).
1684  // RemoteReduceData[i] = (void*)&RemoteElem
1685  if (UpdateDestListPtr) {
1686  CGF.EmitStoreOfScalar(
1687  Bld.CreatePointerBitCastOrAddrSpaceCast(
1688  DestElementAddr.emitRawPointer(CGF), CGF.VoidPtrTy),
1689  DestElementPtrAddr, /*Volatile=*/false, C.VoidPtrTy);
1690  }
1691 
1692  ++Idx;
1693  }
1694 }
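// Net effect for Action == RemoteLaneToThread, in C-like pseudocode (an
// editor's sketch; 'shuffle_get' stands in for the warp-shuffle sequence
// emitted by shuffleAndStore):
//
//   for (unsigned I = 0; I < NumPrivates; ++I) {
//     T Tmp = shuffle_get(*(T *)Src[I], RemoteLaneOffset); // remote value
//     Dest[I] = &Tmp; // publish a fresh stack temporary
//   }
//
// For Action == ThreadCopy, *Dest[I] = *Src[I] with no shuffle involved.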
1695 
1696 /// This function emits a helper that gathers Reduce lists from the first
1697 /// lane of every active warp to lanes in the first warp.
1698 ///
1699 /// void inter_warp_copy_func(void* reduce_data, int num_warps)
1700 /// shared smem[warp_size];
1701 /// For all data entries D in reduce_data:
1702 /// sync
1703 /// If (I am the first lane in each warp)
1704 /// Copy my local D to smem[warp_id]
1705 /// sync
1706 /// if (I am the first warp)
1707 /// Copy smem[thread_id] to my local D
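/// A CUDA-flavored sketch of the emitted helper (illustrative only; the
/// names 'medium', 'chunk', 'lane_id', 'warp_id' and 'thread_id' are
/// stand-ins, and the real body is emitted as IR below):
///
///   __device__ void inter_warp_copy(void **reduce_data, int num_warps) {
///     __shared__ int32_t medium[WARP_SIZE];
///     for (each 32-bit chunk C of each entry D in reduce_data) {
///       __syncthreads();
///       if (lane_id == 0) // warp master publishes its chunk
///         medium[warp_id] = chunk(D, C);
///       __syncthreads();
///       if (thread_id < num_warps) // lanes of warp 0 gather the chunks
///         chunk(D, C) = medium[thread_id];
///     }
///   }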
1708 static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
1709  ArrayRef<const Expr *> Privates,
1710  QualType ReductionArrayTy,
1711  SourceLocation Loc) {
1712  ASTContext &C = CGM.getContext();
1713  llvm::Module &M = CGM.getModule();
1714 
1715  // ReduceList: thread local Reduce list.
1716  // At the stage of the computation when this function is called, partially
1717  // aggregated values reside in the first lane of every active warp.
1718  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
1719  C.VoidPtrTy, ImplicitParamKind::Other);
1720  // NumWarps: number of warps active in the parallel region. This could
1721  // be smaller than 32 (max warps in a CTA) for partial block reduction.
1722  ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
1723  C.getIntTypeForBitwidth(32, /* Signed */ true),
1724  ImplicitParamKind::Other);
1725  FunctionArgList Args;
1726  Args.push_back(&ReduceListArg);
1727  Args.push_back(&NumWarpsArg);
1728 
1729  const CGFunctionInfo &CGFI =
1730  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
1731  auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(CGFI),
1732  llvm::GlobalValue::InternalLinkage,
1733  "_omp_reduction_inter_warp_copy_func", &M);
1734  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
1735  Fn->setDoesNotRecurse();
1736  CodeGenFunction CGF(CGM);
1737  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
1738 
1739  CGBuilderTy &Bld = CGF.Builder;
1740 
1741  // This array is used as a medium to transfer, one reduce element at a time,
1742  // the data from the first lane of every warp to lanes in the first warp
1743  // in order to perform the final step of a reduction in a parallel region
1744  // (reduction across warps). The array is placed in NVPTX __shared__ memory
1745  // for reduced latency, as well as to have a distinct copy for concurrently
1746  // executing target regions. The array is declared with weak linkage so
1747  // as to be shared across compilation units.
1748  StringRef TransferMediumName =
1749  "__openmp_nvptx_data_transfer_temporary_storage";
1750  llvm::GlobalVariable *TransferMedium =
1751  M.getGlobalVariable(TransferMediumName);
1752  unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
1753  if (!TransferMedium) {
1754  auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
1755  unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
1756  TransferMedium = new llvm::GlobalVariable(
1757  M, Ty, /*isConstant=*/false, llvm::GlobalVariable::WeakAnyLinkage,
1758  llvm::UndefValue::get(Ty), TransferMediumName,
1759  /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
1760  SharedAddressSpace);
1761  CGM.addCompilerUsedGlobal(TransferMedium);
1762  }
1763 
1764  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
1765  // Get the CUDA thread id of the current OpenMP thread on the GPU.
1766  llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
1767  // nvptx_lane_id = nvptx_id % warpsize
1768  llvm::Value *LaneID = getNVPTXLaneID(CGF);
1769  // nvptx_warp_id = nvptx_id / warpsize
1770  llvm::Value *WarpID = getNVPTXWarpID(CGF);
1771 
1772  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
1773  llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
1774  Address LocalReduceList(
1775  Bld.CreatePointerBitCastOrAddrSpaceCast(
1776  CGF.EmitLoadOfScalar(
1777  AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc,
1778  LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()),
1779  ElemTy->getPointerTo()),
1780  ElemTy, CGF.getPointerAlign());
1781 
1782  unsigned Idx = 0;
1783  for (const Expr *Private : Privates) {
1784  //
1785  // Warp master copies reduce element to transfer medium in __shared__
1786  // memory.
1787  //
1788  unsigned RealTySize =
1789  C.getTypeSizeInChars(Private->getType())
1790  .alignTo(C.getTypeAlignInChars(Private->getType()))
1791  .getQuantity();
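    // The copy below is decomposed into 4-, 2-, and 1-byte transfers; for
    // example (sketch), a 7-byte payload makes one 4-byte, one 2-byte, and
    // one 1-byte round trip through the transfer medium.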
1792  for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
1793  unsigned NumIters = RealTySize / TySize;
1794  if (NumIters == 0)
1795  continue;
1796  QualType CType = C.getIntTypeForBitwidth(
1797  C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1);
1798  llvm::Type *CopyType = CGF.ConvertTypeForMem(CType);
1799  CharUnits Align = CharUnits::fromQuantity(TySize);
1800  llvm::Value *Cnt = nullptr;
1801  Address CntAddr = Address::invalid();
1802  llvm::BasicBlock *PrecondBB = nullptr;
1803  llvm::BasicBlock *ExitBB = nullptr;
1804  if (NumIters > 1) {
1805  CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr");
1806  CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr,
1807  /*Volatile=*/false, C.IntTy);
1808  PrecondBB = CGF.createBasicBlock("precond");
1809  ExitBB = CGF.createBasicBlock("exit");
1810  llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body");
1811  // There is no need to emit line number for unconditional branch.
1812  (void)ApplyDebugLocation::CreateEmpty(CGF);
1813  CGF.EmitBlock(PrecondBB);
1814  Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc);
1815  llvm::Value *Cmp =
1816  Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters));
1817  Bld.CreateCondBr(Cmp, BodyBB, ExitBB);
1818  CGF.EmitBlock(BodyBB);
1819  }
1820  // kmpc_barrier.
1821  CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
1822  /*EmitChecks=*/false,
1823  /*ForceSimpleCall=*/true);
1824  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
1825  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
1826  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
1827 
1828  // if (lane_id == 0)
1829  llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
1830  Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
1831  CGF.EmitBlock(ThenBB);
1832 
1833  // Reduce element = LocalReduceList[i]
1834  Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
1835  llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
1836  ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
1837  // elemptr = ((CopyType*)(elemptrptr)) + I
1838  Address ElemPtr(ElemPtrPtr, CopyType, Align);
1839  if (NumIters > 1)
1840  ElemPtr = Bld.CreateGEP(CGF, ElemPtr, Cnt);
1841 
1842  // Get pointer to location in transfer medium.
1843  // MediumPtr = &medium[warp_id]
1844  llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
1845  TransferMedium->getValueType(), TransferMedium,
1846  {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
1847  // Casting to actual data type.
1848  // MediumPtr = (CopyType*)MediumPtrAddr;
1849  Address MediumPtr(MediumPtrVal, CopyType, Align);
1850 
1851  // elem = *elemptr
1852  //*MediumPtr = elem
1853  llvm::Value *Elem = CGF.EmitLoadOfScalar(
1854  ElemPtr, /*Volatile=*/false, CType, Loc,
1855  LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
1856  // Store the source element value to the dest element address.
1857  CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/true, CType,
1858  LValueBaseInfo(AlignmentSource::Type),
1859  TBAAAccessInfo());
1860 
1861  Bld.CreateBr(MergeBB);
1862 
1863  CGF.EmitBlock(ElseBB);
1864  Bld.CreateBr(MergeBB);
1865 
1866  CGF.EmitBlock(MergeBB);
1867 
1868  // kmpc_barrier.
1869  CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
1870  /*EmitChecks=*/false,
1871  /*ForceSimpleCall=*/true);
1872 
1873  //
1874  // Warp 0 copies reduce element from transfer medium.
1875  //
1876  llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
1877  llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
1878  llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");
1879 
1880  Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
1881  llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
1882  AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, Loc);
1883 
1884  // Up to 32 threads in warp 0 are active.
1885  llvm::Value *IsActiveThread =
1886  Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
1887  Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
1888 
1889  CGF.EmitBlock(W0ThenBB);
1890 
1891  // SrcMediumPtr = &medium[tid]
1892  llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
1893  TransferMedium->getValueType(), TransferMedium,
1894  {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
1895  // SrcMediumVal = *SrcMediumPtr;
1896  Address SrcMediumPtr(SrcMediumPtrVal, CopyType, Align);
1897 
1898  // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
1899  Address TargetElemPtrPtr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
1900  llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
1901  TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, Loc);
1902  Address TargetElemPtr(TargetElemPtrVal, CopyType, Align);
1903  if (NumIters > 1)
1904  TargetElemPtr = Bld.CreateGEP(CGF, TargetElemPtr, Cnt);
1905 
1906  // *TargetElemPtr = SrcMediumVal;
1907  llvm::Value *SrcMediumValue =
1908  CGF.EmitLoadOfScalar(SrcMediumPtr, /*Volatile=*/true, CType, Loc);
1909  CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
1910  CType);
1911  Bld.CreateBr(W0MergeBB);
1912 
1913  CGF.EmitBlock(W0ElseBB);
1914  Bld.CreateBr(W0MergeBB);
1915 
1916  CGF.EmitBlock(W0MergeBB);
1917 
1918  if (NumIters > 1) {
1919  Cnt = Bld.CreateNSWAdd(Cnt, llvm::ConstantInt::get(CGM.IntTy, /*V=*/1));
1920  CGF.EmitStoreOfScalar(Cnt, CntAddr, /*Volatile=*/false, C.IntTy);
1921  CGF.EmitBranch(PrecondBB);
1922  (void)ApplyDebugLocation::CreateEmpty(CGF);
1923  CGF.EmitBlock(ExitBB);
1924  }
1925  RealTySize %= TySize;
1926  }
1927  ++Idx;
1928  }
1929 
1930  CGF.FinishFunction();
1931  return Fn;
1932 }
1933 
1934 /// Emit a helper that reduces data across two OpenMP threads (lanes)
1935 /// in the same warp. It uses shuffle instructions to copy over data from
1936 /// a remote lane's stack. The reduction algorithm performed is specified
1937 /// by the fourth parameter.
1938 ///
1939 /// Algorithm Versions.
1940 /// Full Warp Reduce (argument value 0):
1941 /// This algorithm assumes that all 32 lanes are active and gathers
1942 /// data from these 32 lanes, producing a single resultant value.
1943 /// Contiguous Partial Warp Reduce (argument value 1):
1944 /// This algorithm assumes that only a *contiguous* subset of lanes
1945 /// are active. This happens for the last warp in a parallel region
1946 /// when the user-specified num_threads is not an integer multiple of
1947 /// 32. This contiguous subset always starts with the zeroth lane.
1948 /// Partial Warp Reduce (argument value 2):
1949 /// This algorithm gathers data from any number of lanes at any position.
1950 /// All reduced values are stored in the lowest possible lane. The set
1951 /// of problems every algorithm addresses is a superset of those
1952 /// addressable by algorithms with a lower version number. Overhead
1953 /// increases as algorithm version increases.
1954 ///
1955 /// Terminology
1956 /// Reduce element:
1957 /// Reduce element refers to an individual data field of primitive
1958 /// type that is combined and reduced across threads.
1959 /// Reduce list:
1960 /// Reduce list refers to a collection of local, thread-private
1961 /// reduce elements.
1962 /// Remote Reduce list:
1963 /// Remote Reduce list refers to a collection of remote (relative to
1964 /// the current thread) reduce elements.
1965 ///
1966 /// We distinguish between three states of threads that are important to
1967 /// the implementation of this function.
1968 /// Alive threads:
1969 /// Threads in a warp executing the SIMT instruction, as distinguished from
1970 /// threads that are inactive due to divergent control flow.
1971 /// Active threads:
1972 /// The minimal set of threads that has to be alive upon entry to this
1973 /// function. The computation is correct iff active threads are alive.
1974 /// Some threads are alive but they are not active because they do not
1975 /// contribute to the computation in any useful manner. Turning them off
1976 /// may introduce control flow overheads without any tangible benefits.
1977 /// Effective threads:
1978 /// In order to comply with the argument requirements of the shuffle
1979 /// function, we must keep all lanes holding data alive. But at most
1980 /// half of them perform value aggregation; we refer to this half of
1981 /// threads as effective. The other half is simply handing off their
1982 /// data.
1983 ///
1984 /// Procedure
1985 /// Value shuffle:
1986 /// In this step active threads transfer data from higher lane positions
1987 /// in the warp to lower lane positions, creating Remote Reduce list.
1988 /// Value aggregation:
1989 /// In this step, effective threads combine their thread local Reduce list
1990 /// with Remote Reduce list and store the result in the thread local
1991 /// Reduce list.
1992 /// Value copy:
1993 /// In this step, we deal with the assumption made by algorithm 2
1994 /// (i.e. contiguity assumption). When we have an odd number of lanes
1995 /// active, say 2k+1, only k threads will be effective and therefore k
1996 /// new values will be produced. However, the Reduce list owned by the
1997 /// (2k+1)th thread is ignored in the value aggregation. Therefore
1998 /// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
1999 /// that the contiguity assumption still holds.
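/// Condensed into C, the predicates emitted below are (sketch):
///
///   bool do_reduce = AlgoVer == 0 ||
///                    (AlgoVer == 1 && LaneId < RemoteLaneOffset) ||
///                    (AlgoVer == 2 && (LaneId & 1) == 0 &&
///                     RemoteLaneOffset > 0);
///   if (do_reduce)
///     reduce_function(LocalReduceList, RemoteReduceList);
///   if (AlgoVer == 1 && LaneId >= RemoteLaneOffset) // restore contiguity
///     copy(RemoteReduceList, LocalReduceList);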
2000 static llvm::Function *emitShuffleAndReduceFunction(
2001  CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2002  QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc) {
2003  ASTContext &C = CGM.getContext();
2004 
2005  // Thread local Reduce list used to host the values of data to be reduced.
2006  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2007  C.VoidPtrTy, ImplicitParamKind::Other);
2008  // Current lane id; could be logical.
2009  ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,
2010  ImplicitParamKind::Other);
2011  // Offset of the remote source lane relative to the current lane.
2012  ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2013  C.ShortTy, ImplicitParamKind::Other);
2014  // Algorithm version. This is expected to be known at compile time.
2015  ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2016  C.ShortTy, ImplicitParamKind::Other);
2017  FunctionArgList Args;
2018  Args.push_back(&ReduceListArg);
2019  Args.push_back(&LaneIDArg);
2020  Args.push_back(&RemoteLaneOffsetArg);
2021  Args.push_back(&AlgoVerArg);
2022 
2023  const CGFunctionInfo &CGFI =
2024  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2025  auto *Fn = llvm::Function::Create(
2026  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2027  "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
2028  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2029  Fn->setDoesNotRecurse();
2030 
2031  CodeGenFunction CGF(CGM);
2032  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2033 
2034  CGBuilderTy &Bld = CGF.Builder;
2035 
2036  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2037  llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
2038  Address LocalReduceList(
2039  Bld.CreatePointerBitCastOrAddrSpaceCast(
2040  CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2041  C.VoidPtrTy, SourceLocation()),
2042  ElemTy->getPointerTo()),
2043  ElemTy, CGF.getPointerAlign());
2044 
2045  Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
2046  llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
2047  AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2048 
2049  Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
2050  llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
2051  AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2052 
2053  Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
2054  llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
2055  AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2056 
2057  // Create a local thread-private variable to host the Reduce list
2058  // from a remote lane.
2059  Address RemoteReduceList =
2060  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");
2061 
2062  // This loop iterates through the list of reduce elements and copies,
2063  // element by element, from a remote lane in the warp to RemoteReduceList,
2064  // hosted on the thread's stack.
2065  emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
2066  LocalReduceList, RemoteReduceList,
2067  {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
2068  /*ScratchpadIndex=*/nullptr,
2069  /*ScratchpadWidth=*/nullptr});
2070 
2071  // The actions to be performed on the Remote Reduce list are dependent
2072  // on the algorithm version.
2073  //
2074  // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
2075  // LaneId % 2 == 0 && Offset > 0):
2076  // do the reduction value aggregation
2077  //
2078  // The thread local variable Reduce list is mutated in place to host the
2079  // reduced data, which is the aggregated value produced from local and
2080  // remote lanes.
2081  //
2082  // Note that AlgoVer is expected to be a constant integer known at compile
2083  // time.
2084  // When AlgoVer==0, the first conjunction evaluates to true, making
2085  // the entire predicate true at compile time.
2086  // When AlgoVer==1, only the second part of the second conjunction needs
2087  // to be evaluated at runtime; the other conjunctions fold to false
2088  // at compile time.
2089  // When AlgoVer==2, only the second part of the third conjunction needs
2090  // to be evaluated at runtime; the other conjunctions fold to false
2091  // at compile time.
2092  llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal);
2093 
2094  llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
2095  llvm::Value *CondAlgo1 = Bld.CreateAnd(
2096  Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));
2097 
2098  llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
2099  llvm::Value *CondAlgo2 = Bld.CreateAnd(
2100  Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1))));
2101  CondAlgo2 = Bld.CreateAnd(
2102  CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));
2103 
2104  llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
2105  CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);
2106 
2107  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
2108  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
2109  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
2110  Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
2111 
2112  CGF.EmitBlock(ThenBB);
2113  // reduce_function(LocalReduceList, RemoteReduceList)
2114  llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2115  LocalReduceList.emitRawPointer(CGF), CGF.VoidPtrTy);
2116  llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2117  RemoteReduceList.emitRawPointer(CGF), CGF.VoidPtrTy);
2118  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2119  CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
2120  Bld.CreateBr(MergeBB);
2121 
2122  CGF.EmitBlock(ElseBB);
2123  Bld.CreateBr(MergeBB);
2124 
2125  CGF.EmitBlock(MergeBB);
2126 
2127  // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
2128  // Reduce list.
2129  Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
2130  llvm::Value *CondCopy = Bld.CreateAnd(
2131  Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));
2132 
2133  llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
2134  llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
2135  llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
2136  Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
2137 
2138  CGF.EmitBlock(CpyThenBB);
2139  emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
2140  RemoteReduceList, LocalReduceList);
2141  Bld.CreateBr(CpyMergeBB);
2142 
2143  CGF.EmitBlock(CpyElseBB);
2144  Bld.CreateBr(CpyMergeBB);
2145 
2146  CGF.EmitBlock(CpyMergeBB);
2147 
2148  CGF.FinishFunction();
2149  return Fn;
2150 }
2151 
2152 /// This function emits a helper that copies all the reduction variables from
2153 /// the team into the provided global buffer for the reduction variables.
2154 ///
2155 /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
2156 /// For all data entries D in reduce_data:
2157 /// Copy local D to buffer.D[Idx]
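/// In C terms the emitted helper behaves like (editor's sketch; 'Rec' and
/// the D_i/T_i field and type names are placeholders for the record built
/// from the reduction variables):
///
///   void list_to_global_copy(Rec *buffer, int Idx, void **reduce_data) {
///     buffer[Idx].D_0 = *(T_0 *)reduce_data[0];
///     ...
///     buffer[Idx].D_n = *(T_n *)reduce_data[n];
///   }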
2158 static llvm::Value *emitListToGlobalCopyFunction(
2159  CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2160  QualType ReductionArrayTy, SourceLocation Loc,
2161  const RecordDecl *TeamReductionRec,
2162  const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2163  &VarFieldMap) {
2164  ASTContext &C = CGM.getContext();
2165 
2166  // Buffer: global reduction buffer.
2167  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2168  C.VoidPtrTy, ImplicitParamKind::Other);
2169  // Idx: index of the buffer.
2170  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2171  ImplicitParamKind::Other);
2172  // ReduceList: thread local Reduce list.
2173  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2174  C.VoidPtrTy, ImplicitParamKind::Other);
2175  FunctionArgList Args;
2176  Args.push_back(&BufferArg);
2177  Args.push_back(&IdxArg);
2178  Args.push_back(&ReduceListArg);
2179 
2180  const CGFunctionInfo &CGFI =
2181  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2182  auto *Fn = llvm::Function::Create(
2183  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2184  "_omp_reduction_list_to_global_copy_func", &CGM.getModule());
2185  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2186  Fn->setDoesNotRecurse();
2187  CodeGenFunction CGF(CGM);
2188  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2189 
2190  CGBuilderTy &Bld = CGF.Builder;
2191 
2192  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2193  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2194  llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
2195  Address LocalReduceList(
2196  Bld.CreatePointerBitCastOrAddrSpaceCast(
2197  CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2198  C.VoidPtrTy, Loc),
2199  ElemTy->getPointerTo()),
2200  ElemTy, CGF.getPointerAlign());
2201  QualType StaticTy = C.getRecordType(TeamReductionRec);
2202  llvm::Type *LLVMReductionsBufferTy =
2203  CGM.getTypes().ConvertTypeForMem(StaticTy);
2204  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2205  CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2206  LLVMReductionsBufferTy->getPointerTo());
2207  llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2208  /*Volatile=*/false, C.IntTy,
2209  Loc)};
2210  unsigned Idx = 0;
2211  for (const Expr *Private : Privates) {
2212  // Reduce element = LocalReduceList[i]
2213  Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
2214  llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
2215  ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
2216  // elemptr = ((CopyType*)(elemptrptr)) + I
2217  ElemTy = CGF.ConvertTypeForMem(Private->getType());
2218  ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2219  ElemPtrPtr, ElemTy->getPointerTo());
2220  Address ElemPtr =
2221  Address(ElemPtrPtr, ElemTy, C.getTypeAlignInChars(Private->getType()));
2222  const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
2223  // Global = Buffer.VD[Idx];
2224  const FieldDecl *FD = VarFieldMap.lookup(VD);
2225  llvm::Value *BufferPtr =
2226  Bld.CreateInBoundsGEP(LLVMReductionsBufferTy, BufferArrPtr, Idxs);
2227  LValue GlobLVal = CGF.EmitLValueForField(
2228  CGF.MakeNaturalAlignRawAddrLValue(BufferPtr, StaticTy), FD);
2229  Address GlobAddr = GlobLVal.getAddress();
2230  GlobLVal.setAddress(Address(GlobAddr.emitRawPointer(CGF),
2231  CGF.ConvertTypeForMem(Private->getType()),
2232  GlobAddr.getAlignment()));
2233  switch (CGF.getEvaluationKind(Private->getType())) {
2234  case TEK_Scalar: {
2235  llvm::Value *V = CGF.EmitLoadOfScalar(
2236  ElemPtr, /*Volatile=*/false, Private->getType(), Loc,
2237  LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
2238  CGF.EmitStoreOfScalar(V, GlobLVal);
2239  break;
2240  }
2241  case TEK_Complex: {
2242  CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(
2243  CGF.MakeAddrLValue(ElemPtr, Private->getType()), Loc);
2244  CGF.EmitStoreOfComplex(V, GlobLVal, /*isInit=*/false);
2245  break;
2246  }
2247  case TEK_Aggregate:
2248  CGF.EmitAggregateCopy(GlobLVal,
2249  CGF.MakeAddrLValue(ElemPtr, Private->getType()),
2250  Private->getType(), AggValueSlot::DoesNotOverlap);
2251  break;
2252  }
2253  ++Idx;
2254  }
2255 
2256  CGF.FinishFunction();
2257  return Fn;
2258 }
2259 
2260 /// This function emits a helper that reduces all the reduction variables from
2261 /// the team into the provided global buffer for the reduction variables.
2262 ///
2263 /// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
2264 /// void *GlobPtrs[];
2265 /// GlobPtrs[0] = (void*)&buffer.D0[Idx];
2266 /// ...
2267 /// GlobPtrs[N] = (void*)&buffer.DN[Idx];
2268 /// reduce_function(GlobPtrs, reduce_data);
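/// For the running foo/bar example this boils down to (sketch):
///
///   void *GlobPtrs[2] = {&buffer[Idx].foo, &buffer[Idx].bar};
///   reduce_function(GlobPtrs, reduce_data); // the buffer slot accumulates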
2269 static llvm::Value *emitListToGlobalReduceFunction(
2270  CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2271  QualType ReductionArrayTy, SourceLocation Loc,
2272  const RecordDecl *TeamReductionRec,
2273  const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2274  &VarFieldMap,
2275  llvm::Function *ReduceFn) {
2276  ASTContext &C = CGM.getContext();
2277 
2278  // Buffer: global reduction buffer.
2279  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2280  C.VoidPtrTy, ImplicitParamKind::Other);
2281  // Idx: index of the buffer.
2282  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2283  ImplicitParamKind::Other);
2284  // ReduceList: thread local Reduce list.
2285  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2286  C.VoidPtrTy, ImplicitParamKind::Other);
2287  FunctionArgList Args;
2288  Args.push_back(&BufferArg);
2289  Args.push_back(&IdxArg);
2290  Args.push_back(&ReduceListArg);
2291 
2292  const CGFunctionInfo &CGFI =
2293  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2294  auto *Fn = llvm::Function::Create(
2295  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2296  "_omp_reduction_list_to_global_reduce_func", &CGM.getModule());
2297  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2298  Fn->setDoesNotRecurse();
2299  CodeGenFunction CGF(CGM);
2300  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2301 
2302  CGBuilderTy &Bld = CGF.Builder;
2303 
2304  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2305  QualType StaticTy = C.getRecordType(TeamReductionRec);
2306  llvm::Type *LLVMReductionsBufferTy =
2307  CGM.getTypes().ConvertTypeForMem(StaticTy);
2308  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2309  CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2310  LLVMReductionsBufferTy->getPointerTo());
2311 
2312  // 1. Build a list of reduction variables.
2313  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2314  RawAddress ReductionList =
2315  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
2316  auto IPriv = Privates.begin();
2317  llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2318  /*Volatile=*/false, C.IntTy,
2319  Loc)};
2320  unsigned Idx = 0;
2321  for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
2322  Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2323  // Global = Buffer.VD[Idx];
2324  const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
2325  const FieldDecl *FD = VarFieldMap.lookup(VD);
2326  llvm::Value *BufferPtr =
2327  Bld.CreateInBoundsGEP(LLVMReductionsBufferTy, BufferArrPtr, Idxs);
2328  LValue GlobLVal = CGF.EmitLValueForField(
2329  CGF.MakeNaturalAlignRawAddrLValue(BufferPtr, StaticTy), FD);
2330  Address GlobAddr = GlobLVal.getAddress();
2331  CGF.EmitStoreOfScalar(GlobAddr.emitRawPointer(CGF), Elem,
2332  /*Volatile=*/false, C.VoidPtrTy);
2333  if ((*IPriv)->getType()->isVariablyModifiedType()) {
2334  // Store array size.
2335  ++Idx;
2336  Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2337  llvm::Value *Size = CGF.Builder.CreateIntCast(
2338  CGF.getVLASize(
2339  CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
2340  .NumElts,
2341  CGF.SizeTy, /*isSigned=*/false);
2342  CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
2343  Elem);
2344  }
2345  }
2346 
2347  // Call reduce_function(GlobalReduceList, ReduceList)
2348  llvm::Value *GlobalReduceList = ReductionList.getPointer();
2349  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2350  llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
2351  AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
2352  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2353  CGF, Loc, ReduceFn, {GlobalReduceList, ReducedPtr});
2354  CGF.FinishFunction();
2355  return Fn;
2356 }
2357 
2358 /// This function emits a helper that copies all the reduction variables from
2359 /// the provided global buffer back into the thread-local reduce list.
2360 ///
2361 /// void global_to_list_copy_func(void *buffer, int Idx, void *reduce_data)
2362 /// For all data entries D in reduce_data:
2363 /// Copy buffer.D[Idx] to local D;
2364 static llvm::Value *emitGlobalToListCopyFunction(
2365  CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2366  QualType ReductionArrayTy, SourceLocation Loc,
2367  const RecordDecl *TeamReductionRec,
2368  const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2369  &VarFieldMap) {
2370  ASTContext &C = CGM.getContext();
2371 
2372  // Buffer: global reduction buffer.
2373  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2374  C.VoidPtrTy, ImplicitParamKind::Other);
2375  // Idx: index of the buffer.
2376  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2377  ImplicitParamKind::Other);
2378  // ReduceList: thread local Reduce list.
2379  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2380  C.VoidPtrTy, ImplicitParamKind::Other);
2381  FunctionArgList Args;
2382  Args.push_back(&BufferArg);
2383  Args.push_back(&IdxArg);
2384  Args.push_back(&ReduceListArg);
2385 
2386  const CGFunctionInfo &CGFI =
2387  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2388  auto *Fn = llvm::Function::Create(
2389  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2390  "_omp_reduction_global_to_list_copy_func", &CGM.getModule());
2391  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2392  Fn->setDoesNotRecurse();
2393  CodeGenFunction CGF(CGM);
2394  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2395 
2396  CGBuilderTy &Bld = CGF.Builder;
2397 
2398  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2399  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2400  llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
2401  Address LocalReduceList(
2402  Bld.CreatePointerBitCastOrAddrSpaceCast(
2403  CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2404  C.VoidPtrTy, Loc),
2405  ElemTy->getPointerTo()),
2406  ElemTy, CGF.getPointerAlign());
2407  QualType StaticTy = C.getRecordType(TeamReductionRec);
2408  llvm::Type *LLVMReductionsBufferTy =
2409  CGM.getTypes().ConvertTypeForMem(StaticTy);
2410  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2411  CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2412  LLVMReductionsBufferTy->getPointerTo());
2413 
2414  llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2415  /*Volatile=*/false, C.IntTy,
2416  Loc)};
2417  unsigned Idx = 0;
2418  for (const Expr *Private : Privates) {
2419  // Reduce element = LocalReduceList[i]
2420  Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
2421  llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
2422  ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
2423  // elemptr = ((CopyType*)(elemptrptr)) + I
2424  ElemTy = CGF.ConvertTypeForMem(Private->getType());
2425  ElemPtrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2426  ElemPtrPtr, ElemTy->getPointerTo());
2427  Address ElemPtr =
2428  Address(ElemPtrPtr, ElemTy, C.getTypeAlignInChars(Private->getType()));
2429  const ValueDecl *VD = cast<DeclRefExpr>(Private)->getDecl();
2430  // Global = Buffer.VD[Idx];
2431  const FieldDecl *FD = VarFieldMap.lookup(VD);
2432  llvm::Value *BufferPtr =
2433  Bld.CreateInBoundsGEP(LLVMReductionsBufferTy, BufferArrPtr, Idxs);
2434  LValue GlobLVal = CGF.EmitLValueForField(
2435  CGF.MakeNaturalAlignRawAddrLValue(BufferPtr, StaticTy), FD);
2436  Address GlobAddr = GlobLVal.getAddress();
2437  GlobLVal.setAddress(Address(GlobAddr.emitRawPointer(CGF),
2438  CGF.ConvertTypeForMem(Private->getType()),
2439  GlobAddr.getAlignment()));
2440  switch (CGF.getEvaluationKind(Private->getType())) {
2441  case TEK_Scalar: {
2442  llvm::Value *V = CGF.EmitLoadOfScalar(GlobLVal, Loc);
2443  CGF.EmitStoreOfScalar(V, ElemPtr, /*Volatile=*/false, Private->getType(),
2444  LValueBaseInfo(AlignmentSource::Type),
2445  TBAAAccessInfo());
2446  break;
2447  }
2448  case TEK_Complex: {
2449  CodeGenFunction::ComplexPairTy V = CGF.EmitLoadOfComplex(GlobLVal, Loc);
2450  CGF.EmitStoreOfComplex(V, CGF.MakeAddrLValue(ElemPtr, Private->getType()),
2451  /*isInit=*/false);
2452  break;
2453  }
2454  case TEK_Aggregate:
2455  CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()),
2456  GlobLVal, Private->getType(),
2457  AggValueSlot::DoesNotOverlap);
2458  break;
2459  }
2460  ++Idx;
2461  }
2462 
2463  CGF.FinishFunction();
2464  return Fn;
2465 }
2466 
2467 /// This function emits a helper that reduces the reduction variables in the
2468 /// provided global buffer into the thread-local reduce list.
2469 ///
2470 /// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
2471 /// void *GlobPtrs[];
2472 /// GlobPtrs[0] = (void*)&buffer.D0[Idx];
2473 /// ...
2474 /// GlobPtrs[N] = (void*)&buffer.DN[Idx];
2475 /// reduce_function(reduce_data, GlobPtrs);
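/// This is the mirror image of list_to_global_reduce_func: the thread-local
/// reduce list is now the accumulating operand (sketch):
///
///   void *GlobPtrs[2] = {&buffer[Idx].foo, &buffer[Idx].bar};
///   reduce_function(reduce_data, GlobPtrs); // the local copy accumulates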
2476 static llvm::Value *emitGlobalToListReduceFunction(
2477  CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2478  QualType ReductionArrayTy, SourceLocation Loc,
2479  const RecordDecl *TeamReductionRec,
2480  const llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
2481  &VarFieldMap,
2482  llvm::Function *ReduceFn) {
2483  ASTContext &C = CGM.getContext();
2484 
2485  // Buffer: global reduction buffer.
2486  ImplicitParamDecl BufferArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2487  C.VoidPtrTy, ImplicitParamKind::Other);
2488  // Idx: index of the buffer.
2489  ImplicitParamDecl IdxArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.IntTy,
2490  ImplicitParamKind::Other);
2491  // ReduceList: thread local Reduce list.
2492  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2493  C.VoidPtrTy, ImplicitParamKind::Other);
2494  FunctionArgList Args;
2495  Args.push_back(&BufferArg);
2496  Args.push_back(&IdxArg);
2497  Args.push_back(&ReduceListArg);
2498 
2499  const CGFunctionInfo &CGFI =
2500  CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2501  auto *Fn = llvm::Function::Create(
2502  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2503  "_omp_reduction_global_to_list_reduce_func", &CGM.getModule());
2504  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
2505  Fn->setDoesNotRecurse();
2506  CodeGenFunction CGF(CGM);
2507  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
2508 
2509  CGBuilderTy &Bld = CGF.Builder;
2510 
2511  Address AddrBufferArg = CGF.GetAddrOfLocalVar(&BufferArg);
2512  QualType StaticTy = C.getRecordType(TeamReductionRec);
2513  llvm::Type *LLVMReductionsBufferTy =
2514  CGM.getTypes().ConvertTypeForMem(StaticTy);
2515  llvm::Value *BufferArrPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2516  CGF.EmitLoadOfScalar(AddrBufferArg, /*Volatile=*/false, C.VoidPtrTy, Loc),
2517  LLVMReductionsBufferTy->getPointerTo());
2518 
2519  // 1. Build a list of reduction variables.
2520  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2521  Address ReductionList =
2522  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
2523  auto IPriv = Privates.begin();
2524  llvm::Value *Idxs[] = {CGF.EmitLoadOfScalar(CGF.GetAddrOfLocalVar(&IdxArg),
2525  /*Volatile=*/false, C.IntTy,
2526  Loc)};
2527  unsigned Idx = 0;
2528  for (unsigned I = 0, E = Privates.size(); I < E; ++I, ++IPriv, ++Idx) {
2529  Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2530  // Global = Buffer.VD[Idx];
2531  const ValueDecl *VD = cast<DeclRefExpr>(*IPriv)->getDecl();
2532  const FieldDecl *FD = VarFieldMap.lookup(VD);
2533  llvm::Value *BufferPtr =
2534  Bld.CreateInBoundsGEP(LLVMReductionsBufferTy, BufferArrPtr, Idxs);
2535  LValue GlobLVal = CGF.EmitLValueForField(
2536  CGF.MakeNaturalAlignRawAddrLValue(BufferPtr, StaticTy), FD);
2537  Address GlobAddr = GlobLVal.getAddress();
2538  CGF.EmitStoreOfScalar(GlobAddr.emitRawPointer(CGF), Elem,
2539  /*Volatile=*/false, C.VoidPtrTy);
2540  if ((*IPriv)->getType()->isVariablyModifiedType()) {
2541  // Store array size.
2542  ++Idx;
2543  Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2544  llvm::Value *Size = CGF.Builder.CreateIntCast(
2545  CGF.getVLASize(
2546  CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
2547  .NumElts,
2548  CGF.SizeTy, /*isSigned=*/false);
2549  CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
2550  Elem);
2551  }
2552  }
2553 
2554  // Call reduce_function(ReduceList, GlobalReduceList)
2555  llvm::Value *GlobalReduceList = ReductionList.emitRawPointer(CGF);
2556  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2557  llvm::Value *ReducedPtr = CGF.EmitLoadOfScalar(
2558  AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
2559  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2560  CGF, Loc, ReduceFn, {ReducedPtr, GlobalReduceList});
2561  CGF.FinishFunction();
2562  return Fn;
2563 }
2564 
2565 ///
2566 /// Design of OpenMP reductions on the GPU
2567 ///
2568 /// Consider a typical OpenMP program with one or more reduction
2569 /// clauses:
2570 ///
2571 /// float foo;
2572 /// double bar;
2573 /// #pragma omp target teams distribute parallel for \
2574 /// reduction(+:foo) reduction(*:bar)
2575 /// for (int i = 0; i < N; i++) {
2576 /// foo += A[i]; bar *= B[i];
2577 /// }
2578 ///
2579 /// where 'foo' and 'bar' are reduced across all OpenMP threads in
2580 /// all teams. In our OpenMP implementation on the NVPTX device an
2581 /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
2582 /// within a team are mapped to CUDA threads within a threadblock.
2583 /// Our goal is to efficiently aggregate values across all OpenMP
2584 /// threads such that:
2585 ///
2586 /// - the compiler and runtime are logically concise, and
2587 /// - the reduction is performed efficiently in a hierarchical
2588 /// manner as follows: within OpenMP threads in the same warp,
2589 /// across warps in a threadblock, and finally across teams on
2590 /// the NVPTX device.
2591 ///
2592 /// Introduction to Decoupling
2593 ///
2594 /// We would like to decouple the compiler and the runtime so that the
2595 /// latter is ignorant of the reduction variables (number, data types)
2596 /// and the reduction operators. This allows a simpler interface
2597 /// and implementation while still attaining good performance.
2598 ///
2599 /// Pseudocode for the aforementioned OpenMP program generated by the
2600 /// compiler is as follows:
2601 ///
2602 /// 1. Create private copies of reduction variables on each OpenMP
2603 /// thread: 'foo_private', 'bar_private'
2604 /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
2605 /// to it and writes the result in 'foo_private' and 'bar_private'
2606 /// respectively.
2607 /// 3. Call the OpenMP runtime on the GPU to reduce within a team
2608 /// and store the result on the team master:
2609 ///
2610 /// __kmpc_nvptx_parallel_reduce_nowait_v2(...,
2611 /// reduceData, shuffleReduceFn, interWarpCpyFn)
2612 ///
2613 /// where:
2614 /// struct ReduceData {
2615 /// double *foo;
2616 /// double *bar;
2617 /// } reduceData
2618 /// reduceData.foo = &foo_private
2619 /// reduceData.bar = &bar_private
2620 ///
2621 /// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
2622 /// auxiliary functions generated by the compiler that operate on
2623 /// variables of type 'ReduceData'. They help the runtime perform
2624 /// algorithmic steps in a data-agnostic manner.
2625 ///
2626 /// 'shuffleReduceFn' is a pointer to a function that reduces data
2627 /// of type 'ReduceData' across two OpenMP threads (lanes) in the
2628 /// same warp. It takes the following arguments as input:
2629 ///
2630 /// a. variable of type 'ReduceData' on the calling lane,
2631 /// b. its lane_id,
2632 /// c. an offset relative to the current lane_id to generate a
2633 /// remote_lane_id. The remote lane contains the second
2634 /// variable of type 'ReduceData' that is to be reduced.
2635 /// d. an algorithm version parameter determining which reduction
2636 /// algorithm to use.
2637 ///
2638 /// 'shuffleReduceFn' retrieves data from the remote lane using
2639 /// efficient GPU shuffle intrinsics and reduces, using the
2640 /// algorithm specified by the 4th parameter, the two operands
2641 /// element-wise. The result is written to the first operand.
2642 ///
2643 /// Different reduction algorithms are implemented in different
2644 /// runtime functions, all calling 'shuffleReduceFn' to perform
2645 /// the essential reduction step. Therefore, based on the 4th
2646 /// parameter, this function behaves slightly differently to
2647 /// cooperate with the runtime to ensure correctness under
2648 /// different circumstances.
2649 ///
2650 /// 'InterWarpCpyFn' is a pointer to a function that transfers
2651 /// reduced variables across warps. It tunnels, through CUDA
2652 /// shared memory, the thread-private data of type 'ReduceData'
2653 /// from lane 0 of each warp to a lane in the first warp.
2654 /// 4. Call the OpenMP runtime on the GPU to reduce across teams.
2655 /// The last team writes the global reduced value to memory.
2656 ///
2657 /// ret = __kmpc_nvptx_teams_reduce_nowait(...,
2658 /// reduceData, shuffleReduceFn, interWarpCpyFn,
2659 /// scratchpadCopyFn, loadAndReduceFn)
2660 ///
2661 /// 'scratchpadCopyFn' is a helper that stores reduced
2662 /// data from the team master to a scratchpad array in
2663 /// global memory.
2664 ///
2665 /// 'loadAndReduceFn' is a helper that loads data from
2666 /// the scratchpad array and reduces it with the input
2667 /// operand.
2668 ///
2669 /// These compiler generated functions hide address
2670 /// calculation and alignment information from the runtime.
2671 /// 5. if ret == 1:
2672 /// The team master of the last team stores the reduced
2673 /// result to the globals in memory.
2674 /// foo += reduceData.foo; bar *= reduceData.bar
2675 ///
2676 ///
2677 /// Warp Reduction Algorithms
2678 ///
2679 /// On the warp level, we have three algorithms implemented in the
2680 /// OpenMP runtime depending on the number of active lanes:
2681 ///
2682 /// Full Warp Reduction
2683 ///
2684 /// The reduce algorithm within a warp where all lanes are active
2685 /// is implemented in the runtime as follows:
2686 ///
2687 /// full_warp_reduce(void *reduce_data,
2688 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
2689 /// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
2690 /// ShuffleReduceFn(reduce_data, 0, offset, 0);
2691 /// }
2692 ///
2693 /// The algorithm completes in log(2, WARPSIZE) steps.
2694 ///
2695 /// 'ShuffleReduceFn' is called here with lane_id set to 0 because lane_id
2696 /// is not used; we therefore save instructions by not retrieving it
2697 /// from the corresponding special registers. The 4th parameter, which
2698 /// represents the version of the algorithm being used, is set to 0 to
2699 /// signify full warp reduction.
2700 ///
2701 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2702 ///
2703 /// #reduce_elem refers to an element in the local lane's data structure
2704 /// #remote_elem is retrieved from a remote lane
2705 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2706 /// reduce_elem = reduce_elem REDUCE_OP remote_elem;
2707 ///
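/// On NVPTX, shuffle_down in these sketches corresponds to the warp shuffle
/// intrinsic; per 32-bit element the step is roughly (CUDA sketch, assuming
/// all lanes are active):
///
///   int remote = __shfl_down_sync(0xffffffffu, elem, offset);
///   elem = elem REDUCE_OP remote;
///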
2708 /// Contiguous Partial Warp Reduction
2709 ///
2710 /// This reduce algorithm is used within a warp where only the first
2711 /// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
2712 /// number of OpenMP threads in a parallel region is not a multiple of
2713 /// WARPSIZE. The algorithm is implemented in the runtime as follows:
2714 ///
2715 /// void
2716 /// contiguous_partial_reduce(void *reduce_data,
2717 /// kmp_ShuffleReductFctPtr ShuffleReduceFn,
2718 /// int size, int lane_id) {
2719 /// int curr_size;
2720 /// int offset;
2721 /// curr_size = size;
2722 /// offset = curr_size/2;
2723 /// while (offset > 0) {
2724 /// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
2725 /// curr_size = (curr_size+1)/2;
2726 /// offset = curr_size/2;
2727 /// }
2728 /// }
2729 ///
2730 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2731 ///
2732 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2733 /// if (lane_id < offset)
2734 /// reduce_elem = reduce_elem REDUCE_OP remote_elem
2735 /// else
2736 /// reduce_elem = remote_elem
2737 ///
2738 /// This algorithm assumes that the data to be reduced are located in a
2739 /// contiguous subset of lanes starting from the first. When there is
2740 /// an odd number of active lanes, the data in the last lane is not
2741 /// aggregated with any other lane's data but is instead copied over.
2742 ///
2743 /// Dispersed Partial Warp Reduction
2744 ///
2745 /// This algorithm is used within a warp when any discontiguous subset of
2746 /// lanes are active. It is used to implement the reduction operation
2747 /// across lanes in an OpenMP simd region or in a nested parallel region.
2748 ///
2749 /// void
2750 /// dispersed_partial_reduce(void *reduce_data,
2751 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
2752 /// int size, remote_id;
2753 /// int logical_lane_id = number_of_active_lanes_before_me() * 2;
2754 /// do {
2755 /// remote_id = next_active_lane_id_right_after_me();
2756 ///     # the above function returns 0 if no active lane
2757 /// # is present right after the current lane.
2758 /// size = number_of_active_lanes_in_this_warp();
2759 /// logical_lane_id /= 2;
2760 /// ShuffleReduceFn(reduce_data, logical_lane_id,
2761 /// remote_id-1-threadIdx.x, 2);
2762 /// } while (logical_lane_id % 2 == 0 && size > 1);
2763 /// }
2764 ///
2765 /// There is no assumption made about the initial state of the reduction.
2766 /// Any number of lanes (>=1) could be active at any position. The reduction
2767 /// result is returned in the first active lane.
2768 ///
2769 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2770 ///
2771 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2772 /// if (lane_id % 2 == 0 && offset > 0)
2773 /// reduce_elem = reduce_elem REDUCE_OP remote_elem
2774 /// else
2775 /// reduce_elem = remote_elem
2776 ///
2777 ///
2778 /// Intra-Team Reduction
2779 ///
2780 /// This function, as implemented in the runtime call
2781 /// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
2782 /// threads in a team. It first reduces within a warp using the
2783 /// aforementioned algorithms. We then proceed to gather all such
2784 /// reduced values at the first warp.
2785 ///
2786 /// The runtime makes use of the function 'InterWarpCpyFn', which copies
2787 /// data from each of the "warp masters" (the zeroth lane of each warp, where
2788 /// warp-reduced data is held) to the zeroth warp. This step reduces (in
2789 /// a mathematical sense) the problem of reduction across warp masters in
2790 /// a block to the problem of warp reduction.
2791 ///
2792 ///
2793 /// Inter-Team Reduction
2794 ///
2795 /// Once a team has reduced its data to a single value, it is stored in
2796 /// a global scratchpad array. Since each team has a distinct slot, this
2797 /// can be done without locking.
2798 ///
2799 /// The last team to write to the scratchpad array proceeds to reduce the
2800 /// scratchpad array. One or more workers in the last team use the helper
2801 /// 'loadAndReduceFn' to load and reduce values from the array, i.e.,
2802 /// the k'th worker reduces every k'th element.
2803 ///
2804 /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
2805 /// reduce across workers and compute a globally reduced value.
2806 ///
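/// Putting the pieces together for the foo/bar example, the code emitted by
/// emitReduction below is shaped roughly like this (editor's sketch; names
/// abbreviated, parallel-reduction case only):
///
///   void *RedList[2] = {&foo_private, &bar_private};
///   int ret = __kmpc_nvptx_parallel_reduce_nowait_v2(
///       loc, reduce_data_size, RedList, shuffleReduceFn, interWarpCpyFn);
///   if (ret == 1) {
///     foo += foo_private;
///     bar *= bar_private;
///   }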
2807 void CGOpenMPRuntimeGPU::emitReduction(
2808  CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
2809  ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
2810  ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
2811  if (!CGF.HaveInsertPoint())
2812  return;
2813 
2814  bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
2815 #ifndef NDEBUG
2816  bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
2817 #endif
2818 
2819  if (Options.SimpleReduction) {
2820  assert(!TeamsReduction && !ParallelReduction &&
2821  "Invalid reduction selection in emitReduction.");
2822  CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
2823  ReductionOps, Options);
2824  return;
2825  }
2826 
2827  assert((TeamsReduction || ParallelReduction) &&
2828  "Invalid reduction selection in emitReduction.");
2829 
2830  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> VarFieldMap;
2831  llvm::SmallVector<const ValueDecl *, 4> PrivatesReductions(Privates.size());
2832  int Cnt = 0;
2833  for (const Expr *DRE : Privates) {
2834  PrivatesReductions[Cnt] = cast<DeclRefExpr>(DRE)->getDecl();
2835  ++Cnt;
2836  }
2837 
2838  ASTContext &C = CGM.getContext();
2839  const RecordDecl *ReductionRec = ::buildRecordForGlobalizedVars(
2840  CGM.getContext(), PrivatesReductions, std::nullopt, VarFieldMap, 1);
2841 
2842  // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
2843  // RedList, shuffle_reduce_func, interwarp_copy_func);
2844  // or
2845  // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
2846  llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2847 
2848  llvm::Value *Res;
2849  // 1. Build a list of reduction variables.
2850  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2851  auto Size = RHSExprs.size();
2852  for (const Expr *E : Privates) {
2853  if (E->getType()->isVariablyModifiedType())
2854  // Reserve place for array size.
2855  ++Size;
2856  }
2857  llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
2858  QualType ReductionArrayTy = C.getConstantArrayType(
2859  C.VoidPtrTy, ArraySize, nullptr, ArraySizeModifier::Normal,
2860  /*IndexTypeQuals=*/0);
2861  Address ReductionList =
2862  CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
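  // E.g. (sketch) for 'reduction(+ : a) reduction(* : b)' where 'b' is a
  // VLA of n elements, the list built below is {&a_rhs, &b_rhs, (void *)n};
  // the extra slot carries the VLA element count.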
2863  auto IPriv = Privates.begin();
2864  unsigned Idx = 0;
2865  for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
2866  Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2867  CGF.Builder.CreateStore(
2868  CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2869  CGF.EmitLValue(RHSExprs[I]).getPointer(CGF), CGF.VoidPtrTy),
2870  Elem);
2871  if ((*IPriv)->getType()->isVariablyModifiedType()) {
2872  // Store array size.
2873  ++Idx;
2874  Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx);
2875  llvm::Value *Size = CGF.Builder.CreateIntCast(
2876  CGF.getVLASize(
2877  CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
2878  .NumElts,
2879  CGF.SizeTy, /*isSigned=*/false);
2880  CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
2881  Elem);
2882  }
2883  }
2884 
2885  llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2886  ReductionList.emitRawPointer(CGF), CGF.VoidPtrTy);
2887  llvm::Function *ReductionFn = emitReductionFunction(
2888  CGF.CurFn->getName(), Loc, CGF.ConvertTypeForMem(ReductionArrayTy),
2889  Privates, LHSExprs, RHSExprs, ReductionOps);
2890  llvm::Value *ReductionDataSize =
2891  CGF.getTypeSize(C.getRecordType(ReductionRec));
2892  ReductionDataSize =
2893  CGF.Builder.CreateSExtOrTrunc(ReductionDataSize, CGF.Int64Ty);
2894  llvm::Function *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
2895  CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
2896  llvm::Value *InterWarpCopyFn =
2897  emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
2898 
2899  if (ParallelReduction) {
2900  llvm::Value *Args[] = {RTLoc, ReductionDataSize, RL, ShuffleAndReduceFn,
2901  InterWarpCopyFn};
2902 
2903  Res = CGF.EmitRuntimeCall(
2904  OMPBuilder.getOrCreateRuntimeFunction(
2905  CGM.getModule(), OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2),
2906  Args);
2907  } else {
2908  assert(TeamsReduction && "expected teams reduction.");
2909  TeamsReductions.push_back(ReductionRec);
2910  auto *KernelTeamsReductionPtr = CGF.EmitRuntimeCall(
2911  OMPBuilder.getOrCreateRuntimeFunction(
2912  CGM.getModule(), OMPRTL___kmpc_reduction_get_fixed_buffer),
2913  {}, "_openmp_teams_reductions_buffer_$_$ptr");
2914  llvm::Value *GlobalToBufferCpyFn = ::emitListToGlobalCopyFunction(
2915  CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap);
2916  llvm::Value *GlobalToBufferRedFn = ::emitListToGlobalReduceFunction(
2917  CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap,
2918  ReductionFn);
2919  llvm::Value *BufferToGlobalCpyFn = ::emitGlobalToListCopyFunction(
2920  CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap);
2921  llvm::Value *BufferToGlobalRedFn = ::emitGlobalToListReduceFunction(
2922  CGM, Privates, ReductionArrayTy, Loc, ReductionRec, VarFieldMap,
2923  ReductionFn);
2924 
2925  llvm::Value *Args[] = {
2926  RTLoc,
2927  KernelTeamsReductionPtr,
2928  CGF.Builder.getInt32(C.getLangOpts().OpenMPCUDAReductionBufNum),
2929  ReductionDataSize,
2930  RL,
2931  ShuffleAndReduceFn,
2932  InterWarpCopyFn,
2933  GlobalToBufferCpyFn,
2934  GlobalToBufferRedFn,
2935  BufferToGlobalCpyFn,
2936  BufferToGlobalRedFn};
2937 
2938  Res = CGF.EmitRuntimeCall(
2939  OMPBuilder.getOrCreateRuntimeFunction(
2940  CGM.getModule(), OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2),
2941  Args);
2942  }
2943 
2944  // 5. Build if (res == 1)
2945  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".omp.reduction.done");
2946  llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".omp.reduction.then");
2947  llvm::Value *Cond = CGF.Builder.CreateICmpEQ(
2948  Res, llvm::ConstantInt::get(CGM.Int32Ty, /*V=*/1));
2949  CGF.Builder.CreateCondBr(Cond, ThenBB, ExitBB);
2950 
2951  // 6. Build then branch: where we have reduced values in the master
2952  // thread in each team.
2953  // __kmpc_end_reduce{_nowait}(<gtid>);
2954  // break;
2955  CGF.EmitBlock(ThenBB);
2956 
2957  // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
2958  auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
2959  this](CodeGenFunction &CGF, PrePostActionTy &Action) {
2960  auto IPriv = Privates.begin();
2961  auto ILHS = LHSExprs.begin();
2962  auto IRHS = RHSExprs.begin();
2963  for (const Expr *E : ReductionOps) {
2964  emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
2965  cast<DeclRefExpr>(*IRHS));
2966  ++IPriv;
2967  ++ILHS;
2968  ++IRHS;
2969  }
2970  };
2971  RegionCodeGenTy RCG(CodeGen);
2972  RCG(CGF);
2973  // There is no need to emit line number for unconditional branch.
2974  (void)ApplyDebugLocation::CreateEmpty(CGF);
2975  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
2976 }
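//
// Illustration (a hedged sketch with hypothetical names, not the exact
// emitted IR): for a directive such as
//
//   #pragma omp target parallel for reduction(+ : sum)
//
// the ParallelReduction branch above emits, roughly,
//
//   void *RedList[1] = {&sum_priv};
//   int Res = __kmpc_nvptx_parallel_reduce_nowait_v2(
//       &Loc, /*ReductionDataSize=*/sizeof(sum), RedList, ShuffleAndReduceFn,
//       InterWarpCopyFn);
//   if (Res == 1)
//     sum += sum_priv;  // combiner emitted by the CodeGen lambda above
//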
2977 
2978 const VarDecl *
2979 CGOpenMPRuntimeGPU::translateParameter(const FieldDecl *FD,
2980  const VarDecl *NativeParam) const {
2981  if (!NativeParam->getType()->isReferenceType())
2982  return NativeParam;
2983  QualType ArgType = NativeParam->getType();
2984  QualifierCollector QC;
2985  const Type *NonQualTy = QC.strip(ArgType);
2986  QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
2987  if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
2988  if (Attr->getCaptureKind() == OMPC_map) {
2989  PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
2990  LangAS::opencl_global);
2991  }
2992  }
2993  ArgType = CGM.getContext().getPointerType(PointeeTy);
2994  QC.addRestrict();
2995  enum { NVPTX_local_addr = 5 };
2996  QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
2997  ArgType = QC.apply(CGM.getContext(), ArgType);
2998  if (isa<ImplicitParamDecl>(NativeParam))
2999  return ImplicitParamDecl::Create(
3000  CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
3001  NativeParam->getIdentifier(), ArgType, ImplicitParamKind::Other);
3002  return ParmVarDecl::Create(
3003  CGM.getContext(),
3004  const_cast<DeclContext *>(NativeParam->getDeclContext()),
3005  NativeParam->getBeginLoc(), NativeParam->getLocation(),
3006  NativeParam->getIdentifier(), ArgType,
3007  /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
3008 }
3009 
3010 Address
3011 CGOpenMPRuntimeGPU::getParameterAddress(CodeGenFunction &CGF,
3012  const VarDecl *NativeParam,
3013  const VarDecl *TargetParam) const {
3014  assert(NativeParam != TargetParam &&
3015  NativeParam->getType()->isReferenceType() &&
3016  "Native arg must not be the same as target arg.");
3017  Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
3018  QualType NativeParamType = NativeParam->getType();
3019  QualifierCollector QC;
3020  const Type *NonQualTy = QC.strip(NativeParamType);
3021  QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
3022  unsigned NativePointeeAddrSpace =
3023  CGF.getTypes().getTargetAddressSpace(NativePointeeTy);
3024  QualType TargetTy = TargetParam->getType();
3025  llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(LocalAddr, /*Volatile=*/false,
3026  TargetTy, SourceLocation());
3027  // Cast to native address space.
3028  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3029  TargetAddr,
3030  llvm::PointerType::get(CGF.getLLVMContext(), NativePointeeAddrSpace));
3031  Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
3032  CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
3033  NativeParamType);
3034  return NativeParamAddr;
3035 }
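//
// Illustration (hedged sketch, hypothetical parameter): a native parameter
// 'int &X' whose field carries an OMPC_map capture kind is translated above
// into a target parameter of roughly type 'int * __restrict' qualified with
// the NVPTX local address space; getParameterAddress then loads that target
// pointer, addrspacecasts it to the native pointee address space, and stores
// it into a fresh temporary that stands in for the native reference.
//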
3036 
3037 void CGOpenMPRuntimeGPU::emitOutlinedFunctionCall(
3038  CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn,
3039  ArrayRef<llvm::Value *> Args) const {
3040  SmallVector<llvm::Value *, 4> TargetArgs;
3041  TargetArgs.reserve(Args.size());
3042  auto *FnType = OutlinedFn.getFunctionType();
3043  for (unsigned I = 0, E = Args.size(); I < E; ++I) {
3044  if (FnType->isVarArg() && FnType->getNumParams() <= I) {
3045  TargetArgs.append(std::next(Args.begin(), I), Args.end());
3046  break;
3047  }
3048  llvm::Type *TargetType = FnType->getParamType(I);
3049  llvm::Value *NativeArg = Args[I];
3050  if (!TargetType->isPointerTy()) {
3051  TargetArgs.emplace_back(NativeArg);
3052  continue;
3053  }
3054  TargetArgs.emplace_back(
3055  CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(NativeArg, TargetType));
3056  }
3057  CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
3058 }
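//
// In effect: pointer arguments are addrspacecast to the outlined function's
// parameter types (e.g. an allocation in a non-generic address space passed
// where a generic pointer is expected), non-pointer arguments pass through
// unchanged, and arguments beyond the formal parameters of a vararg callee
// are appended verbatim.
//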
3059 
3060 /// Emit function which wraps the outlined parallel region
3061 /// and controls the arguments which are passed to this function.
3062 /// The wrapper ensures that the outlined function is called
3063 /// with the correct arguments when data is shared.
3064 llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
3065  llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
3066  ASTContext &Ctx = CGM.getContext();
3067  const auto &CS = *D.getCapturedStmt(OMPD_parallel);
3068 
3069  // Create a function that takes as argument the source thread.
3070  FunctionArgList WrapperArgs;
3071  QualType Int16QTy =
3072  Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
3073  QualType Int32QTy =
3074  Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
3075  ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
3076  /*Id=*/nullptr, Int16QTy,
3077  ImplicitParamKind::Other);
3078  ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
3079  /*Id=*/nullptr, Int32QTy,
3080  ImplicitParamKind::Other);
3081  WrapperArgs.emplace_back(&ParallelLevelArg);
3082  WrapperArgs.emplace_back(&WrapperArg);
3083 
3084  const CGFunctionInfo &CGFI =
3085  CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);
3086 
3087  auto *Fn = llvm::Function::Create(
3088  CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3089  Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());
3090 
3091  // Ensure we do not inline the function. This is trivially true for the ones
3092 // passed to __kmpc_fork_call but the ones called in serialized regions
3093 // could be inlined. This is not perfect but it is closer to the invariant
3094  // we want, namely, every data environment starts with a new function.
3095  // TODO: We should pass the if condition to the runtime function and do the
3096  // handling there. Much cleaner code.
3097  Fn->addFnAttr(llvm::Attribute::NoInline);
3098 
3099  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
3100  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
3101  Fn->setDoesNotRecurse();
3102 
3103  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
3104  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
3105  D.getBeginLoc(), D.getBeginLoc());
3106 
3107  const auto *RD = CS.getCapturedRecordDecl();
3108  auto CurField = RD->field_begin();
3109 
3110  Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
3111  /*Name=*/".zero.addr");
3112  CGF.Builder.CreateStore(CGF.Builder.getInt32(/*C*/ 0), ZeroAddr);
3113  // Get the array of arguments.
3114  SmallVector<llvm::Value *, 8> Args;
3115 
3116  Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).emitRawPointer(CGF));
3117  Args.emplace_back(ZeroAddr.emitRawPointer(CGF));
3118 
3119  CGBuilderTy &Bld = CGF.Builder;
3120  auto CI = CS.capture_begin();
3121 
3122  // Use global memory for data sharing.
3123  // Handle passing of global args to workers.
3124  RawAddress GlobalArgs =
3125  CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
3126  llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
3127  llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
3128  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
3129  CGM.getModule(), OMPRTL___kmpc_get_shared_variables),
3130  DataSharingArgs);
3131 
3132  // Retrieve the shared variables from the list of references returned
3133  // by the runtime. Pass the variables to the outlined function.
3134  Address SharedArgListAddress = Address::invalid();
3135  if (CS.capture_size() > 0 ||
3136  isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
3137  SharedArgListAddress = CGF.EmitLoadOfPointer(
3138  GlobalArgs, CGF.getContext()
3139  .getPointerType(CGF.getContext().VoidPtrTy)
3140  .castAs<PointerType>());
3141  }
3142  unsigned Idx = 0;
3143  if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
3144  Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
3145  Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3146  Src, CGF.SizeTy->getPointerTo(), CGF.SizeTy);
3147  llvm::Value *LB = CGF.EmitLoadOfScalar(
3148  TypedAddress,
3149  /*Volatile=*/false,
3150  CGF.getContext().getSizeType(),
3151  cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
3152  Args.emplace_back(LB);
3153  ++Idx;
3154  Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
3155  TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3156  Src, CGF.SizeTy->getPointerTo(), CGF.SizeTy);
3157  llvm::Value *UB = CGF.EmitLoadOfScalar(
3158  TypedAddress,
3159  /*Volatile=*/false,
3160  CGF.getContext().getSizeType(),
3161  cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
3162  Args.emplace_back(UB);
3163  ++Idx;
3164  }
3165  if (CS.capture_size() > 0) {
3166  ASTContext &CGFContext = CGF.getContext();
3167  for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
3168  QualType ElemTy = CurField->getType();
3169  Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx);
3170  Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3171  Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)),
3172  CGF.ConvertTypeForMem(ElemTy));
3173  llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
3174  /*Volatile=*/false,
3175  CGFContext.getPointerType(ElemTy),
3176  CI->getLocation());
3177  if (CI->capturesVariableByCopy() &&
3178  !CI->getCapturedVar()->getType()->isAnyPointerType()) {
3179  Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
3180  CI->getLocation());
3181  }
3182  Args.emplace_back(Arg);
3183  }
3184  }
3185 
3186  emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
3187  CGF.FinishFunction();
3188  return Fn;
3189 }
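//
// Illustration (hedged sketch, hypothetical names): for an outlined region
// with two shared captures, the wrapper emitted above behaves like
//
//   static void outlined_fn_wrapper(uint16_t ParallelLevel, uint32_t TID) {
//     int32_t Zero = 0;
//     void **GlobalArgs;
//     __kmpc_get_shared_variables(&GlobalArgs);
//     // loop bounds first (if loop-bound sharing), then each capture
//     outlined_fn(&TID, &Zero, GlobalArgs[0], GlobalArgs[1]);
//   }
//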
3190 
3191 void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF,
3192  const Decl *D) {
3193  if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
3194  return;
3195 
3196  assert(D && "Expected function or captured|block decl.");
3197  assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
3198  "Function is registered already.");
3199  assert((!TeamAndReductions.first || TeamAndReductions.first == D) &&
3200  "Team is set but not processed.");
3201  const Stmt *Body = nullptr;
3202  bool NeedToDelayGlobalization = false;
3203  if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
3204  Body = FD->getBody();
3205  } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
3206  Body = BD->getBody();
3207  } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
3208  Body = CD->getBody();
3209  NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
3210  if (NeedToDelayGlobalization &&
3211  getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD)
3212  return;
3213  }
3214  if (!Body)
3215  return;
3216  CheckVarsEscapingDeclContext VarChecker(CGF, TeamAndReductions.second);
3217  VarChecker.Visit(Body);
3218  const RecordDecl *GlobalizedVarsRecord =
3219  VarChecker.getGlobalizedRecord(IsInTTDRegion);
3220  TeamAndReductions.first = nullptr;
3221  TeamAndReductions.second.clear();
3222  ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
3223  VarChecker.getEscapedVariableLengthDecls();
3224  ArrayRef<const ValueDecl *> DelayedVariableLengthDecls =
3225  VarChecker.getDelayedVariableLengthDecls();
3226  if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty() &&
3227  DelayedVariableLengthDecls.empty())
3228  return;
3229  auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
3230  I->getSecond().MappedParams =
3231  std::make_unique<CodeGenFunction::OMPMapVars>();
3232  I->getSecond().EscapedParameters.insert(
3233  VarChecker.getEscapedParameters().begin(),
3234  VarChecker.getEscapedParameters().end());
3235  I->getSecond().EscapedVariableLengthDecls.append(
3236  EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
3237  I->getSecond().DelayedVariableLengthDecls.append(
3238  DelayedVariableLengthDecls.begin(), DelayedVariableLengthDecls.end());
3239  DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
3240  for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
3241  assert(VD->isCanonicalDecl() && "Expected canonical declaration");
3242  Data.insert(std::make_pair(VD, MappedVarData()));
3243  }
3244  if (!NeedToDelayGlobalization) {
3245  emitGenericVarsProlog(CGF, D->getBeginLoc());
3246  struct GlobalizationScope final : EHScopeStack::Cleanup {
3247  GlobalizationScope() = default;
3248 
3249  void Emit(CodeGenFunction &CGF, Flags flags) override {
3250  static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime())
3251  .emitGenericVarsEpilog(CGF);
3252  }
3253  };
3254  CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
3255  }
3256 }
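//
// The prolog above only records which declarations escape; the storage
// change itself is emitted by emitGenericVarsProlog and the epilog cleanup
// pushed above. Hedged sketch (hypothetical variable, generic mode):
//
//   int X;                          // escapes into a parallel region
//   #pragma omp parallel shared(X)
//
// is lowered so X lives in runtime-managed shared storage:
//
//   void *XPtr = __kmpc_alloc_shared(sizeof(int));   // prolog
//   ... uses of X go through XPtr ...
//   __kmpc_free_shared(XPtr, sizeof(int));           // epilog cleanup
//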
3257 
3258 Address CGOpenMPRuntimeGPU::getAddressOfLocalVariable(CodeGenFunction &CGF,
3259  const VarDecl *VD) {
3260  if (VD && VD->hasAttr<OMPAllocateDeclAttr>()) {
3261  const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
3262  auto AS = LangAS::Default;
3263  switch (A->getAllocatorType()) {
3264  // Use the default allocator here as by default local vars are
3265  // threadlocal.
3266  case OMPAllocateDeclAttr::OMPNullMemAlloc:
3267  case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
3268  case OMPAllocateDeclAttr::OMPThreadMemAlloc:
3269  case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
3270  case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
3271  // Follow the user decision - use default allocation.
3272  return Address::invalid();
3273  case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
3274  // TODO: implement support for user-defined allocators.
3275  return Address::invalid();
3276  case OMPAllocateDeclAttr::OMPConstMemAlloc:
3277  AS = LangAS::cuda_constant;
3278  break;
3279  case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
3280  AS = LangAS::cuda_shared;
3281  break;
3282  case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
3283  case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
3284  break;
3285  }
3286  llvm::Type *VarTy = CGF.ConvertTypeForMem(VD->getType());
3287  auto *GV = new llvm::GlobalVariable(
3288  CGM.getModule(), VarTy, /*isConstant=*/false,
3289  llvm::GlobalValue::InternalLinkage, llvm::PoisonValue::get(VarTy),
3290  VD->getName(),
3291  /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
3292  CGM.getContext().getTargetAddressSpace(AS));
3293  CharUnits Align = CGM.getContext().getDeclAlign(VD);
3294  GV->setAlignment(Align.getAsAlign());
3295  return Address(
3296  CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3297  GV, VarTy->getPointerTo(CGM.getContext().getTargetAddressSpace(
3298  VD->getType().getAddressSpace()))),
3299  VarTy, Align);
3300  }
3301 
3302  if (getDataSharingMode() != CGOpenMPRuntimeGPU::DS_Generic)
3303  return Address::invalid();
3304 
3305  VD = VD->getCanonicalDecl();
3306  auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
3307  if (I == FunctionGlobalizedDecls.end())
3308  return Address::invalid();
3309  auto VDI = I->getSecond().LocalVarData.find(VD);
3310  if (VDI != I->getSecond().LocalVarData.end())
3311  return VDI->second.PrivateAddr;
3312  if (VD->hasAttrs()) {
3313  for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
3314  E(VD->attr_end());
3315  IT != E; ++IT) {
3316  auto VDI = I->getSecond().LocalVarData.find(
3317  cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
3318  ->getCanonicalDecl());
3319  if (VDI != I->getSecond().LocalVarData.end())
3320  return VDI->second.PrivateAddr;
3321  }
3322  }
3323 
3324  return Address::invalid();
3325 }
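//
// Allocator-to-storage mapping above, illustrated (hypothetical locals):
//
//   int A;   // #pragma omp allocate(A) allocator(omp_const_mem_alloc)
//            //   -> internal global in LangAS::cuda_constant
//   int B;   // #pragma omp allocate(B) allocator(omp_pteam_mem_alloc)
//            //   -> internal global in LangAS::cuda_shared
//   int C;   // #pragma omp allocate(C) allocator(omp_thread_mem_alloc)
//            //   -> Address::invalid(): default thread-local allocation
//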
3326 
3327 void CGOpenMPRuntimeGPU::functionFinished(CodeGenFunction &CGF) {
3328  FunctionGlobalizedDecls.erase(CGF.CurFn);
3329  CGOpenMPRuntime::functionFinished(CGF);
3330 }
3331 
3332 void CGOpenMPRuntimeGPU::getDefaultDistScheduleAndChunk(
3333  CodeGenFunction &CGF, const OMPLoopDirective &S,
3334  OpenMPDistScheduleClauseKind &ScheduleKind,
3335  llvm::Value *&Chunk) const {
3336  auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
3337  if (getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD) {
3338  ScheduleKind = OMPC_DIST_SCHEDULE_static;
3339  Chunk = CGF.EmitScalarConversion(
3340  RT.getGPUNumThreads(CGF),
3341  CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
3342  S.getIterationVariable()->getType(), S.getBeginLoc());
3343  return;
3344  }
3345  CGOpenMPRuntime::getDefaultDistScheduleAndChunk(
3346  CGF, S, ScheduleKind, Chunk);
3347 }
3348 
3349 void CGOpenMPRuntimeGPU::getDefaultScheduleAndChunk(
3350  CodeGenFunction &CGF, const OMPLoopDirective &S,
3351  OpenMPScheduleClauseKind &ScheduleKind,
3352  const Expr *&ChunkExpr) const {
3353  ScheduleKind = OMPC_SCHEDULE_static;
3354  // Chunk size is 1 in this case.
3355  llvm::APInt ChunkSize(32, 1);
3356  ChunkExpr = IntegerLiteral::Create(CGF.getContext(), ChunkSize,
3357  CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
3358  SourceLocation());
3359 }
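//
// Taken together with getDefaultDistScheduleAndChunk above: in SPMD mode a
// 'distribute parallel for' without explicit schedule clauses is treated
// roughly as if written
//
//   #pragma omp ... dist_schedule(static, <threads-per-block>) \
//                   schedule(static, 1)
//
// so each GPU thread handles one iteration of each distribute chunk.
//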
3360 
3361 void CGOpenMPRuntimeGPU::adjustTargetSpecificDataForLambdas(
3362  CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
3363  assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
3364  " Expected target-based directive.");
3365  const CapturedStmt *CS = D.getCapturedStmt(OMPD_target);
3366  for (const CapturedStmt::Capture &C : CS->captures()) {
3367  // Capture variables captured by reference in lambdas for target-based
3368  // directives.
3369  if (!C.capturesVariable())
3370  continue;
3371  const VarDecl *VD = C.getCapturedVar();
3372  const auto *RD = VD->getType()
3373  .getCanonicalType()
3374  .getNonReferenceType()
3375  ->getAsCXXRecordDecl();
3376  if (!RD || !RD->isLambda())
3377  continue;
3378  Address VDAddr = CGF.GetAddrOfLocalVar(VD);
3379  LValue VDLVal;
3380  if (VD->getType().getCanonicalType()->isReferenceType())
3381  VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType());
3382  else
3383  VDLVal = CGF.MakeAddrLValue(
3384  VDAddr, VD->getType().getCanonicalType().getNonReferenceType());
3385  llvm::DenseMap<const ValueDecl *, FieldDecl *> Captures;
3386  FieldDecl *ThisCapture = nullptr;
3387  RD->getCaptureFields(Captures, ThisCapture);
3388  if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
3389  LValue ThisLVal =
3390  CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture);
3391  llvm::Value *CXXThis = CGF.LoadCXXThis();
3392  CGF.EmitStoreOfScalar(CXXThis, ThisLVal);
3393  }
3394  for (const LambdaCapture &LC : RD->captures()) {
3395  if (LC.getCaptureKind() != LCK_ByRef)
3396  continue;
3397  const ValueDecl *VD = LC.getCapturedVar();
3398  // FIXME: For now VD is always a VarDecl because OpenMP does not support
3399  // capturing structured bindings in lambdas yet.
3400  if (!CS->capturesVariable(cast<VarDecl>(VD)))
3401  continue;
3402  auto It = Captures.find(VD);
3403  assert(It != Captures.end() && "Found lambda capture without field.");
3404  LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second);
3405  Address VDAddr = CGF.GetAddrOfLocalVar(cast<VarDecl>(VD));
3406  if (VD->getType().getCanonicalType()->isReferenceType())
3407  VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,
3408  VD->getType().getCanonicalType())
3409  .getAddress();
3410  CGF.EmitStoreOfScalar(VDAddr.emitRawPointer(CGF), VarLVal);
3411  }
3412  }
3413 }
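//
// Hedged sketch (hypothetical lambda): for
//
//   int X; auto L = [&X]() { X = 1; };
//   #pragma omp target
//   L();
//
// the by-reference capture field of L on the device is re-seeded above with
// the target-side address of X (and a captured 'this' with the target
// 'this'), so the lambda body dereferences device, not host, pointers.
//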
3414 
3415 bool CGOpenMPRuntimeGPU::hasAllocateAttributeForGlobalVar(const VarDecl *VD,
3416  LangAS &AS) {
3417  if (!VD || !VD->hasAttr<OMPAllocateDeclAttr>())
3418  return false;
3419  const auto *A = VD->getAttr<OMPAllocateDeclAttr>();
3420  switch(A->getAllocatorType()) {
3421  case OMPAllocateDeclAttr::OMPNullMemAlloc:
3422  case OMPAllocateDeclAttr::OMPDefaultMemAlloc:
3423  // Not supported, fallback to the default mem space.
3424  case OMPAllocateDeclAttr::OMPThreadMemAlloc:
3425  case OMPAllocateDeclAttr::OMPLargeCapMemAlloc:
3426  case OMPAllocateDeclAttr::OMPCGroupMemAlloc:
3427  case OMPAllocateDeclAttr::OMPHighBWMemAlloc:
3428  case OMPAllocateDeclAttr::OMPLowLatMemAlloc:
3429  AS = LangAS::Default;
3430  return true;
3431  case OMPAllocateDeclAttr::OMPConstMemAlloc:
3432  AS = LangAS::cuda_constant;
3433  return true;
3434  case OMPAllocateDeclAttr::OMPPTeamMemAlloc:
3435  AS = LangAS::cuda_shared;
3436  return true;
3437  case OMPAllocateDeclAttr::OMPUserDefinedMemAlloc:
3438  llvm_unreachable("Expected predefined allocator for the variables with the "
3439  "static storage.");
3440  }
3441  return false;
3442 }
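//
// Same allocator mapping as getAddressOfLocalVariable, queried here for
// variables with static storage; e.g. a file-scope
//
//   int G;   // #pragma omp allocate(G) allocator(omp_const_mem_alloc)
//
// reports AS = LangAS::cuda_constant and returns true, while the default,
// thread, and large-capacity allocators report LangAS::Default.
//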
3443 
3444 // Get current CudaArch and ignore any unknown values
3445 static CudaArch getCudaArch(CodeGenModule &CGM) {
3446  if (!CGM.getTarget().hasFeature("ptx"))
3447  return CudaArch::UNKNOWN;
3448  for (const auto &Feature : CGM.getTarget().getTargetOpts().FeatureMap) {
3449  if (Feature.getValue()) {
3450  CudaArch Arch = StringToCudaArch(Feature.getKey());
3451  if (Arch != CudaArch::UNKNOWN)
3452  return Arch;
3453  }
3454  }
3455  return CudaArch::UNKNOWN;
3456 }
3457 
3458 /// Check to see if target architecture supports unified addressing which is
3459 /// a restriction for OpenMP requires clause "unified_shared_memory".
3460 void CGOpenMPRuntimeGPU::processRequiresDirective(
3461  const OMPRequiresDecl *D) {
3462  for (const OMPClause *Clause : D->clauselists()) {
3463  if (Clause->getClauseKind() == OMPC_unified_shared_memory) {
3464  CudaArch Arch = getCudaArch(CGM);
3465  switch (Arch) {
3466  case CudaArch::SM_20:
3467  case CudaArch::SM_21:
3468  case CudaArch::SM_30:
3469  case CudaArch::SM_32_:
3470  case CudaArch::SM_35:
3471  case CudaArch::SM_37:
3472  case CudaArch::SM_50:
3473  case CudaArch::SM_52:
3474  case CudaArch::SM_53: {
3475  SmallString<256> Buffer;
3476  llvm::raw_svector_ostream Out(Buffer);
3477  Out << "Target architecture " << CudaArchToString(Arch)
3478  << " does not support unified addressing";
3479  CGM.Error(Clause->getBeginLoc(), Out.str());
3480  return;
3481  }
3482  case CudaArch::SM_60:
3483  case CudaArch::SM_61:
3484  case CudaArch::SM_62:
3485  case CudaArch::SM_70:
3486  case CudaArch::SM_72:
3487  case CudaArch::SM_75:
3488  case CudaArch::SM_80:
3489  case CudaArch::SM_86:
3490  case CudaArch::SM_87:
3491  case CudaArch::SM_89:
3492  case CudaArch::SM_90:
3493  case CudaArch::SM_90a:
3494  case CudaArch::GFX600:
3495  case CudaArch::GFX601:
3496  case CudaArch::GFX602:
3497  case CudaArch::GFX700:
3498  case CudaArch::GFX701:
3499  case CudaArch::GFX702:
3500  case CudaArch::GFX703:
3501  case CudaArch::GFX704:
3502  case CudaArch::GFX705:
3503  case CudaArch::GFX801:
3504  case CudaArch::GFX802:
3505  case CudaArch::GFX803:
3506  case CudaArch::GFX805:
3507  case CudaArch::GFX810:
3508  case CudaArch::GFX900:
3509  case CudaArch::GFX902:
3510  case CudaArch::GFX904:
3511  case CudaArch::GFX906:
3512  case CudaArch::GFX908:
3513  case CudaArch::GFX909:
3514  case CudaArch::GFX90a:
3515  case CudaArch::GFX90c:
3516  case CudaArch::GFX940:
3517  case CudaArch::GFX941:
3518  case CudaArch::GFX942:
3519  case CudaArch::GFX1010:
3520  case CudaArch::GFX1011:
3521  case CudaArch::GFX1012:
3522  case CudaArch::GFX1013:
3523  case CudaArch::GFX1030:
3524  case CudaArch::GFX1031:
3525  case CudaArch::GFX1032:
3526  case CudaArch::GFX1033:
3527  case CudaArch::GFX1034:
3528  case CudaArch::GFX1035:
3529  case CudaArch::GFX1036:
3530  case CudaArch::GFX1100:
3531  case CudaArch::GFX1101:
3532  case CudaArch::GFX1102:
3533  case CudaArch::GFX1103:
3534  case CudaArch::GFX1150:
3535  case CudaArch::GFX1151:
3536  case CudaArch::GFX1200:
3537  case CudaArch::GFX1201:
3538  case CudaArch::Generic:
3539  case CudaArch::UNUSED:
3540  case CudaArch::UNKNOWN:
3541  break;
3542  case CudaArch::LAST:
3543  llvm_unreachable("Unexpected Cuda arch.");
3544  }
3545  }
3546  }
3547  CGOpenMPRuntime::processRequiresDirective(D);
3548 }
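//
// Example: a translation unit containing
//
//   #pragma omp requires unified_shared_memory
//
// compiled for sm_35 (e.g. --offload-arch=sm_35) reaches the SM_35 case
// above and gets "Target architecture sm_35 does not support unified
// addressing"; sm_60 and newer, and the listed GFX architectures, pass.
//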
3549 
3550 llvm::Value *CGOpenMPRuntimeGPU::getGPUNumThreads(CodeGenFunction &CGF) {
3551  CGBuilderTy &Bld = CGF.Builder;
3552  llvm::Module *M = &CGF.CGM.getModule();
3553  const char *LocSize = "__kmpc_get_hardware_num_threads_in_block";
3554  llvm::Function *F = M->getFunction(LocSize);
3555  if (!F) {
3556  F = llvm::Function::Create(
3557  llvm::FunctionType::get(CGF.Int32Ty, std::nullopt, false),
3558  llvm::GlobalVariable::ExternalLinkage, LocSize, &CGF.CGM.getModule());
3559  }
3560  return Bld.CreateCall(F, std::nullopt, "nvptx_num_threads");
3561 }
3562 
3563 llvm::Value *CGOpenMPRuntimeGPU::getGPUThreadID(CodeGenFunction &CGF) {
3564  ArrayRef<llvm::Value *> Args{};
3565  return CGF.EmitRuntimeCall(
3566  OMPBuilder.getOrCreateRuntimeFunction(
3567  CGM.getModule(), OMPRTL___kmpc_get_hardware_thread_id_in_block),
3568  Args);
3569 }
3570 
3571 llvm::Value *CGOpenMPRuntimeGPU::getGPUWarpSize(CodeGenFunction &CGF) {
3572  ArrayRef<llvm::Value *> Args{};
3573  return CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
3574  CGM.getModule(), OMPRTL___kmpc_get_warp_size),
3575  Args);
3576 }
#define V(N, I)
Definition: ASTContext.h:3299
static char ID
Definition: Arena.cpp:183
static CudaArch getCudaArch(CodeGenModule &CGM)
static llvm::Value * emitGlobalToListCopyFunction(CodeGenModule &CGM, ArrayRef< const Expr * > Privates, QualType ReductionArrayTy, SourceLocation Loc, const RecordDecl *TeamReductionRec, const llvm::SmallDenseMap< const ValueDecl *, const FieldDecl * > &VarFieldMap)
This function emits a helper that copies all the reduction variables from the team into the provided ...
static llvm::Value * emitInterWarpCopyFunction(CodeGenModule &CGM, ArrayRef< const Expr * > Privates, QualType ReductionArrayTy, SourceLocation Loc)
This function emits a helper that gathers Reduce lists from the first lane of every active warp to la...
static void getTeamsReductionVars(ASTContext &Ctx, const OMPExecutableDirective &D, llvm::SmallVectorImpl< const ValueDecl * > &Vars)
Get list of reduction variables from the teams ... directives.
static void emitReductionListCopy(CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy, ArrayRef< const Expr * > Privates, Address SrcBase, Address DestBase, CopyOptionsTy CopyOptions={nullptr, nullptr, nullptr})
Emit instructions to copy a Reduce list, which contains partially aggregated values,...
static llvm::Value * getNVPTXLaneID(CodeGenFunction &CGF)
Get the id of the current lane in the Warp.
static void getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D, llvm::SmallVectorImpl< const ValueDecl * > &Vars)
Get list of lastprivate variables from the teams distribute ...
static llvm::Value * emitListToGlobalReduceFunction(CodeGenModule &CGM, ArrayRef< const Expr * > Privates, QualType ReductionArrayTy, SourceLocation Loc, const RecordDecl *TeamReductionRec, const llvm::SmallDenseMap< const ValueDecl *, const FieldDecl * > &VarFieldMap, llvm::Function *ReduceFn)
This function emits a helper that reduces all the reduction variables from the team into the provided...
static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr, Address DestAddr, QualType ElemType, llvm::Value *Offset, SourceLocation Loc)
static bool hasNestedSPMDDirective(ASTContext &Ctx, const OMPExecutableDirective &D)
Check for inner (nested) SPMD construct, if any.
static llvm::Value * getNVPTXWarpID(CodeGenFunction &CGF)
Get the id of the warp in the block.
static bool supportsSPMDExecutionMode(ASTContext &Ctx, const OMPExecutableDirective &D)
static llvm::Value * createRuntimeShuffleFunction(CodeGenFunction &CGF, llvm::Value *Elem, QualType ElemType, llvm::Value *Offset, SourceLocation Loc)
This function creates calls to one of two shuffle functions to copy variables between lanes in a warp...
static llvm::Value * castValueToType(CodeGenFunction &CGF, llvm::Value *Val, QualType ValTy, QualType CastTy, SourceLocation Loc)
Cast value to the specified type.
static llvm::Value * emitGlobalToListReduceFunction(CodeGenModule &CGM, ArrayRef< const Expr * > Privates, QualType ReductionArrayTy, SourceLocation Loc, const RecordDecl *TeamReductionRec, const llvm::SmallDenseMap< const ValueDecl *, const FieldDecl * > &VarFieldMap, llvm::Function *ReduceFn)
This function emits a helper that reduces all the reduction variables from the team into the provided...
static llvm::Function * emitShuffleAndReduceFunction(CodeGenModule &CGM, ArrayRef< const Expr * > Privates, QualType ReductionArrayTy, llvm::Function *ReduceFn, SourceLocation Loc)
Emit a helper that reduces data across two OpenMP threads (lanes) in the same warp.
static llvm::Value * emitListToGlobalCopyFunction(CodeGenModule &CGM, ArrayRef< const Expr * > Privates, QualType ReductionArrayTy, SourceLocation Loc, const RecordDecl *TeamReductionRec, const llvm::SmallDenseMap< const ValueDecl *, const FieldDecl * > &VarFieldMap)
This function emits a helper that copies all the reduction variables from the team into the provided ...
This file defines OpenMP nodes for declarative directives.
unsigned Offset
Definition: Format.cpp:2978
This file defines OpenMP AST classes for clauses.
SourceLocation Loc
Definition: SemaObjC.cpp:755
static std::pair< ValueDecl *, bool > getPrivateItem(Sema &S, Expr *&RefExpr, SourceLocation &ELoc, SourceRange &ERange, bool AllowArraySection=false, StringRef DiagType="")
const char * Data
This file defines OpenMP AST classes for executable directives and clauses.
Holds long-lived AST nodes (such as types and decls) that can be referred to throughout the semantic ...
Definition: ASTContext.h:185
QualType getPointerType(QualType T) const
Return the uniqued reference to the type for a pointer to the specified type.
CanQualType VoidPtrTy
Definition: ASTContext.h:1121
QualType getUIntPtrType() const
Return a type compatible with "uintptr_t" (C99 7.18.1.4), as defined by the target.
QualType getIntTypeForBitwidth(unsigned DestWidth, unsigned Signed) const
getIntTypeForBitwidth - sets integer QualTy according to specified details: bitwidth,...
const VariableArrayType * getAsVariableArrayType(QualType T) const
Definition: ASTContext.h:2785
CanQualType getSizeType() const
Return the unique type for "size_t" (C99 7.17), defined in <stddef.h>.
CharUnits getDeclAlign(const Decl *D, bool ForAlignof=false) const
Return a conservative estimate of the alignment of the specified decl D.
int64_t toBits(CharUnits CharSize) const
Convert a size in characters to a size in bits.
CharUnits getTypeSizeInChars(QualType T) const
Return the size of the specified (complete) type T, in characters.
CanQualType VoidTy
Definition: ASTContext.h:1094
const TargetInfo & getTargetInfo() const
Definition: ASTContext.h:760
QualType getAddrSpaceQualType(QualType T, LangAS AddressSpace) const
Return the uniqued reference to the type for an address space qualified type with the specified type ...
unsigned getTargetAddressSpace(LangAS AS) const
Attr - This represents one attribute.
Definition: Attr.h:46
A class which contains all the information about a particular captured value.
Definition: Decl.h:4503
ArrayRef< Capture > captures() const
Definition: Decl.h:4624
BlockExpr - Adaptor class for mixing a BlockDecl with expressions.
Definition: Expr.h:6214
const BlockDecl * getBlockDecl() const
Definition: Expr.h:6226
CallExpr - Represents a function call (C99 6.5.2.2, C++ [expr.call]).
Definition: Expr.h:2872
arg_range arguments()
Definition: Expr.h:3111
Expr * getCallee()
Definition: Expr.h:3022
Describes the capture of either a variable, or 'this', or variable-length array type.
Definition: Stmt.h:3770
This captures a statement into a function.
Definition: Stmt.h:3757
CapturedDecl * getCapturedDecl()
Retrieve the outlined function declaration.
Definition: Stmt.cpp:1407
bool capturesVariable(const VarDecl *Var) const
True if this variable has been captured.
Definition: Stmt.cpp:1431
capture_range captures()
Definition: Stmt.h:3895
Stmt * getCapturedStmt()
Retrieve the statement being captured.
Definition: Stmt.h:3861
CastKind getCastKind() const
Definition: Expr.h:3579
Expr * getSubExpr()
Definition: Expr.h:3585
CharUnits - This is an opaque type for sizes expressed in character units.
Definition: CharUnits.h:38
bool isZero() const
isZero - Test whether the quantity equals zero.
Definition: CharUnits.h:122
llvm::Align getAsAlign() const
getAsAlign - Returns Quantity as a valid llvm::Align, Beware llvm::Align assumes power of two 8-bit b...
Definition: CharUnits.h:189
QuantityType getQuantity() const
getQuantity - Get the raw integer representation of this quantity.
Definition: CharUnits.h:185
static CharUnits fromQuantity(QuantityType Quantity)
fromQuantity - Construct a CharUnits quantity from a raw integer type.
Definition: CharUnits.h:63
Like RawAddress, an abstract representation of an aligned address, but the pointer contained in this ...
Definition: Address.h:111
static Address invalid()
Definition: Address.h:153
llvm::Value * emitRawPointer(CodeGenFunction &CGF) const
Return the pointer contained in this class after authenticating it and adding offset to it if necessa...
Definition: Address.h:220
CharUnits getAlignment() const
Definition: Address.h:166
llvm::Type * getElementType() const
Return the type of the values stored in this address.
Definition: Address.h:184
Address withElementType(llvm::Type *ElemTy) const
Return address with different element type, but same pointer and alignment.
Definition: Address.h:241
llvm::PointerType * getType() const
Return the type of the pointer value.
Definition: Address.h:176
static ApplyDebugLocation CreateEmpty(CodeGenFunction &CGF)
Set the IRBuilder to not attach debug locations.
Definition: CGDebugInfo.h:886
Address CreateGEP(CodeGenFunction &CGF, Address Addr, llvm::Value *Index, const llvm::Twine &Name="")
Definition: CGBuilder.h:292
Address CreatePointerBitCastOrAddrSpaceCast(Address Addr, llvm::Type *Ty, llvm::Type *ElementTy, const llvm::Twine &Name="")
Definition: CGBuilder.h:203
llvm::StoreInst * CreateStore(llvm::Value *Val, Address Addr, bool IsVolatile=false)
Definition: CGBuilder.h:136
Address CreateConstArrayGEP(Address Addr, uint64_t Index, const llvm::Twine &Name="")
Given addr = [n x T]* ...
Definition: CGBuilder.h:241
llvm::Value * CreateIsNull(Address Addr, const Twine &Name="")
Definition: CGBuilder.h:355
Address CreateConstGEP(Address Addr, uint64_t Index, const llvm::Twine &Name="")
Given addr = T* ...
Definition: CGBuilder.h:278
Address CreateConstInBoundsGEP(Address Addr, uint64_t Index, const llvm::Twine &Name="")
Given addr = T* ...
Definition: CGBuilder.h:261
Address CreateInBoundsGEP(Address Addr, ArrayRef< llvm::Value * > IdxList, llvm::Type *ElementType, CharUnits Align, const Twine &Name="")
Definition: CGBuilder.h:345
CGFunctionInfo - Class to encapsulate the information about a function definition.
void emitParallelCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::Function *OutlinedFn, ArrayRef< llvm::Value * > CapturedVars, const Expr *IfCond, llvm::Value *NumThreads) override
Emits code for parallel or serial call of the OutlinedFn with variables captured in a record which ad...
llvm::Function * emitTeamsOutlinedFunction(CodeGenFunction &CGF, const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) override
Emits inlined function for the specified OpenMP teams.
void emitProcBindClause(CodeGenFunction &CGF, llvm::omp::ProcBindKind ProcBind, SourceLocation Loc) override
Emit call to void __kmpc_push_proc_bind(ident_t *loc, kmp_int32 global_tid, int proc_bind) to generat...
void emitReduction(CodeGenFunction &CGF, SourceLocation Loc, ArrayRef< const Expr * > Privates, ArrayRef< const Expr * > LHSExprs, ArrayRef< const Expr * > RHSExprs, ArrayRef< const Expr * > ReductionOps, ReductionOptionsTy Options) override
Emit a code for reduction clause.
DataSharingMode
Target codegen is specialized based on two data-sharing modes: CUDA, in which the local variables are...
@ DS_CUDA
CUDA data sharing mode.
@ DS_Generic
Generic data-sharing mode.
void getDefaultDistScheduleAndChunk(CodeGenFunction &CGF, const OMPLoopDirective &S, OpenMPDistScheduleClauseKind &ScheduleKind, llvm::Value *&Chunk) const override
Choose a default value for the dist_schedule clause.
Address getAddressOfLocalVariable(CodeGenFunction &CGF, const VarDecl *VD) override
Gets the OpenMP-specific address of the local variable.
void emitFunctionProlog(CodeGenFunction &CGF, const Decl *D) override
Emits OpenMP-specific function prolog.
void getDefaultScheduleAndChunk(CodeGenFunction &CGF, const OMPLoopDirective &S, OpenMPScheduleClauseKind &ScheduleKind, const Expr *&ChunkExpr) const override
Choose a default value for the schedule clause.
void emitNumTeamsClause(CodeGenFunction &CGF, const Expr *NumTeams, const Expr *ThreadLimit, SourceLocation Loc) override
This function ought to emit, in the general case, a call to.
void emitCriticalRegion(CodeGenFunction &CGF, StringRef CriticalName, const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc, const Expr *Hint=nullptr) override
Emits a critical region.
void emitTeamsCall(CodeGenFunction &CGF, const OMPExecutableDirective &D, SourceLocation Loc, llvm::Function *OutlinedFn, ArrayRef< llvm::Value * > CapturedVars) override
Emits code for teams call of the OutlinedFn with variables captured in a record which address is stor...
bool hasAllocateAttributeForGlobalVar(const VarDecl *VD, LangAS &AS) override
Checks if the variable has associated OMPAllocateDeclAttr attribute with the predefined allocator and...
void getKmpcFreeShared(CodeGenFunction &CGF, const std::pair< llvm::Value *, llvm::Value * > &AddrSizePair) override
Get call to __kmpc_free_shared.
llvm::Function * emitParallelOutlinedFunction(CodeGenFunction &CGF, const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) override
Emits inlined function for the specified OpenMP parallel.
void functionFinished(CodeGenFunction &CGF) override
Cleans up references to the objects in finished function.
llvm::Value * getGPUThreadID(CodeGenFunction &CGF)
Get the id of the current thread on the GPU.
llvm::Value * getGPUWarpSize(CodeGenFunction &CGF)
Get the GPU warp size.
void processRequiresDirective(const OMPRequiresDecl *D) override
Perform check on requires decl to ensure that target architecture supports unified addressing.
void emitOutlinedFunctionCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn, ArrayRef< llvm::Value * > Args=std::nullopt) const override
Emits call of the outlined function with the provided arguments, translating these arguments to corre...
bool isDelayedVariableLengthDecl(CodeGenFunction &CGF, const VarDecl *VD) const override
Declare generalized virtual functions which need to be defined by all specializations of OpenMPGPURun...
Address getParameterAddress(CodeGenFunction &CGF, const VarDecl *NativeParam, const VarDecl *TargetParam) const override
Gets the address of the native argument basing on the address of the target-specific parameter.
ExecutionMode
Defines the execution mode.
@ EM_Unknown
Unknown execution mode (orphaned directive).
@ EM_SPMD
SPMD execution mode (all threads are worker threads).
void emitBarrierCall(CodeGenFunction &CGF, SourceLocation Loc, OpenMPDirectiveKind Kind, bool EmitChecks=true, bool ForceSimpleCall=false) override
Emit an implicit/explicit barrier for OpenMP threads.
llvm::Value * getGPUNumThreads(CodeGenFunction &CGF)
Get the maximum number of threads in a block of the GPU.
const VarDecl * translateParameter(const FieldDecl *FD, const VarDecl *NativeParam) const override
Translates the native parameter of outlined function if this is required for target.
std::pair< llvm::Value *, llvm::Value * > getKmpcAllocShared(CodeGenFunction &CGF, const VarDecl *VD) override
Get call to __kmpc_alloc_shared.
bool isGPU() const override
Returns true if the current target is a GPU.
void emitNumThreadsClause(CodeGenFunction &CGF, llvm::Value *NumThreads, SourceLocation Loc) override
Emits call to void __kmpc_push_num_threads(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_threads)...
void adjustTargetSpecificDataForLambdas(CodeGenFunction &CGF, const OMPExecutableDirective &D) const override
Adjust some parameters for the target-based directives, like addresses of the variables captured by r...
virtual Address emitThreadIDAddress(CodeGenFunction &CGF, SourceLocation Loc)
Emits address of the word in a memory where current thread id is stored.
static const Stmt * getSingleCompoundChild(ASTContext &Ctx, const Stmt *Body)
Checks if the Body is the CompoundStmt and returns its child statement iff there is only one that is ...
llvm::Value * emitUpdateLocation(CodeGenFunction &CGF, SourceLocation Loc, unsigned Flags=0, bool EmitLoc=false)
Emits object of ident_t type with info for source location.
llvm::OpenMPIRBuilder & getOMPBuilder()
virtual void functionFinished(CodeGenFunction &CGF)
Cleans up references to the objects in finished function.
virtual llvm::Function * emitTeamsOutlinedFunction(CodeGenFunction &CGF, const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen)
Emits outlined function for the specified OpenMP teams directive D.
llvm::OpenMPIRBuilder OMPBuilder
An OpenMP-IR-Builder instance.
bool hasRequiresUnifiedSharedMemory() const
Return whether the unified_shared_memory has been specified.
virtual void processRequiresDirective(const OMPRequiresDecl *D)
Perform check on requires decl to ensure that target architecture supports unified addressing.
llvm::Value * getThreadID(CodeGenFunction &CGF, SourceLocation Loc)
Gets thread id value for the current thread.
void clearLocThreadIdInsertPt(CodeGenFunction &CGF)
virtual void emitBarrierCall(CodeGenFunction &CGF, SourceLocation Loc, OpenMPDirectiveKind Kind, bool EmitChecks=true, bool ForceSimpleCall=false)
Emit an implicit/explicit barrier for OpenMP threads.
static unsigned getDefaultFlagsForBarriers(OpenMPDirectiveKind Kind)
Returns default flags for the barriers depending on the directive, for which this barier is going to ...
virtual llvm::Function * emitParallelOutlinedFunction(CodeGenFunction &CGF, const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen)
Emits outlined function for the specified OpenMP parallel directive D.
virtual void getDefaultDistScheduleAndChunk(CodeGenFunction &CGF, const OMPLoopDirective &S, OpenMPDistScheduleClauseKind &ScheduleKind, llvm::Value *&Chunk) const
Choose default schedule type and chunk value for the dist_schedule clause.
llvm::Type * getIdentTyPointerTy()
Returns pointer to ident_t type.
void emitSingleReductionCombiner(CodeGenFunction &CGF, const Expr *ReductionOp, const Expr *PrivateRef, const DeclRefExpr *LHS, const DeclRefExpr *RHS)
Emits single reduction combiner.
virtual void emitCriticalRegion(CodeGenFunction &CGF, StringRef CriticalName, const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc, const Expr *Hint=nullptr)
Emits a critical region.
virtual void emitReduction(CodeGenFunction &CGF, SourceLocation Loc, ArrayRef< const Expr * > Privates, ArrayRef< const Expr * > LHSExprs, ArrayRef< const Expr * > RHSExprs, ArrayRef< const Expr * > ReductionOps, ReductionOptionsTy Options)
Emit a code for reduction clause.
virtual void emitOutlinedFunctionCall(CodeGenFunction &CGF, SourceLocation Loc, llvm::FunctionCallee OutlinedFn, ArrayRef< llvm::Value * > Args=std::nullopt) const
Emits call of the outlined function with the provided arguments, translating these arguments to corre...
llvm::Function * emitReductionFunction(StringRef ReducerName, SourceLocation Loc, llvm::Type *ArgsElemType, ArrayRef< const Expr * > Privates, ArrayRef< const Expr * > LHSExprs, ArrayRef< const Expr * > RHSExprs, ArrayRef< const Expr * > ReductionOps)
Emits reduction function.
The scope used to remap some variables as private in the OpenMP loop body (or other captured region e...
CodeGenFunction - This class organizes the per-function state that is used while generating LLVM code...
LValue EmitLoadOfReferenceLValue(LValue RefLVal)
Definition: CGExpr.cpp:2788
LValue EmitLValueForField(LValue Base, const FieldDecl *Field)
Definition: CGExpr.cpp:4833
llvm::Type * ConvertType(QualType T)
CGCapturedStmtInfo * CapturedStmtInfo
llvm::Value * LoadCXXThis()
LoadCXXThis - Load the value of 'this'.
ComplexPairTy EmitLoadOfComplex(LValue src, SourceLocation loc)
EmitLoadOfComplex - Load a complex number from the specified l-value.
llvm::BasicBlock * createBasicBlock(const Twine &name="", llvm::Function *parent=nullptr, llvm::BasicBlock *before=nullptr)
createBasicBlock - Create an LLVM basic block.
llvm::LLVMContext & getLLVMContext()
Address EmitLoadOfPointer(Address Ptr, const PointerType *PtrTy, LValueBaseInfo *BaseInfo=nullptr, TBAAAccessInfo *TBAAInfo=nullptr)
Load a pointer with type PtrTy stored at address Ptr.
Definition: CGExpr.cpp:2797
LValue MakeNaturalAlignPointeeRawAddrLValue(llvm::Value *V, QualType T)
Same as MakeNaturalAlignPointeeAddrLValue except that the pointer is known to be unsigned.
void EmitAggregateCopy(LValue Dest, LValue Src, QualType EltTy, AggValueSlot::Overlap_t MayOverlap, bool isVolatile=false)
EmitAggregateCopy - Emit an aggregate copy.
Definition: CGExprAgg.cpp:2093
RawAddress CreateDefaultAlignTempAlloca(llvm::Type *Ty, const Twine &Name="tmp")
CreateDefaultAlignedTempAlloca - This creates an alloca with the default ABI alignment of the given L...
Definition: CGExpr.cpp:135
void StartFunction(GlobalDecl GD, QualType RetTy, llvm::Function *Fn, const CGFunctionInfo &FnInfo, const FunctionArgList &Args, SourceLocation Loc=SourceLocation(), SourceLocation StartLoc=SourceLocation())
Emit code for the start of a function.
llvm::Value * EvaluateExprAsBool(const Expr *E)
EvaluateExprAsBool - Perform the usual unary conversions on the specified expression and compare the ...
Definition: CGExpr.cpp:184
bool HaveInsertPoint() const
HaveInsertPoint - True if an insertion point is defined.
llvm::Value * getTypeSize(QualType Ty)
Returns calculated size of the specified type.
LValue EmitLValueForFieldInitialization(LValue Base, const FieldDecl *Field)
EmitLValueForFieldInitialization - Like EmitLValueForField, except that if the Field is a reference,...
Definition: CGExpr.cpp:5014
llvm::CallInst * EmitRuntimeCall(llvm::FunctionCallee callee, const Twine &name="")
const TargetInfo & getTarget() const
VlaSizePair getVLASize(const VariableArrayType *vla)
Returns an LLVM value that corresponds to the size, in non-variably-sized elements,...
llvm::Value * EmitLoadOfScalar(Address Addr, bool Volatile, QualType Ty, SourceLocation Loc, AlignmentSource Source=AlignmentSource::Type, bool isNontemporal=false)
EmitLoadOfScalar - Load a scalar value from an address, taking care to appropriately convert from the...
void EmitStoreOfComplex(ComplexPairTy V, LValue dest, bool isInit)
EmitStoreOfComplex - Store a complex number into the specified l-value.
llvm::Type * ConvertTypeForMem(QualType T)
static TypeEvaluationKind getEvaluationKind(QualType T)
getEvaluationKind - Return the TypeEvaluationKind of QualType T.
void EmitBranch(llvm::BasicBlock *Block)
EmitBranch - Emit a branch to the specified basic block from the current insert block,...
Definition: CGStmt.cpp:598
RawAddress CreateMemTemp(QualType T, const Twine &Name="tmp", RawAddress *Alloca=nullptr)
CreateMemTemp - Create a temporary memory object of the given type, with appropriate alignmen and cas...
Definition: CGExpr.cpp:147
LValue MakeAddrLValue(Address Addr, QualType T, AlignmentSource Source=AlignmentSource::Type)
void FinishFunction(SourceLocation EndLoc=SourceLocation())
FinishFunction - Complete IR generation of the current function.
Address GetAddrOfLocalVar(const VarDecl *VD)
GetAddrOfLocalVar - Return the address of a local variable.
std::pair< llvm::Value *, llvm::Value * > ComplexPairTy
LValue EmitLValue(const Expr *E, KnownNonNull_t IsKnownNonNull=NotKnownNonNull)
EmitLValue - Emit code to compute a designator that specifies the location of the expression.
Definition: CGExpr.cpp:1503
CodeGenTypes & getTypes() const
llvm::Value * EmitScalarConversion(llvm::Value *Src, QualType SrcTy, QualType DstTy, SourceLocation Loc)
Emit a conversion from the specified type to the specified destination type, both of which are LLVM s...
void EmitStoreOfScalar(llvm::Value *Value, Address Addr, bool Volatile, QualType Ty, AlignmentSource Source=AlignmentSource::Type, bool isInit=false, bool isNontemporal=false)
EmitStoreOfScalar - Store a scalar value to an address, taking care to appropriately convert from the...
void EmitBlock(llvm::BasicBlock *BB, bool IsFinished=false)
EmitBlock - Emit the given block.
Definition: CGStmt.cpp:578
LValue MakeNaturalAlignRawAddrLValue(llvm::Value *V, QualType T)
This class organizes the cross-function state that is used while generating LLVM code.
CGOpenMPRuntime & getOpenMPRuntime()
Return a reference to the configured OpenMP runtime.
const TargetInfo & getTarget() const
void SetInternalFunctionAttributes(GlobalDecl GD, llvm::Function *F, const CGFunctionInfo &FI)
Set the attributes on the LLVM function for the given decl and function info.
void addCompilerUsedGlobal(llvm::GlobalValue *GV)
Add a global to a list to be added to the llvm.compiler.used metadata.
llvm::Module & getModule() const
const LangOptions & getLangOpts() const
llvm::LLVMContext & getLLVMContext()
void Error(SourceLocation loc, StringRef error)
Emit a general error that something can't be done.
ASTContext & getContext() const
llvm::FunctionType * GetFunctionType(const CGFunctionInfo &Info)
GetFunctionType - Get the LLVM function type for.
Definition: CGCall.cpp:1641
const CGFunctionInfo & arrangeBuiltinFunctionDeclaration(QualType resultType, const FunctionArgList &args)
A builtin function is a freestanding function using the default C conventions.
Definition: CGCall.cpp:682
unsigned getTargetAddressSpace(QualType T) const
llvm::Type * ConvertTypeForMem(QualType T, bool ForBitField=false)
ConvertTypeForMem - Convert type T into a llvm::Type.
Information for lazily generating a cleanup.
Definition: EHScopeStack.h:141
FunctionArgList - Type for representing both the decl and type of parameters to a function.
Definition: CGCall.h:351
LValue - This represents an lvalue references.
Definition: CGValue.h:181
Address getAddress() const
Definition: CGValue.h:370
llvm::Value * getPointer(CodeGenFunction &CGF) const
Definition: CGValue.h:361
void setAddress(Address address)
Definition: CGValue.h:372
A basic class for pre|post-action for advanced codegen sequence for OpenMP region.
An abstract representation of an aligned address.
Definition: Address.h:41
llvm::Value * getPointer() const
Definition: Address.h:65
Class provides a way to call simple version of codegen for OpenMP region, or an advanced with possibl...
void setAction(PrePostActionTy &Action) const
ConstStmtVisitor - This class implements a simple visitor for Stmt subclasses.
Definition: StmtVisitor.h:195
DeclContext - This is used only as base class of specific decl types that can act as declaration cont...
Definition: DeclBase.h:1436
void addDecl(Decl *D)
Add the declaration D into this context.
Definition: DeclBase.cpp:1716
A reference to a declared variable, function, enum, etc.
Definition: Expr.h:1260
ValueDecl * getDecl()
Definition: Expr.h:1328
DeclStmt - Adaptor class for mixing declarations with statements and expressions.
Definition: Stmt.h:1497
Decl - This represents one declaration (or definition), e.g.
Definition: DeclBase.h:86
bool hasAttrs() const
Definition: DeclBase.h:524
attr_iterator attr_end() const
Definition: DeclBase.h:548
AttrVec & getAttrs()
Definition: DeclBase.h:530
bool isCanonicalDecl() const
Whether this particular Decl is a canonical one.
Definition: DeclBase.h:974
attr_iterator attr_begin() const
Definition: DeclBase.h:545
virtual Decl * getCanonicalDecl()
Retrieves the "canonical" declaration of the given declaration.
Definition: DeclBase.h:968
SourceLocation getLocation() const
Definition: DeclBase.h:445
SourceLocation getBeginLoc() const LLVM_READONLY
Definition: DeclBase.h:437
bool hasAttr() const
Definition: DeclBase.h:583
T * getAttr() const
Definition: DeclBase.h:579
DeclContext * getDeclContext()
Definition: DeclBase.h:454
SourceLocation getBeginLoc() const LLVM_READONLY
Definition: Decl.h:823
This represents one expression.
Definition: Expr.h:110
Expr * IgnoreParenImpCasts() LLVM_READONLY
Skip past any parentheses and implicit casts which might surround this expression until reaching a fi...
Definition: Expr.cpp:3111
Expr * IgnoreParens() LLVM_READONLY
Skip past any parentheses which might surround this expression until reaching a fixed point.
Definition: Expr.cpp:3107
bool isLValue() const
isLValue - True if this expression is an "l-value" according to the rules of the current language.
Definition: Expr.h:277
QualType getType() const
Definition: Expr.h:142
Represents a member of a struct/union/class.
Definition: Decl.h:3060
static FieldDecl * Create(const ASTContext &C, DeclContext *DC, SourceLocation StartLoc, SourceLocation IdLoc, const IdentifierInfo *Id, QualType T, TypeSourceInfo *TInfo, Expr *BW, bool Mutable, InClassInitStyle InitStyle)
Definition: Decl.cpp:4549
GlobalDecl - represents a global declaration.
Definition: GlobalDecl.h:56
ImplicitCastExpr - Allows us to explicitly represent implicit type conversions, which have no direct ...
Definition: Expr.h:3707
static ImplicitParamDecl * Create(ASTContext &C, DeclContext *DC, SourceLocation IdLoc, IdentifierInfo *Id, QualType T, ImplicitParamKind ParamKind)
Create implicit parameter.
Definition: Decl.cpp:5383
static IntegerLiteral * Create(const ASTContext &C, const llvm::APInt &V, QualType type, SourceLocation l)
Returns a new integer literal with value 'V' and type 'type'.
Definition: Expr.cpp:1032
Describes the capture of a variable or of this, or of a C++1y init-capture.
Definition: LambdaCapture.h:25
A C++ lambda expression, which produces a function object (of unspecified type) that can be invoked l...
Definition: ExprCXX.h:1950
bool isInitCapture(const LambdaCapture *Capture) const
Determine whether one of this lambda's captures is an init-capture.
Definition: ExprCXX.cpp:1290
capture_range captures() const
Retrieve this lambda's captures.
Definition: ExprCXX.cpp:1303
std::string OMPHostIRFile
Name of the IR file that contains the result of the OpenMP target host code generation.
Definition: LangOptions.h:560
StringRef getName() const
Get the name of identifier for this declaration as a StringRef.
Definition: Decl.h:276
IdentifierInfo * getIdentifier() const
Get the identifier that names this declaration, if there is one.
Definition: Decl.h:270
This is a basic class for representing single OpenMP clause.
Definition: OpenMPClause.h:55
This is a basic class for representing single OpenMP executable directive.
Definition: StmtOpenMP.h:266
bool hasAssociatedStmt() const
Returns true if directive has associated statement.
Definition: StmtOpenMP.h:531
OpenMPDirectiveKind getDirectiveKind() const
Definition: StmtOpenMP.h:569
CapturedStmt * getInnermostCapturedStmt()
Get innermost captured statement for the construct.
Definition: StmtOpenMP.h:556
SourceLocation getBeginLoc() const
Returns starting location of directive kind.
Definition: StmtOpenMP.h:502
ArrayRef< OMPClause * > clauses() const
Definition: StmtOpenMP.h:586
static llvm::iterator_range< specific_clause_iterator< SpecificClause > > getClausesOfKind(ArrayRef< OMPClause * > Clauses)
Definition: StmtOpenMP.h:459
const CapturedStmt * getCapturedStmt(OpenMPDirectiveKind RegionKind) const
Returns the captured statement associated with the component region within the (combined) directive.
Definition: StmtOpenMP.h:547
static const SpecificClause * getSingleClause(ArrayRef< OMPClause * > Clauses)
Gets a single clause of the specified kind associated with the current directive iff there is only one clause of this kind.
Definition: StmtOpenMP.h:477
const Stmt * getAssociatedStmt() const
Returns statement associated with the directive.
Definition: StmtOpenMP.h:534
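A minimal sketch of how these clause accessors combine in practice; the directive reference D and the helper name are illustrative assumptions:

#include "clang/AST/OpenMPClause.h"
#include "clang/AST/StmtOpenMP.h"

using namespace clang;

// Hypothetical: find the condition of the first 'if' clause, if any.
static const Expr *findIfCondition(const OMPExecutableDirective &D) {
  // getClausesOfKind filters clauses() by the static clause type.
  for (const OMPIfClause *C : D.getClausesOfKind<OMPIfClause>())
    return C->getCondition();
  return nullptr; // no 'if' clause attached to this directive
}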
This represents clause 'lastprivate' in the '#pragma omp ...' directives.
This is a common base class for loop directives ('omp simd', 'omp for', 'omp for simd' etc.). It is responsible for the loop code generation.
Definition: StmtOpenMP.h:1018
This represents clause 'reduction' in the '#pragma omp ...' directives.
This represents '#pragma omp requires...' directive.
Definition: DeclOpenMP.h:417
clauselist_range clauselists()
Definition: DeclOpenMP.h:442
This represents 'ompx_bare' clause in the '#pragma omp target teams ...' directive.
static ParmVarDecl * Create(ASTContext &C, DeclContext *DC, SourceLocation StartLoc, SourceLocation IdLoc, const IdentifierInfo *Id, QualType T, TypeSourceInfo *TInfo, StorageClass S, Expr *DefArg)
Definition: Decl.cpp:2919
PointerType - C99 6.7.5.1 - Pointer Declarators.
Definition: Type.h:3151
A (possibly-)qualified type.
Definition: Type.h:940
LangAS getAddressSpace() const
Return the address space of this type.
Definition: Type.h:7497
QualType getNonReferenceType() const
If Type is a reference type (e.g., const int&), returns the type that the reference refers to ("const int").
Definition: Type.h:7572
QualType getCanonicalType() const
Definition: Type.h:7423
A qualifier set is used to build a set of qualifiers.
Definition: Type.h:7311
const Type * strip(QualType type)
Collect any qualifiers on the given type and return an unqualified type.
Definition: Type.h:7318
QualType apply(const ASTContext &Context, QualType QT) const
Apply the collected qualifiers to the given type.
Definition: Type.cpp:4311
void addRestrict()
Definition: Type.h:466
void addAddressSpace(LangAS space, bool AllowDefaultAddrSpace=false)
Definition: Type.h:583
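A minimal sketch of the strip/apply pattern described by these QualifierCollector entries; the helper name and parameters are assumptions for illustration:

#include "clang/AST/ASTContext.h"

using namespace clang;

// Hypothetical: rebuild Ty with an extra address-space qualifier while
// preserving every qualifier it already carries.
static QualType withAddressSpace(ASTContext &Ctx, QualType Ty, LangAS AS) {
  QualifierCollector Quals;
  const Type *Bare = Quals.strip(Ty); // collect qualifiers, keep bare type
  Quals.addAddressSpace(AS);          // extend the collected set
  return Quals.apply(Ctx, Bare);      // reapply everything onto the bare type
}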
Represents a struct/union/class.
Definition: Decl.h:4171
virtual void completeDefinition()
Note that the definition of this type is now complete.
Definition: Decl.cpp:5085
Encodes a location in the source.
Stmt - This represents one statement.
Definition: Stmt.h:84
child_range children()
Definition: Stmt.cpp:287
Stmt * IgnoreContainers(bool IgnoreCaptured=false)
Skip no-op (attributed, compound) container stmts and skip captured stmt at the top, if IgnoreCaptured is true.
Definition: Stmt.cpp:197
void startDefinition()
Starts the definition of this tag declaration.
Definition: Decl.cpp:4741
unsigned getNewAlign() const
Return the largest alignment for which a suitably-sized allocation with '::operator new(size_t)' is guaranteed to produce a correctly-aligned pointer.
Definition: TargetInfo.h:742
virtual const llvm::omp::GV & getGridValue() const
Definition: TargetInfo.h:1638
virtual bool hasFeature(StringRef Feature) const
Determine whether the given target has the given feature.
Definition: TargetInfo.h:1472
TargetOptions & getTargetOpts() const
Retrieve the target options.
Definition: TargetInfo.h:312
llvm::StringMap< bool > FeatureMap
The map of which features have been enabled or disabled based on the command line.
Definition: TargetOptions.h:62
The base class of the type hierarchy.
Definition: Type.h:1813
CXXRecordDecl * getAsCXXRecordDecl() const
Retrieves the CXXRecordDecl that this type refers to, either because the type is a RecordType or because it is the injected-class-name type of a class template or class template partial specialization.
Definition: Type.cpp:1881
bool isIntegerType() const
isIntegerType() does not include complex integers (a GCC extension).
Definition: Type.h:7979
const T * castAs() const
Member-template castAs<specific type>.
Definition: Type.h:8227
bool isReferenceType() const
Definition: Type.h:7636
QualType getPointeeType() const
If this is a pointer, ObjC object pointer, or block pointer, this returns the respective pointee.
Definition: Type.cpp:705
bool isLValueReferenceType() const
Definition: Type.h:7640
bool hasSignedIntegerRepresentation() const
Determine whether this type has a signed integer representation of some sort, e.g., it is a signed integer type or a vector.
Definition: Type.cpp:2185
bool isVariablyModifiedType() const
Whether this type is a variably-modified type (C99 6.7.5).
Definition: Type.h:2679
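A minimal sketch combining the Type queries above; the helper name and its purpose are hypothetical:

#include "clang/AST/Type.h"

using namespace clang;

// Hypothetical: the value type reached through a reference or pointer.
static QualType valueTypeBehind(QualType Ty) {
  if (Ty->isReferenceType())        // e.g. const int& -> const int
    Ty = Ty.getNonReferenceType();
  QualType Pointee = Ty->getPointeeType();
  return Pointee.isNull() ? Ty : Pointee; // follow one pointer level
}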
UnaryOperator - This represents the unary-expression's (except sizeof and alignof),...
Definition: Expr.h:2235
Opcode getOpcode() const
Definition: Expr.h:2275
Expr * getSubExpr() const
Definition: Expr.h:2280
Represent the declaration of a variable (in which case it is an lvalue) a function (in which case it ...
Definition: Decl.h:707
QualType getType() const
Definition: Decl.h:718
bool isInitCapture() const
Whether this variable is the implicit variable for a lambda init-capture.
Definition: Decl.cpp:5375
Represents a variable declaration or definition.
Definition: Decl.h:919
VarDecl * getCanonicalDecl() override
Retrieves the "canonical" declaration of the given declaration.
Definition: Decl.cpp:2258
bool isInitCapture() const
Whether this variable is the implicit variable for a lambda init-capture.
Definition: Decl.h:1559
specific_attr_iterator - Iterates over a subrange of an AttrVec, only providing attributes that are o...
Definition: AttrIterator.h:33
@ Type
The l-value was considered opaque, so the alignment was determined from a type.
@ Decl
The l-value was an access to a declared entity or something equivalently strong, like the address of an object with static storage duration.
DirectiveKind
Represents the kind of preprocessor directive or a module declaration that is tracked by the scanner in its token output.
llvm::APInt APInt
Definition: Integral.h:29
The JSON file list parser is used to communicate input to InstallAPI.
@ Private
'private' clause, allowed on 'parallel', 'serial', 'loop', 'parallel loop', and 'serial loop' constructs.
CudaArch
Definition: Cuda.h:54
llvm::omp::Directive OpenMPDirectiveKind
OpenMP directives.
Definition: OpenMPKinds.h:24
@ ICIS_NoInit
No in-class initializer.
Definition: Specifiers.h:269
bool isOpenMPDistributeDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a distribute directive.
@ LCK_ByRef
Capturing by reference.
Definition: Lambda.h:37
CudaArch StringToCudaArch(llvm::StringRef S)
Definition: Cuda.cpp:169
@ CR_OpenMP
Definition: CapturedStmt.h:19
bool isOpenMPParallelDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a parallel-kind directive.
bool isOpenMPPrivate(OpenMPClauseKind Kind)
Checks if the specified clause is one of the private clauses like 'private', 'firstprivate', 'lastprivate' etc.
@ SC_None
Definition: Specifiers.h:247
OpenMPDistScheduleClauseKind
OpenMP attributes for 'dist_schedule' clause.
Definition: OpenMPKinds.h:103
bool isOpenMPTargetExecutionDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a target code offload directive.
bool isOpenMPTeamsDirective(OpenMPDirectiveKind DKind)
Checks if the specified directive is a teams-kind directive.
@ Union
The "union" keyword.
bool isOpenMPLoopBoundSharingDirective(OpenMPDirectiveKind Kind)
Checks if the specified directive kind is one of the composite or combined directives that need loop bound sharing.
LangAS
Defines the address space values used by the address space qualifier of QualType.
Definition: AddressSpaces.h:25
void getOpenMPCaptureRegions(llvm::SmallVectorImpl< OpenMPDirectiveKind > &CaptureRegions, OpenMPDirectiveKind DKind)
Return the captured regions of an OpenMP directive.
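A minimal sketch of how the directive-kind predicates pair with getOpenMPCaptureRegions; the helper is an illustration, not this file's logic:

#include "clang/Basic/OpenMPKinds.h"
#include "llvm/ADT/SmallVector.h"

using namespace clang;

// Hypothetical: does this target directive contain a teams or parallel
// component among its capture regions?
static bool offloadsMultiThreadedWork(OpenMPDirectiveKind DKind) {
  if (!isOpenMPTargetExecutionDirective(DKind))
    return false; // only target directives offload work at all
  llvm::SmallVector<OpenMPDirectiveKind, 4> Regions;
  getOpenMPCaptureRegions(Regions, DKind); // one entry per component region
  for (OpenMPDirectiveKind R : Regions)
    if (isOpenMPTeamsDirective(R) || isOpenMPParallelDirective(R))
      return true;
  return false;
}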
LangAS getLangASFromTargetAS(unsigned TargetAS)
Definition: AddressSpaces.h:86
@ CXXThis
Parameter for C++ 'this' argument.
@ Other
Other implicit parameter.
const char * CudaArchToString(CudaArch A)
Definition: Cuda.cpp:151
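A minimal sketch of the CudaArch round-trip these two helpers provide, in the spirit of the runtime's minimum-architecture checks; the predicate name and its use are illustrative assumptions:

#include "clang/Basic/Cuda.h"
#include "llvm/ADT/StringRef.h"

using namespace clang;

// Hypothetical: parse a target CPU name such as "sm_70" and compare it
// against a minimum architecture; the scoped enum's order follows the
// SM version, so relational comparison is meaningful.
static bool isAtLeast(llvm::StringRef CPU, CudaArch Min) {
  CudaArch Arch = StringToCudaArch(CPU); // unknown names -> CudaArch::UNKNOWN
  return Arch != CudaArch::UNKNOWN && Arch >= Min;
}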
OpenMPScheduleClauseKind
OpenMP attributes for 'schedule' clause.
Definition: OpenMPKinds.h:30
@ AS_public
Definition: Specifiers.h:121
unsigned long uint64_t
llvm::Value * ScratchpadIndex
llvm::Value * ScratchpadWidth
llvm::Value * RemoteLaneOffset
llvm::IntegerType * Int8Ty
i8, i16, i32, and i64
llvm::IntegerType * IntTy
int