clang  19.0.0git
AMDGPU.cpp
1 //===- AMDGPU.cpp ---------------------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "ABIInfoImpl.h"
10 #include "TargetInfo.h"
11 #include "clang/Basic/TargetOptions.h"
12 
13 using namespace clang;
14 using namespace clang::CodeGen;
15 
16 //===----------------------------------------------------------------------===//
17 // AMDGPU ABI Implementation
18 //===----------------------------------------------------------------------===//
19 
20 namespace {
21 
22 class AMDGPUABIInfo final : public DefaultABIInfo {
23 private:
24  static const unsigned MaxNumRegsForArgsRet = 16;
25 
26  unsigned numRegsForType(QualType Ty) const;
27 
28  bool isHomogeneousAggregateBaseType(QualType Ty) const override;
29  bool isHomogeneousAggregateSmallEnough(const Type *Base,
30  uint64_t Members) const override;
31 
32  // Coerce HIP scalar pointer arguments from generic pointers to global ones.
33  llvm::Type *coerceKernelArgumentType(llvm::Type *Ty, unsigned FromAS,
34  unsigned ToAS) const {
35  // Single value types.
36  auto *PtrTy = llvm::dyn_cast<llvm::PointerType>(Ty);
37  if (PtrTy && PtrTy->getAddressSpace() == FromAS)
38  return llvm::PointerType::get(Ty->getContext(), ToAS);
39  return Ty;
40  }
41 
42 public:
43  explicit AMDGPUABIInfo(CodeGen::CodeGenTypes &CGT) :
44  DefaultABIInfo(CGT) {}
45 
46  ABIArgInfo classifyReturnType(QualType RetTy) const;
47  ABIArgInfo classifyKernelArgumentType(QualType Ty) const;
48  ABIArgInfo classifyArgumentType(QualType Ty, unsigned &NumRegsLeft) const;
49 
50  void computeInfo(CGFunctionInfo &FI) const override;
51  Address EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
52  QualType Ty) const override;
53 };
54 
55 bool AMDGPUABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const {
56  return true;
57 }
58 
59 bool AMDGPUABIInfo::isHomogeneousAggregateSmallEnough(
60  const Type *Base, uint64_t Members) const {
61  uint32_t NumRegs = (getContext().getTypeSize(Base) + 31) / 32;
62 
63  // Homogeneous Aggregates may occupy at most 16 registers.
64  return Members * NumRegs <= MaxNumRegsForArgsRet;
65 }
66 
67 /// Estimate number of registers the type will use when passed in registers.
68 unsigned AMDGPUABIInfo::numRegsForType(QualType Ty) const {
69  unsigned NumRegs = 0;
70 
71  if (const VectorType *VT = Ty->getAs<VectorType>()) {
72  // Compute from the number of elements. The reported size is based on the
73  // in-memory size, which includes the padding 4th element for 3-vectors.
74  QualType EltTy = VT->getElementType();
75  unsigned EltSize = getContext().getTypeSize(EltTy);
76 
77  // 16-bit element vectors should be passed as packed.
78  if (EltSize == 16)
79  return (VT->getNumElements() + 1) / 2;
80 
81  unsigned EltNumRegs = (EltSize + 31) / 32;
82  return EltNumRegs * VT->getNumElements();
83  }
84 
85  if (const RecordType *RT = Ty->getAs<RecordType>()) {
86  const RecordDecl *RD = RT->getDecl();
87  assert(!RD->hasFlexibleArrayMember());
88 
89  for (const FieldDecl *Field : RD->fields()) {
90  QualType FieldTy = Field->getType();
91  NumRegs += numRegsForType(FieldTy);
92  }
93 
94  return NumRegs;
95  }
96 
97  return (getContext().getTypeSize(Ty) + 31) / 32;
98 }
99 
100 void AMDGPUABIInfo::computeInfo(CGFunctionInfo &FI) const {
101  llvm::CallingConv::ID CC = FI.getCallingConvention();
102 
103  if (!getCXXABI().classifyReturnType(FI))
104  FI.getReturnInfo() = classifyReturnType(FI.getReturnType());
105 
106  unsigned NumRegsLeft = MaxNumRegsForArgsRet;
107  for (auto &Arg : FI.arguments()) {
108  if (CC == llvm::CallingConv::AMDGPU_KERNEL) {
109  Arg.info = classifyKernelArgumentType(Arg.type);
110  } else {
111  Arg.info = classifyArgumentType(Arg.type, NumRegsLeft);
112  }
113  }
114 }
115 
116 Address AMDGPUABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
117  QualType Ty) const {
118  llvm_unreachable("AMDGPU does not support varargs");
119 }
120 
121 ABIArgInfo AMDGPUABIInfo::classifyReturnType(QualType RetTy) const {
122  if (isAggregateTypeForABI(RetTy)) {
123  // Records with non-trivial destructors/copy-constructors should not be
124  // returned by value.
125  if (!getRecordArgABI(RetTy, getCXXABI())) {
126  // Ignore empty structs/unions.
127  if (isEmptyRecord(getContext(), RetTy, true))
128  return ABIArgInfo::getIgnore();
129 
130  // Lower single-element structs to just return a regular value.
131  if (const Type *SeltTy = isSingleElementStruct(RetTy, getContext()))
132  return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));
133 
134  if (const RecordType *RT = RetTy->getAs<RecordType>()) {
135  const RecordDecl *RD = RT->getDecl();
136  if (RD->hasFlexibleArrayMember())
137  return DefaultABIInfo::classifyReturnType(RetTy);
138  }
139 
140  // Pack aggregates <= 8 bytes into single VGPR or pair.
141  uint64_t Size = getContext().getTypeSize(RetTy);
142  if (Size <= 16)
143  return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));
144 
145  if (Size <= 32)
146  return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));
147 
148  if (Size <= 64) {
149  llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
150  return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
151  }
152 
153  if (numRegsForType(RetTy) <= MaxNumRegsForArgsRet)
154  return ABIArgInfo::getDirect();
155  }
156  }
157 
158  // Otherwise just do the default thing.
159  return DefaultABIInfo::classifyReturnType(RetTy);
160 }
161 
162 /// For kernels all parameters are really passed in a special buffer. It doesn't
163 /// make sense to pass anything byval, so everything must be direct.
164 ABIArgInfo AMDGPUABIInfo::classifyKernelArgumentType(QualType Ty) const {
165  Ty = useFirstFieldIfTransparentUnion(Ty);
166 
167  // TODO: Can we omit empty structs?
168 
169  if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
170  Ty = QualType(SeltTy, 0);
171 
172  llvm::Type *OrigLTy = CGT.ConvertType(Ty);
173  llvm::Type *LTy = OrigLTy;
174  if (getContext().getLangOpts().HIP) {
175  LTy = coerceKernelArgumentType(
176  OrigLTy, /*FromAS=*/getContext().getTargetAddressSpace(LangAS::Default),
177  /*ToAS=*/getContext().getTargetAddressSpace(LangAS::cuda_device));
178  }
179 
180  // FIXME: Should also use this for OpenCL, but it requires addressing the
181  // problem of kernels being called.
182  //
183  // FIXME: This doesn't apply the optimization of coercing pointers in structs
184  // to global address space when using byref. This would require implementing a
185  // new kind of coercion of the in-memory type for indirect arguments.
186  if (!getContext().getLangOpts().OpenCL && LTy == OrigLTy &&
187  isAggregateTypeForABI(Ty)) {
188  return ABIArgInfo::getIndirectAliased(
189  getContext().getTypeAlignInChars(Ty),
190  getContext().getTargetAddressSpace(LangAS::opencl_constant),
191  false /*Realign*/, nullptr /*Padding*/);
192  }
193 
194  // If we set CanBeFlattened to true, CodeGen will expand the struct to its
195  // individual elements, which confuses the Clover OpenCL backend; therefore we
196  // have to set it to false here. Other args of getDirect() are just defaults.
197  return ABIArgInfo::getDirect(LTy, 0, nullptr, false);
198 }
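// --- Illustrative example (editor's addition, not part of AMDGPU.cpp) ---
// A minimal sketch of what the HIP coercion above does: a generic (addrspace 0)
// scalar pointer kernel argument is rewritten to a global (addrspace 1) pointer
// in the kernel's IR signature. The kernel name and parameter are hypothetical,
// and name mangling is omitted:
//
//   __global__ void scale(float *p, float s) { p[threadIdx.x] *= s; }
//
// is lowered roughly to (assuming the usual AMDGPU address-space mapping):
//
//   define amdgpu_kernel void @scale(ptr addrspace(1) %p, float %s)
//
// whereas the same parameter in a non-kernel device function keeps its generic
// pointer type.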
199 
200 ABIArgInfo AMDGPUABIInfo::classifyArgumentType(QualType Ty,
201  unsigned &NumRegsLeft) const {
202  assert(NumRegsLeft <= MaxNumRegsForArgsRet && "register estimate underflow");
203 
204  Ty = useFirstFieldIfTransparentUnion(Ty);
205 
206  if (isAggregateTypeForABI(Ty)) {
207  // Records with non-trivial destructors/copy-constructors should not be
208  // passed by value.
209  if (auto RAA = getRecordArgABI(Ty, getCXXABI()))
210  return getNaturalAlignIndirect(Ty, RAA == CGCXXABI::RAA_DirectInMemory);
211 
212  // Ignore empty structs/unions.
213  if (isEmptyRecord(getContext(), Ty, true))
214  return ABIArgInfo::getIgnore();
215 
216  // Lower single-element structs to just pass a regular value. TODO: We
217  // could do reasonable-size multiple-element structs too, using getExpand(),
218  // though watch out for things like bitfields.
219  if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
220  return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));
221 
222  if (const RecordType *RT = Ty->getAs<RecordType>()) {
223  const RecordDecl *RD = RT->getDecl();
224  if (RD->hasFlexibleArrayMember())
225  return DefaultABIInfo::classifyArgumentType(Ty);
226  }
227 
228  // Pack aggregates <= 8 bytes into single VGPR or pair.
229  uint64_t Size = getContext().getTypeSize(Ty);
230  if (Size <= 64) {
231  unsigned NumRegs = (Size + 31) / 32;
232  NumRegsLeft -= std::min(NumRegsLeft, NumRegs);
233 
234  if (Size <= 16)
235  return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));
236 
237  if (Size <= 32)
238  return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));
239 
240  // XXX: Should this be i64 instead, and should the limit increase?
241  llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
242  return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
243  }
244 
245  if (NumRegsLeft > 0) {
246  unsigned NumRegs = numRegsForType(Ty);
247  if (NumRegsLeft >= NumRegs) {
248  NumRegsLeft -= NumRegs;
249  return ABIArgInfo::getDirect();
250  }
251  }
252 
253  // Use pass-by-reference instead of pass-by-value for struct arguments in
254  // function ABI.
255  return ABIArgInfo::getIndirectAliased(
256  getContext().getTypeAlignInChars(Ty),
257  getContext().getTargetAddressSpace(LangAS::opencl_private));
258  }
259 
260  // Otherwise just do the default thing.
261  ABIArgInfo ArgInfo = DefaultABIInfo::classifyArgumentType(Ty);
262  if (!ArgInfo.isIndirect()) {
263  unsigned NumRegs = numRegsForType(Ty);
264  NumRegsLeft -= std::min(NumRegs, NumRegsLeft);
265  }
266 
267  return ArgInfo;
268 }
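// --- Illustrative example (editor's addition, not part of AMDGPU.cpp) ---
// Sketch of the packing rules above, with hypothetical types: a two-float
// struct (8 bytes) is passed directly as [2 x i32]; a single-int struct is
// lowered to a plain i32 via the single-element-struct rule; a struct larger
// than MaxNumRegsForArgsRet 32-bit registers falls back to the byref
// (indirect-aliased) path in the private address space (typically
// addrspace(5) on amdgcn).
//
//   struct Pair { float x, y; };  // -> direct, coerced to [2 x i32]
//   struct One  { int v; };       // -> direct, i32
//   struct Big  { int a[32]; };   // -> indirect (byref), private addrspace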
269 
270 class AMDGPUTargetCodeGenInfo : public TargetCodeGenInfo {
271 public:
272  AMDGPUTargetCodeGenInfo(CodeGenTypes &CGT)
273  : TargetCodeGenInfo(std::make_unique<AMDGPUABIInfo>(CGT)) {}
274 
275  void setFunctionDeclAttributes(const FunctionDecl *FD, llvm::Function *F,
276  CodeGenModule &CGM) const;
277 
278  void emitTargetGlobals(CodeGen::CodeGenModule &CGM) const override;
279 
280  void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
281  CodeGen::CodeGenModule &M) const override;
282  unsigned getOpenCLKernelCallingConv() const override;
283 
284  llvm::Constant *getNullPointer(const CodeGen::CodeGenModule &CGM,
285  llvm::PointerType *T, QualType QT) const override;
286 
287  LangAS getASTAllocaAddressSpace() const override {
288  return getLangASFromTargetAS(
289  getABIInfo().getDataLayout().getAllocaAddrSpace());
290  }
291  LangAS getGlobalVarAddressSpace(CodeGenModule &CGM,
292  const VarDecl *D) const override;
293  llvm::SyncScope::ID getLLVMSyncScopeID(const LangOptions &LangOpts,
294  SyncScope Scope,
295  llvm::AtomicOrdering Ordering,
296  llvm::LLVMContext &Ctx) const override;
297  llvm::Value *createEnqueuedBlockKernel(CodeGenFunction &CGF,
298  llvm::Function *BlockInvokeFunc,
299  llvm::Type *BlockTy) const override;
300  bool shouldEmitStaticExternCAliases() const override;
301  bool shouldEmitDWARFBitFieldSeparators() const override;
302  void setCUDAKernelCallingConvention(const FunctionType *&FT) const override;
303 
304 private:
305  // Adds a NamedMDNode with GV, Name, and Operand as operands, and adds the
306  // resulting MDNode to the amdgcn.annotations MDNode.
307  static void addAMDGCNMetadata(llvm::GlobalValue *GV, StringRef Name,
308  int Operand);
309 };
310 }
311 
312 static bool requiresAMDGPUProtectedVisibility(const Decl *D,
313  llvm::GlobalValue *GV) {
314  if (GV->getVisibility() != llvm::GlobalValue::HiddenVisibility)
315  return false;
316 
317  return !D->hasAttr<OMPDeclareTargetDeclAttr>() &&
318  (D->hasAttr<OpenCLKernelAttr>() ||
319  (isa<FunctionDecl>(D) && D->hasAttr<CUDAGlobalAttr>()) ||
320  (isa<VarDecl>(D) &&
321  (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
322  cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinSurfaceType() ||
323  cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinTextureType())));
324 }
325 
326 void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
327  const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const {
328  const auto *ReqdWGS =
329  M.getLangOpts().OpenCL ? FD->getAttr<ReqdWorkGroupSizeAttr>() : nullptr;
330  const bool IsOpenCLKernel =
331  M.getLangOpts().OpenCL && FD->hasAttr<OpenCLKernelAttr>();
332  const bool IsHIPKernel = M.getLangOpts().HIP && FD->hasAttr<CUDAGlobalAttr>();
333 
334  const auto *FlatWGS = FD->getAttr<AMDGPUFlatWorkGroupSizeAttr>();
335  if (ReqdWGS || FlatWGS) {
336  M.handleAMDGPUFlatWorkGroupSizeAttr(F, FlatWGS, ReqdWGS);
337  } else if (IsOpenCLKernel || IsHIPKernel) {
338  // By default, restrict the maximum size to a value specified by
339  // --gpu-max-threads-per-block=n or its default value for HIP.
340  const unsigned OpenCLDefaultMaxWorkGroupSize = 256;
341  const unsigned DefaultMaxWorkGroupSize =
342  IsOpenCLKernel ? OpenCLDefaultMaxWorkGroupSize
343  : M.getLangOpts().GPUMaxThreadsPerBlock;
344  std::string AttrVal =
345  std::string("1,") + llvm::utostr(DefaultMaxWorkGroupSize);
346  F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
347  }
348 
349  if (const auto *Attr = FD->getAttr<AMDGPUWavesPerEUAttr>())
350  M.handleAMDGPUWavesPerEUAttr(F, Attr);
351 
352  if (const auto *Attr = FD->getAttr<AMDGPUNumSGPRAttr>()) {
353  unsigned NumSGPR = Attr->getNumSGPR();
354 
355  if (NumSGPR != 0)
356  F->addFnAttr("amdgpu-num-sgpr", llvm::utostr(NumSGPR));
357  }
358 
359  if (const auto *Attr = FD->getAttr<AMDGPUNumVGPRAttr>()) {
360  uint32_t NumVGPR = Attr->getNumVGPR();
361 
362  if (NumVGPR != 0)
363  F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
364  }
365 
366  if (const auto *Attr = FD->getAttr<AMDGPUMaxNumWorkGroupsAttr>()) {
367  uint32_t X = Attr->getMaxNumWorkGroupsX()
368  ->EvaluateKnownConstInt(M.getContext())
369  .getExtValue();
370  // Y and Z dimensions default to 1 if not specified
371  uint32_t Y = Attr->getMaxNumWorkGroupsY()
372  ? Attr->getMaxNumWorkGroupsY()
373  ->EvaluateKnownConstInt(M.getContext())
374  .getExtValue()
375  : 1;
376  uint32_t Z = Attr->getMaxNumWorkGroupsZ()
377  ? Attr->getMaxNumWorkGroupsZ()
378  ->EvaluateKnownConstInt(M.getContext())
379  .getExtValue()
380  : 1;
381 
382  llvm::SmallString<32> AttrVal;
383  llvm::raw_svector_ostream OS(AttrVal);
384  OS << X << ',' << Y << ',' << Z;
385 
386  F->addFnAttr("amdgpu-max-num-workgroups", AttrVal.str());
387  }
388 }
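// --- Illustrative example (editor's addition, not part of AMDGPU.cpp) ---
// Rough sketch of how the source-level attributes handled above surface as IR
// function attributes (the kernel and attribute values are hypothetical):
//
//   __attribute__((amdgpu_flat_work_group_size(64, 256)))
//   __attribute__((amdgpu_waves_per_eu(2, 4)))
//   __kernel void k(__global int *p) { ... }
//
// is expected to produce, roughly:
//
//   attributes #0 = { "amdgpu-flat-work-group-size"="64,256"
//                     "amdgpu-waves-per-eu"="2,4" ... }
//
// An OpenCL or HIP kernel without an explicit attribute instead gets the
// default "amdgpu-flat-work-group-size"="1,<max>" described in the code above.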
389 
390 /// Helper function for AMDGCN and NVVM targets, adds a NamedMDNode with GV,
391 /// Name, and Operand as operands, and adds the resulting MDNode to the
392 /// AnnotationName MDNode.
393 static void addAMDGCOrNVVMMetadata(const char *AnnotationName,
394  llvm::GlobalValue *GV, StringRef Name,
395  int Operand) {
396  llvm::Module *M = GV->getParent();
397  llvm::LLVMContext &Ctx = M->getContext();
398 
399  // Get annotations metadata node.
400  llvm::NamedMDNode *MD = M->getOrInsertNamedMetadata(AnnotationName);
401 
402  llvm::Metadata *MDVals[] = {
403  llvm::ConstantAsMetadata::get(GV), llvm::MDString::get(Ctx, Name),
404  llvm::ConstantAsMetadata::get(
405  llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), Operand))};
406  // Append metadata to annotations node.
407  MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
408 }
409 
410 
411 void AMDGPUTargetCodeGenInfo::addAMDGCNMetadata(llvm::GlobalValue *GV,
412  StringRef Name, int Operand) {
413  addAMDGCOrNVVMMetadata("amdgcn.annotations", GV, Name, Operand);
414 }
415 
416 
417 /// Emits control constants used to change per-architecture behaviour in the
418 /// AMDGPU ROCm device libraries.
419 void AMDGPUTargetCodeGenInfo::emitTargetGlobals(
420  CodeGen::CodeGenModule &CGM) const {
421  StringRef Name = "__oclc_ABI_version";
422  llvm::GlobalVariable *OriginalGV = CGM.getModule().getNamedGlobal(Name);
423  if (OriginalGV && !llvm::GlobalVariable::isExternalLinkage(OriginalGV->getLinkage()))
424  return;
425 
426  if (CGM.getTarget().getTargetOpts().CodeObjectVersion ==
427  llvm::CodeObjectVersionKind::COV_None)
428  return;
429 
430  auto *Type = llvm::IntegerType::getIntNTy(CGM.getModule().getContext(), 32);
431  llvm::Constant *COV = llvm::ConstantInt::get(
432  Type, CGM.getTarget().getTargetOpts().CodeObjectVersion);
433 
434  // It needs to be constant weak_odr without externally_initialized so that
435  // the load instruction can be eliminated by the IPSCCP.
436  auto *GV = new llvm::GlobalVariable(
437  CGM.getModule(), Type, true, llvm::GlobalValue::WeakODRLinkage, COV, Name,
438  nullptr, llvm::GlobalValue::ThreadLocalMode::NotThreadLocal,
439  CGM.getContext().getTargetAddressSpace(LangAS::opencl_constant));
440  GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Local);
441  GV->setVisibility(llvm::GlobalValue::VisibilityTypes::HiddenVisibility);
442 
443  // Replace any external references to this variable with the new global.
444  if (OriginalGV) {
445  OriginalGV->replaceAllUsesWith(GV);
446  GV->takeName(OriginalGV);
447  OriginalGV->eraseFromParent();
448  }
449 }
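// --- Illustrative example (editor's addition, not part of AMDGPU.cpp) ---
// Sketch of the control constant emitted above: when compiling for code object
// v5, the module is expected to end up with a hidden weak_odr constant that the
// ROCm device libraries read, roughly:
//
//   @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4)
//       constant i32 500
//
// Any pre-existing external declaration of the symbol is replaced with this
// definition, as the code above shows.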
450 
451 void AMDGPUTargetCodeGenInfo::setTargetAttributes(
452  const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
454  GV->setVisibility(llvm::GlobalValue::ProtectedVisibility);
455  GV->setDSOLocal(true);
456  }
457 
458  if (GV->isDeclaration())
459  return;
460 
461  llvm::Function *F = dyn_cast<llvm::Function>(GV);
462  if (!F)
463  return;
464 
465  const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D);
466  if (FD)
467  setFunctionDeclAttributes(FD, F, M);
468 
469  // Create !{<func-ref>, metadata !"kernel", i32 1} node for SYCL kernels.
470  const bool IsSYCLKernel =
471  FD && M.getLangOpts().SYCLIsDevice && FD->hasAttr<SYCLKernelAttr>();
472  if (IsSYCLKernel)
473  addAMDGCNMetadata(F, "kernel", 1);
474 
475  if (M.getContext().getTargetInfo().allowAMDGPUUnsafeFPAtomics())
476  F->addFnAttr("amdgpu-unsafe-fp-atomics", "true");
477 
478  if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts)
479  F->addFnAttr("amdgpu-ieee", "false");
480 }
481 
482 unsigned AMDGPUTargetCodeGenInfo::getOpenCLKernelCallingConv() const {
483  return llvm::CallingConv::AMDGPU_KERNEL;
484 }
485 
486 // Currently LLVM assumes null pointers always have value 0,
487 // which results in incorrectly transformed IR. Therefore, instead of
488 // emitting null pointers in private and local address spaces, a null
489  // pointer in generic address space is emitted which is cast to a
490 // pointer in local or private address space.
491 llvm::Constant *AMDGPUTargetCodeGenInfo::getNullPointer(
492  const CodeGen::CodeGenModule &CGM, llvm::PointerType *PT,
493  QualType QT) const {
494  if (CGM.getContext().getTargetNullPointerValue(QT) == 0)
495  return llvm::ConstantPointerNull::get(PT);
496 
497  auto &Ctx = CGM.getContext();
498  auto NPT = llvm::PointerType::get(
499  PT->getContext(), Ctx.getTargetAddressSpace(LangAS::opencl_generic));
500  return llvm::ConstantExpr::getAddrSpaceCast(
501  llvm::ConstantPointerNull::get(NPT), PT);
502 }
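// --- Illustrative example (editor's addition, not part of AMDGPU.cpp) ---
// Sketch of the null-pointer handling above: on AMDGPU the private and local
// address spaces use a non-zero (all-ones) null value, so a null pointer
// constant in those spaces is emitted as an addrspacecast of the generic null
// rather than a literal zero:
//
//   ; OpenCL source: __local int *p = 0;
//   ; stores addrspacecast (ptr null to ptr addrspace(3)) into %p
//
// For address spaces whose null value is 0 (e.g. generic or global), a plain
// ConstantPointerNull is used instead.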
503 
504 LangAS
505 AMDGPUTargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM,
506  const VarDecl *D) const {
507  assert(!CGM.getLangOpts().OpenCL &&
508  !(CGM.getLangOpts().CUDA && CGM.getLangOpts().CUDAIsDevice) &&
509  "Address space agnostic languages only");
510  LangAS DefaultGlobalAS = getLangASFromTargetAS(
511  CGM.getContext().getTargetAddressSpace(LangAS::Default));
512  if (!D)
513  return DefaultGlobalAS;
514 
515  LangAS AddrSpace = D->getType().getAddressSpace();
516  if (AddrSpace != LangAS::Default)
517  return AddrSpace;
518 
519  // Only promote to address space 4 if VarDecl has constant initialization.
520  if (D->getType().isConstantStorage(CGM.getContext(), false, false) &&
521  D->hasConstantInitialization()) {
522  if (auto ConstAS = CGM.getTarget().getConstantAddressSpace())
523  return *ConstAS;
524  }
525  return DefaultGlobalAS;
526 }
527 
528 llvm::SyncScope::ID
529 AMDGPUTargetCodeGenInfo::getLLVMSyncScopeID(const LangOptions &LangOpts,
530  SyncScope Scope,
531  llvm::AtomicOrdering Ordering,
532  llvm::LLVMContext &Ctx) const {
533  std::string Name;
534  switch (Scope) {
535  case SyncScope::HIPSingleThread:
536  case SyncScope::SingleScope:
537  Name = "singlethread";
538  break;
539  case SyncScope::HIPWavefront:
540  case SyncScope::OpenCLSubGroup:
541  case SyncScope::WavefrontScope:
542  Name = "wavefront";
543  break;
544  case SyncScope::HIPWorkgroup:
545  case SyncScope::OpenCLWorkGroup:
546  case SyncScope::WorkgroupScope:
547  Name = "workgroup";
548  break;
549  case SyncScope::HIPAgent:
550  case SyncScope::OpenCLDevice:
551  case SyncScope::DeviceScope:
552  Name = "agent";
553  break;
554  case SyncScope::SystemScope:
555  case SyncScope::HIPSystem:
556  case SyncScope::OpenCLAllSVMDevices:
557  Name = "";
558  break;
559  }
560 
561  if (Ordering != llvm::AtomicOrdering::SequentiallyConsistent) {
562  if (!Name.empty())
563  Name = Twine(Twine(Name) + Twine("-")).str();
564 
565  Name = Twine(Twine(Name) + Twine("one-as")).str();
566  }
567 
568  return Ctx.getOrInsertSyncScopeID(Name);
569 }
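// --- Illustrative example (editor's addition, not part of AMDGPU.cpp) ---
// Sketch of the sync-scope naming above: an OpenCL work_group-scope atomic with
// a relaxed (non-seq_cst) ordering maps to the "workgroup-one-as" LLVM sync
// scope, while the same atomic with memory_order_seq_cst maps to "workgroup":
//
//   %r = atomicrmw add ptr addrspace(3) %p, i32 1 syncscope("workgroup-one-as") monotonic
//   %r = atomicrmw add ptr addrspace(3) %p, i32 1 syncscope("workgroup") seq_cst
//
// A seq_cst system-scope atomic falls into the last case and uses the default
// (empty-named) sync scope.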
570 
571 bool AMDGPUTargetCodeGenInfo::shouldEmitStaticExternCAliases() const {
572  return false;
573 }
574 
575 bool AMDGPUTargetCodeGenInfo::shouldEmitDWARFBitFieldSeparators() const {
576  return true;
577 }
578 
579 void AMDGPUTargetCodeGenInfo::setCUDAKernelCallingConvention(
580  const FunctionType *&FT) const {
581  FT = getABIInfo().getContext().adjustFunctionType(
582  FT, FT->getExtInfo().withCallingConv(CC_OpenCLKernel));
583 }
584 
585 /// Create an OpenCL kernel for an enqueued block.
586 ///
587 /// The type of the first argument (the block literal) is the struct type
588 /// of the block literal instead of a pointer type. The first argument
589 /// (block literal) is passed directly by value to the kernel. The kernel
590 /// allocates the same type of struct on stack and stores the block literal
591 /// to it and passes its pointer to the block invoke function. The kernel
592 /// has "enqueued-block" function attribute and kernel argument metadata.
593 llvm::Value *AMDGPUTargetCodeGenInfo::createEnqueuedBlockKernel(
594  CodeGenFunction &CGF, llvm::Function *Invoke, llvm::Type *BlockTy) const {
595  auto &Builder = CGF.Builder;
596  auto &C = CGF.getLLVMContext();
597 
598  auto *InvokeFT = Invoke->getFunctionType();
599  llvm::SmallVector<llvm::Type *, 2> ArgTys;
600  llvm::SmallVector<llvm::Metadata *, 8> AddressQuals;
601  llvm::SmallVector<llvm::Metadata *, 8> AccessQuals;
602  llvm::SmallVector<llvm::Metadata *, 8> ArgBaseTypeNames;
603  llvm::SmallVector<llvm::Metadata *, 8> ArgTypeNames;
604  llvm::SmallVector<llvm::Metadata *, 8> ArgTypeQuals;
605  llvm::SmallVector<llvm::Metadata *, 8> ArgNames;
606 
607  ArgTys.push_back(BlockTy);
608  ArgTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
609  AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(0)));
610  ArgBaseTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
611  ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
612  AccessQuals.push_back(llvm::MDString::get(C, "none"));
613  ArgNames.push_back(llvm::MDString::get(C, "block_literal"));
614  for (unsigned I = 1, E = InvokeFT->getNumParams(); I < E; ++I) {
615  ArgTys.push_back(InvokeFT->getParamType(I));
616  ArgTypeNames.push_back(llvm::MDString::get(C, "void*"));
617  AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(3)));
618  AccessQuals.push_back(llvm::MDString::get(C, "none"));
619  ArgBaseTypeNames.push_back(llvm::MDString::get(C, "void*"));
620  ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
621  ArgNames.push_back(
622  llvm::MDString::get(C, (Twine("local_arg") + Twine(I)).str()));
623  }
624  std::string Name = Invoke->getName().str() + "_kernel";
625  auto *FT = llvm::FunctionType::get(llvm::Type::getVoidTy(C), ArgTys, false);
626  auto *F = llvm::Function::Create(FT, llvm::GlobalValue::InternalLinkage, Name,
627  &CGF.CGM.getModule());
628  F->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
629 
630  llvm::AttrBuilder KernelAttrs(C);
631  // FIXME: The invoke isn't applying the right attributes either
632  // FIXME: This is missing setTargetAttributes
633  CGF.CGM.addDefaultFunctionDefinitionAttributes(KernelAttrs);
634  KernelAttrs.addAttribute("enqueued-block");
635  F->addFnAttrs(KernelAttrs);
636 
637  auto IP = CGF.Builder.saveIP();
638  auto *BB = llvm::BasicBlock::Create(C, "entry", F);
639  Builder.SetInsertPoint(BB);
640  const auto BlockAlign = CGF.CGM.getDataLayout().getPrefTypeAlign(BlockTy);
641  auto *BlockPtr = Builder.CreateAlloca(BlockTy, nullptr);
642  BlockPtr->setAlignment(BlockAlign);
643  Builder.CreateAlignedStore(F->arg_begin(), BlockPtr, BlockAlign);
644  auto *Cast = Builder.CreatePointerCast(BlockPtr, InvokeFT->getParamType(0));
645  llvm::SmallVector<llvm::Value *, 2> Args;
646  Args.push_back(Cast);
647  for (llvm::Argument &A : llvm::drop_begin(F->args()))
648  Args.push_back(&A);
649  llvm::CallInst *call = Builder.CreateCall(Invoke, Args);
650  call->setCallingConv(Invoke->getCallingConv());
651  Builder.CreateRetVoid();
652  Builder.restoreIP(IP);
653 
654  F->setMetadata("kernel_arg_addr_space", llvm::MDNode::get(C, AddressQuals));
655  F->setMetadata("kernel_arg_access_qual", llvm::MDNode::get(C, AccessQuals));
656  F->setMetadata("kernel_arg_type", llvm::MDNode::get(C, ArgTypeNames));
657  F->setMetadata("kernel_arg_base_type",
658  llvm::MDNode::get(C, ArgBaseTypeNames));
659  F->setMetadata("kernel_arg_type_qual", llvm::MDNode::get(C, ArgTypeQuals));
660  if (CGF.CGM.getCodeGenOpts().EmitOpenCLArgMetadata)
661  F->setMetadata("kernel_arg_name", llvm::MDNode::get(C, ArgNames));
662 
663  return F;
664 }
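// --- Illustrative example (editor's addition, not part of AMDGPU.cpp) ---
// Sketch of the wrapper this function builds, for a hypothetical block invoke
// function @__block_invoke taking the block literal pointer plus one local
// pointer argument (address-space casts and alignments omitted):
//
//   define internal amdgpu_kernel void @__block_invoke_kernel(
//       %block.literal %lit, ptr addrspace(3) %local_arg1) #0 {
//   entry:
//     %block = alloca %block.literal
//     store %block.literal %lit, ptr %block
//     call void @__block_invoke(ptr %block, ptr addrspace(3) %local_arg1)
//     ret void
//   }
//
//   attributes #0 = { "enqueued-block" ... }
//
// plus the kernel_arg_* metadata attached at the end of the function above.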
665 
666 void CodeGenModule::handleAMDGPUFlatWorkGroupSizeAttr(
667  llvm::Function *F, const AMDGPUFlatWorkGroupSizeAttr *FlatWGS,
668  const ReqdWorkGroupSizeAttr *ReqdWGS, int32_t *MinThreadsVal,
669  int32_t *MaxThreadsVal) {
670  unsigned Min = 0;
671  unsigned Max = 0;
672  if (FlatWGS) {
673  Min = FlatWGS->getMin()->EvaluateKnownConstInt(getContext()).getExtValue();
674  Max = FlatWGS->getMax()->EvaluateKnownConstInt(getContext()).getExtValue();
675  }
676  if (ReqdWGS && Min == 0 && Max == 0)
677  Min = Max = ReqdWGS->getXDim() * ReqdWGS->getYDim() * ReqdWGS->getZDim();
678 
679  if (Min != 0) {
680  assert(Min <= Max && "Min must be less than or equal Max");
681 
682  if (MinThreadsVal)
683  *MinThreadsVal = Min;
684  if (MaxThreadsVal)
685  *MaxThreadsVal = Max;
686  std::string AttrVal = llvm::utostr(Min) + "," + llvm::utostr(Max);
687  if (F)
688  F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
689  } else
690  assert(Max == 0 && "Max must be zero");
691 }
692 
693 void CodeGenModule::handleAMDGPUWavesPerEUAttr(
694  llvm::Function *F, const AMDGPUWavesPerEUAttr *Attr) {
695  unsigned Min =
696  Attr->getMin()->EvaluateKnownConstInt(getContext()).getExtValue();
697  unsigned Max =
698  Attr->getMax()
699  ? Attr->getMax()->EvaluateKnownConstInt(getContext()).getExtValue()
700  : 0;
701 
702  if (Min != 0) {
703  assert((Max == 0 || Min <= Max) && "Min must be less than or equal Max");
704 
705  std::string AttrVal = llvm::utostr(Min);
706  if (Max != 0)
707  AttrVal = AttrVal + "," + llvm::utostr(Max);
708  F->addFnAttr("amdgpu-waves-per-eu", AttrVal);
709  } else
710  assert(Max == 0 && "Max must be zero");
711 }
712 
713 std::unique_ptr<TargetCodeGenInfo>
714 CodeGen::createAMDGPUTargetCodeGenInfo(CodeGenModule &CGM) {
715  return std::make_unique<AMDGPUTargetCodeGenInfo>(CGM.getTypes());
716 }