//===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This provides a class for CUDA code generation targeting the NVIDIA CUDA
// runtime library.
//
//===----------------------------------------------------------------------===//

#include "CGCUDARuntime.h"
#include "CGCXXABI.h"
#include "CodeGenFunction.h"
#include "CodeGenModule.h"
#include "clang/AST/Decl.h"
#include "clang/Basic/Cuda.h"
#include "clang/CodeGen/CodeGenABITypes.h"
#include "clang/CodeGen/ConstantInitBuilder.h"
#include "llvm/Frontend/Offloading/Utility.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/ReplaceConstant.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/VirtualFileSystem.h"

using namespace clang;
using namespace CodeGen;

namespace {
constexpr unsigned CudaFatMagic = 0x466243b1;
constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"

class CGNVCUDARuntime : public CGCUDARuntime {

private:
  llvm::IntegerType *IntTy, *SizeTy;
  llvm::Type *VoidTy;
  llvm::PointerType *PtrTy;

  /// Convenience reference to LLVM Context
  llvm::LLVMContext &Context;
  /// Convenience reference to the current module
  llvm::Module &TheModule;
  /// Keeps track of kernel launch stubs and handles emitted in this module
  struct KernelInfo {
    llvm::Function *Kernel; // stub function to help launch kernel
    const Decl *D;
  };
  llvm::SmallVector<KernelInfo, 16> EmittedKernels;
  // Map a kernel mangled name to a symbol for identifying kernel in host code
  // For CUDA, the symbol for identifying the kernel is the same as the device
  // stub function. For HIP, they are different.
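  // For example (an illustrative sketch; exact names depend on the mangling
  // in use): for `__global__ void kern()`, CUDA uses the host-side stub
  // function itself as the handle, while HIP emits a separate global named
  // after the kernel whose initializer is the `__device_stub__`-prefixed
  // stub; see getKernelHandle() below.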
  llvm::DenseMap<StringRef, llvm::GlobalValue *> KernelHandles;
  // Map a kernel handle to the kernel stub.
  llvm::DenseMap<llvm::GlobalValue *, llvm::Function *> KernelStubs;
  struct VarInfo {
    llvm::GlobalVariable *Var;
    const VarDecl *D;
    DeviceVarFlags Flags;
  };
  llvm::SmallVector<VarInfo, 16> DeviceVars;
  /// Keeps track of variable containing handle of GPU binary. Populated by
  /// ModuleCtorFunction() and used to create corresponding cleanup calls in
  /// ModuleDtorFunction()
  llvm::GlobalVariable *GpuBinaryHandle = nullptr;
  /// Whether we generate relocatable device code.
  bool RelocatableDeviceCode;
  /// Mangle context for device.
  std::unique_ptr<MangleContext> DeviceMC;
  /// Some zeros used for GEPs.
  llvm::Constant *Zeros[2];

  llvm::FunctionCallee getSetupArgumentFn() const;
  llvm::FunctionCallee getLaunchFn() const;

  llvm::FunctionType *getRegisterGlobalsFnTy() const;
  llvm::FunctionType *getCallbackFnTy() const;
  llvm::FunctionType *getRegisterLinkedBinaryFnTy() const;
  std::string addPrefixToName(StringRef FuncName) const;
  std::string addUnderscoredPrefixToName(StringRef FuncName) const;

  /// Creates a function to register all kernel stubs generated in this module.
  llvm::Function *makeRegisterGlobalsFn();

  /// Helper function that generates a constant string and returns a pointer to
  /// the start of the string. The result of this function can be used anywhere
  /// where the C code specifies const char*.
  llvm::Constant *makeConstantString(const std::string &Str,
                                     const std::string &Name = "") {
    auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str());
    return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(),
                                                ConstStr.getPointer(), Zeros);
  }

  /// Helper function which generates an initialized constant array from Str,
  /// and optionally sets section name and alignment. AddNull specifies whether
  /// the array should have NUL termination.
  llvm::Constant *makeConstantArray(StringRef Str,
                                    StringRef Name = "",
                                    StringRef SectionName = "",
                                    unsigned Alignment = 0,
                                    bool AddNull = false) {
    llvm::Constant *Value =
        llvm::ConstantDataArray::getString(Context, Str, AddNull);
    auto *GV = new llvm::GlobalVariable(
        TheModule, Value->getType(), /*isConstant=*/true,
        llvm::GlobalValue::PrivateLinkage, Value, Name);
    if (!SectionName.empty()) {
      GV->setSection(SectionName);
      // Mark the address as used, which makes sure that this section isn't
      // merged and we will really have it in the object file.
      GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
    }
    if (Alignment)
      GV->setAlignment(llvm::Align(Alignment));
    return llvm::ConstantExpr::getGetElementPtr(GV->getValueType(), GV, Zeros);
  }

  /// Helper function that generates an empty dummy function returning void.
  llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
    assert(FnTy->getReturnType()->isVoidTy() &&
           "Can only generate dummy functions returning void!");
    llvm::Function *DummyFunc = llvm::Function::Create(
        FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule);

    llvm::BasicBlock *DummyBlock =
        llvm::BasicBlock::Create(Context, "", DummyFunc);
    CGBuilderTy FuncBuilder(CGM, Context);
    FuncBuilder.SetInsertPoint(DummyBlock);
    FuncBuilder.CreateRetVoid();

    return DummyFunc;
  }

  void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
  void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
  std::string getDeviceSideName(const NamedDecl *ND) override;

  void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var,
                         bool Extern, bool Constant) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Variable, Extern, Constant,
                           VD->hasAttr<HIPManagedAttr>(),
                           /*Normalized*/ false, 0}});
  }
  void registerDeviceSurf(const VarDecl *VD, llvm::GlobalVariable &Var,
                          bool Extern, int Type) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Surface, Extern, /*Constant*/ false,
                           /*Managed*/ false,
                           /*Normalized*/ false, Type}});
  }
  void registerDeviceTex(const VarDecl *VD, llvm::GlobalVariable &Var,
                         bool Extern, int Type, bool Normalized) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Texture, Extern, /*Constant*/ false,
                           /*Managed*/ false, Normalized, Type}});
  }

  /// Creates module constructor function
  llvm::Function *makeModuleCtorFunction();
  /// Creates module destructor function
  llvm::Function *makeModuleDtorFunction();
  /// Transform managed variables for device compilation.
  void transformManagedVars();
  /// Create offloading entries to register globals in RDC mode.
  void createOffloadingEntries();

public:
  CGNVCUDARuntime(CodeGenModule &CGM);

  llvm::GlobalValue *getKernelHandle(llvm::Function *F, GlobalDecl GD) override;
  llvm::Function *getKernelStub(llvm::GlobalValue *Handle) override {
    auto Loc = KernelStubs.find(Handle);
    assert(Loc != KernelStubs.end());
    return Loc->second;
  }
  void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
  void handleVarRegistration(const VarDecl *VD,
                             llvm::GlobalVariable &Var) override;
  void
  internalizeDeviceSideVar(const VarDecl *D,
                           llvm::GlobalValue::LinkageTypes &Linkage) override;

  llvm::Function *finalizeModule() override;
};

} // end anonymous namespace

std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
  if (CGM.getLangOpts().HIP)
    return ((Twine("hip") + Twine(FuncName)).str());
  return ((Twine("cuda") + Twine(FuncName)).str());
}
std::string
CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
  if (CGM.getLangOpts().HIP)
    return ((Twine("__hip") + Twine(FuncName)).str());
  return ((Twine("__cuda") + Twine(FuncName)).str());
}

static std::unique_ptr<MangleContext> InitDeviceMC(CodeGenModule &CGM) {
  // If the host and device have different C++ ABIs, mark it as the device
  // mangle context so that the mangling needs to retrieve the additional
  // device lambda mangling number instead of the regular host one.
  if (CGM.getContext().getAuxTargetInfo() &&
      CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() &&
      CGM.getContext().getAuxTargetInfo()->getCXXABI().isItaniumFamily()) {
    return std::unique_ptr<MangleContext>(
        CGM.getContext().createDeviceMangleContext(
            *CGM.getContext().getAuxTargetInfo()));
  }

  return std::unique_ptr<MangleContext>(CGM.getContext().createMangleContext(
      CGM.getContext().getAuxTargetInfo()));
}

CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
    : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
      TheModule(CGM.getModule()),
      RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
      DeviceMC(InitDeviceMC(CGM)) {
  IntTy = CGM.IntTy;
  SizeTy = CGM.SizeTy;
  VoidTy = CGM.VoidTy;
  Zeros[0] = llvm::ConstantInt::get(SizeTy, 0);
  Zeros[1] = Zeros[0];
  PtrTy = CGM.UnqualPtrTy;
}

llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const {
  // cudaError_t cudaSetupArgument(void *, size_t, size_t)
  llvm::Type *Params[] = {PtrTy, SizeTy, SizeTy};
  return CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, Params, false),
      addPrefixToName("SetupArgument"));
}

llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const {
  if (CGM.getLangOpts().HIP) {
    // hipError_t hipLaunchByPtr(char *);
    return CGM.CreateRuntimeFunction(
        llvm::FunctionType::get(IntTy, PtrTy, false), "hipLaunchByPtr");
  }
  // cudaError_t cudaLaunch(char *);
  return CGM.CreateRuntimeFunction(llvm::FunctionType::get(IntTy, PtrTy, false),
                                   "cudaLaunch");
}

llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const {
  return llvm::FunctionType::get(VoidTy, PtrTy, false);
}

llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const {
  return llvm::FunctionType::get(VoidTy, PtrTy, false);
}

llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const {
  auto *CallbackFnTy = getCallbackFnTy();
  auto *RegisterGlobalsFnTy = getRegisterGlobalsFnTy();
  llvm::Type *Params[] = {RegisterGlobalsFnTy->getPointerTo(), PtrTy,
                          PtrTy, CallbackFnTy->getPointerTo()};
  return llvm::FunctionType::get(VoidTy, Params, false);
}

std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) {
  GlobalDecl GD;
  // D could be either a kernel or a variable.
  if (auto *FD = dyn_cast<FunctionDecl>(ND))
    GD = GlobalDecl(FD, KernelReferenceKind::Kernel);
  else
    GD = GlobalDecl(ND);
  std::string DeviceSideName;
  MangleContext *MC;
  if (CGM.getLangOpts().CUDAIsDevice)
    MC = &CGM.getCXXABI().getMangleContext();
  else
    MC = DeviceMC.get();
  if (MC->shouldMangleDeclName(ND)) {
    SmallString<256> Buffer;
    llvm::raw_svector_ostream Out(Buffer);
    MC->mangleName(GD, Out);
    DeviceSideName = std::string(Out.str());
  } else
    DeviceSideName = std::string(ND->getIdentifier()->getName());

  // Make unique name for device side static file-scope variable for HIP.
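  // For instance (an illustrative sketch, not verbatim compiler output): a
  // static device variable mangled as `_ZL3var` may be externalized to
  // something like `_ZL3var__static__<cuid-hash>`, so that identically named
  // statics in other TUs cannot collide when -fgpu-rdc links device objects.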
  if (CGM.getContext().shouldExternalize(ND) &&
      CGM.getLangOpts().GPURelocatableDeviceCode) {
    SmallString<256> Buffer;
    llvm::raw_svector_ostream Out(Buffer);
    Out << DeviceSideName;
    CGM.printPostfixForExternalizedDecl(Out, ND);
    DeviceSideName = std::string(Out.str());
  }
  return DeviceSideName;
}

void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
                                     FunctionArgList &Args) {
  EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl});
  if (auto *GV =
          dyn_cast<llvm::GlobalVariable>(KernelHandles[CGF.CurFn->getName()])) {
    GV->setLinkage(CGF.CurFn->getLinkage());
    GV->setInitializer(CGF.CurFn);
  }
  if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                         CudaFeature::CUDA_USES_NEW_LAUNCH) ||
      (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI))
    emitDeviceStubBodyNew(CGF, Args);
  else
    emitDeviceStubBodyLegacy(CGF, Args);
}

// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local
// array and kernels are launched using cudaLaunchKernel().
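//
// As a rough sketch (illustrative only; `kern` and its parameters are made
// up, and the real stub is emitted as IR rather than C++), the body built
// below is equivalent to:
// \code
// void __device_stub__kern(int a, float b) {
//   void *kernel_args[2] = {&a, &b};
//   dim3 grid_dim, block_dim;
//   size_t shmem_size;
//   cudaStream_t stream;
//   __cudaPopCallConfiguration(&grid_dim, &block_dim, &shmem_size, &stream);
//   cudaLaunchKernel((void *)&__device_stub__kern, grid_dim, block_dim,
//                    kernel_args, shmem_size, stream);
// }
// \endcode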
void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
                                            FunctionArgList &Args) {
  // Build the shadow stack entry at the very start of the function.

  // Calculate amount of space we will need for all arguments. If we have no
  // args, allocate a single pointer so we still have a valid pointer to the
  // argument array that we can pass to runtime, even if it will be unused.
  Address KernelArgs = CGF.CreateTempAlloca(
      PtrTy, CharUnits::fromQuantity(16), "kernel_args",
      llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
  // Store pointers to the arguments in a locally allocated launch_args.
  for (unsigned i = 0; i < Args.size(); ++i) {
    llvm::Value *VarPtr = CGF.GetAddrOfLocalVar(Args[i]).emitRawPointer(CGF);
    llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, PtrTy);
    CGF.Builder.CreateDefaultAlignedStore(
        VoidVarPtr, CGF.Builder.CreateConstGEP1_32(
                        PtrTy, KernelArgs.emitRawPointer(CGF), i));
  }

  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");

  // Lookup cudaLaunchKernel/hipLaunchKernel function.
  // HIP kernel launching API name depends on -fgpu-default-stream option. For
  // the default value 'legacy', it is hipLaunchKernel. For 'per-thread',
  // it is hipLaunchKernel_spt.
  // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
  //                              void **args, size_t sharedMem,
  //                              cudaStream_t stream);
  // hipError_t hipLaunchKernel[_spt](const void *func, dim3 gridDim,
  //                                  dim3 blockDim, void **args,
  //                                  size_t sharedMem, hipStream_t stream);
  TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
  DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
  std::string KernelLaunchAPI = "LaunchKernel";
  if (CGF.getLangOpts().GPUDefaultStream ==
      LangOptions::GPUDefaultStreamKind::PerThread) {
    if (CGF.getLangOpts().HIP)
      KernelLaunchAPI = KernelLaunchAPI + "_spt";
    else if (CGF.getLangOpts().CUDA)
      KernelLaunchAPI = KernelLaunchAPI + "_ptsz";
  }
  auto LaunchKernelName = addPrefixToName(KernelLaunchAPI);
  const IdentifierInfo &cudaLaunchKernelII =
      CGM.getContext().Idents.get(LaunchKernelName);
  FunctionDecl *cudaLaunchKernelFD = nullptr;
  for (auto *Result : DC->lookup(&cudaLaunchKernelII)) {
    if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result))
      cudaLaunchKernelFD = FD;
  }

  if (cudaLaunchKernelFD == nullptr) {
    CGM.Error(CGF.CurFuncDecl->getLocation(),
              "Can't find declaration for " + LaunchKernelName);
    return;
  }
  // Create temporary dim3 grid_dim, block_dim.
  ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1);
  QualType Dim3Ty = GridDimParam->getType();
  Address GridDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim");
  Address BlockDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim");
  Address ShmemSize =
      CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size");
  Address Stream = CGF.CreateTempAlloca(PtrTy, CGM.getPointerAlign(), "stream");
  llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy,
                              {/*gridDim=*/GridDim.getType(),
                               /*blockDim=*/BlockDim.getType(),
                               /*ShmemSize=*/ShmemSize.getType(),
                               /*Stream=*/Stream.getType()},
                              /*isVarArg=*/false),
      addUnderscoredPrefixToName("PopCallConfiguration"));

  CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn, {GridDim.emitRawPointer(CGF),
                                                BlockDim.emitRawPointer(CGF),
                                                ShmemSize.emitRawPointer(CGF),
                                                Stream.emitRawPointer(CGF)});

  // Emit the call to cudaLaunch
  llvm::Value *Kernel =
      CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn->getName()], PtrTy);
  CallArgList LaunchKernelArgs;
  LaunchKernelArgs.add(RValue::get(Kernel),
                       cudaLaunchKernelFD->getParamDecl(0)->getType());
  LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::get(KernelArgs, CGF),
                       cudaLaunchKernelFD->getParamDecl(3)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)),
                       cudaLaunchKernelFD->getParamDecl(4)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)),
                       cudaLaunchKernelFD->getParamDecl(5)->getType());

  QualType QT = cudaLaunchKernelFD->getType();
  QualType CQT = QT.getCanonicalType();
  llvm::Type *Ty = CGM.getTypes().ConvertType(CQT);
  llvm::FunctionType *FTy = cast<llvm::FunctionType>(Ty);

  const CGFunctionInfo &FI =
      CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
  llvm::FunctionCallee cudaLaunchKernelFn =
      CGM.CreateRuntimeFunction(FTy, LaunchKernelName);
  CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(),
               LaunchKernelArgs);

  // To prevent CUDA device stub functions from being merged by ICF in MSVC
  // environment, create a unique global variable for each kernel and write to
  // the variable in the device stub.
  if (CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() &&
      !CGF.getLangOpts().HIP) {
    llvm::Function *KernelFunction = llvm::cast<llvm::Function>(Kernel);
    std::string GlobalVarName = (KernelFunction->getName() + ".id").str();

    llvm::GlobalVariable *HandleVar =
        CGM.getModule().getNamedGlobal(GlobalVarName);
    if (!HandleVar) {
      HandleVar = new llvm::GlobalVariable(
          CGM.getModule(), CGM.Int8Ty,
          /*Constant=*/false, KernelFunction->getLinkage(),
          llvm::ConstantInt::get(CGM.Int8Ty, 0), GlobalVarName);
      HandleVar->setDSOLocal(KernelFunction->isDSOLocal());
      HandleVar->setVisibility(KernelFunction->getVisibility());
      if (KernelFunction->hasComdat())
        HandleVar->setComdat(CGM.getModule().getOrInsertComdat(GlobalVarName));
    }

    CGF.Builder.CreateAlignedStore(llvm::ConstantInt::get(CGM.Int8Ty, 1),
                                   HandleVar, CharUnits::One(),
                                   /*IsVolatile=*/true);
  }

  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}

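// A rough sketch of the legacy stub emitted below (illustrative only; the
// kernel `kern`, its parameters, and the argument offsets are invented for
// this example):
// \code
// void __device_stub__kern(int a, float b) {
//   if (cudaSetupArgument(&a, sizeof(a), /*offset=*/0) != 0)
//     goto end;
//   if (cudaSetupArgument(&b, sizeof(b), /*offset=*/4) != 0)
//     goto end;
//   cudaLaunch((char *)&__device_stub__kern);
// end:;
// }
// \endcode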
void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
                                               FunctionArgList &Args) {
  // Emit a call to cudaSetupArgument for each arg in Args.
  llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
  CharUnits Offset = CharUnits::Zero();
  for (const VarDecl *A : Args) {
    auto TInfo = CGM.getContext().getTypeInfoInChars(A->getType());
    Offset = Offset.alignTo(TInfo.Align);
    llvm::Value *Args[] = {
        CGF.Builder.CreatePointerCast(
            CGF.GetAddrOfLocalVar(A).emitRawPointer(CGF), PtrTy),
        llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()),
        llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
    };
    llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args);
    llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0);
    llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero);
    llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next");
    CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock);
    CGF.EmitBlock(NextBlock);
    Offset += TInfo.Width;
  }

  // Emit the call to cudaLaunch
  llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
  llvm::Value *Arg =
      CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn->getName()], PtrTy);
  CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}

// Replace the original variable Var with the address loaded from variable
// ManagedVar populated by HIP runtime.
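//
// Conceptually (an illustrative sketch in source terms, not the emitted IR):
// every instruction or constant-expression use of `Var` is rewritten from
// `... Var ...` to `... (*ManagedVar) ...`, with constant expressions that
// mention `Var` first expanded into equivalent instructions so the load can
// be inserted.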
static void replaceManagedVar(llvm::GlobalVariable *Var,
                              llvm::GlobalVariable *ManagedVar) {
  SmallVector<SmallVector<llvm::User *, 8>, 8> WorkList;
  for (auto &&VarUse : Var->uses()) {
    WorkList.push_back({VarUse.getUser()});
  }
  while (!WorkList.empty()) {
    auto &&WorkItem = WorkList.pop_back_val();
    auto *U = WorkItem.back();
    if (isa<llvm::ConstantExpr>(U)) {
      for (auto &&UU : U->uses()) {
        WorkItem.push_back(UU.getUser());
        WorkList.push_back(WorkItem);
        WorkItem.pop_back();
      }
      continue;
    }
    if (auto *I = dyn_cast<llvm::Instruction>(U)) {
      llvm::Value *OldV = Var;
      llvm::Instruction *NewV =
          new llvm::LoadInst(Var->getType(), ManagedVar, "ld.managed", false,
                             llvm::Align(Var->getAlignment()), I);
      WorkItem.pop_back();
      // Replace constant expressions directly or indirectly using the managed
      // variable with instructions.
      for (auto &&Op : WorkItem) {
        auto *CE = cast<llvm::ConstantExpr>(Op);
        auto *NewInst = CE->getAsInstruction();
        NewInst->insertBefore(*I->getParent(), I->getIterator());
        NewInst->replaceUsesOfWith(OldV, NewV);
        OldV = CE;
        NewV = NewInst;
      }
      I->replaceUsesOfWith(OldV, NewV);
    } else {
      llvm_unreachable("Invalid use of managed variable");
    }
  }
}

/// Creates a function that sets up state on the host side for CUDA objects that
/// have a presence on both the host and device sides. Specifically, registers
/// the host side of kernel functions and device global variables with the CUDA
/// runtime.
/// \code
/// void __cuda_register_globals(void** GpuBinaryHandle) {
///   __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...);
///   ...
///   __cudaRegisterFunction(GpuBinaryHandle,KernelM,...);
///   __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...);
///   ...
///   __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...);
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
  // No need to register anything
  if (EmittedKernels.empty() && DeviceVars.empty())
    return nullptr;

  llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
      getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_register_globals"), &TheModule);
  llvm::BasicBlock *EntryBB =
      llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
  CGBuilderTy Builder(CGM, Context);
  Builder.SetInsertPoint(EntryBB);

  // void __cudaRegisterFunction(void **, const char *, char *, const char *,
  //                             int, uint3*, uint3*, dim3*, dim3*, int*)
  llvm::Type *RegisterFuncParams[] = {
      PtrTy, PtrTy, PtrTy, PtrTy, IntTy,
      PtrTy, PtrTy, PtrTy, PtrTy, IntTy->getPointerTo()};
  llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, RegisterFuncParams, false),
      addUnderscoredPrefixToName("RegisterFunction"));

  // Extract GpuBinaryHandle passed as the first argument passed to
  // __cuda_register_globals() and generate __cudaRegisterFunction() call for
  // each emitted kernel.
  llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
  for (auto &&I : EmittedKernels) {
    llvm::Constant *KernelName =
        makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D)));
    llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(PtrTy);
    llvm::Value *Args[] = {
        &GpuBinaryHandlePtr,
        KernelHandles[I.Kernel->getName()],
        KernelName,
        KernelName,
        llvm::ConstantInt::get(IntTy, -1),
        NullPtr,
        NullPtr,
        NullPtr,
        NullPtr,
        llvm::ConstantPointerNull::get(IntTy->getPointerTo())};
    Builder.CreateCall(RegisterFunc, Args);
  }

  llvm::Type *VarSizeTy = IntTy;
  // For HIP or CUDA 9.0+, device variable size is type of `size_t`.
  if (CGM.getLangOpts().HIP ||
      ToCudaVersion(CGM.getTarget().getSDKVersion()) >= CudaVersion::CUDA_90)
    VarSizeTy = SizeTy;

  // void __cudaRegisterVar(void **, char *, char *, const char *,
  //                        int, int, int, int)
  llvm::Type *RegisterVarParams[] = {PtrTy, PtrTy, PtrTy, PtrTy,
                                     IntTy, VarSizeTy, IntTy, IntTy};
  llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, RegisterVarParams, false),
      addUnderscoredPrefixToName("RegisterVar"));
  // void __hipRegisterManagedVar(void **, char *, char *, const char *,
  //                              size_t, unsigned)
  llvm::Type *RegisterManagedVarParams[] = {PtrTy, PtrTy, PtrTy,
                                            PtrTy, VarSizeTy, IntTy};
  llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, RegisterManagedVarParams, false),
      addUnderscoredPrefixToName("RegisterManagedVar"));
  // void __cudaRegisterSurface(void **, const struct surfaceReference *,
  //                            const void **, const char *, int, int);
  llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(
          VoidTy, {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy}, false),
      addUnderscoredPrefixToName("RegisterSurface"));
  // void __cudaRegisterTexture(void **, const struct textureReference *,
  //                            const void **, const char *, int, int, int)
  llvm::FunctionCallee RegisterTex = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(
          VoidTy, {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy, IntTy}, false),
      addUnderscoredPrefixToName("RegisterTexture"));
  for (auto &&Info : DeviceVars) {
    llvm::GlobalVariable *Var = Info.Var;
    assert((!Var->isDeclaration() || Info.Flags.isManaged()) &&
           "External variables should not show up here, except HIP managed "
           "variables");
    llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D));
    switch (Info.Flags.getKind()) {
    case DeviceVarFlags::Variable: {
      uint64_t VarSize =
          CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
      if (Info.Flags.isManaged()) {
        assert(Var->getName().ends_with(".managed") &&
               "HIP managed variables not transformed");
        auto *ManagedVar = CGM.getModule().getNamedGlobal(
            Var->getName().drop_back(StringRef(".managed").size()));
        llvm::Value *Args[] = {
            &GpuBinaryHandlePtr,
            ManagedVar,
            Var,
            VarName,
            llvm::ConstantInt::get(VarSizeTy, VarSize),
            llvm::ConstantInt::get(IntTy, Var->getAlignment())};
        if (!Var->isDeclaration())
          Builder.CreateCall(RegisterManagedVar, Args);
      } else {
        llvm::Value *Args[] = {
            &GpuBinaryHandlePtr,
            Var,
            VarName,
            VarName,
            llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
            llvm::ConstantInt::get(VarSizeTy, VarSize),
            llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()),
            llvm::ConstantInt::get(IntTy, 0)};
        Builder.CreateCall(RegisterVar, Args);
      }
      break;
    }
    case DeviceVarFlags::Surface:
      Builder.CreateCall(
          RegisterSurf,
          {&GpuBinaryHandlePtr, Var, VarName, VarName,
           llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
      break;
    case DeviceVarFlags::Texture:
      Builder.CreateCall(
          RegisterTex,
          {&GpuBinaryHandlePtr, Var, VarName, VarName,
           llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isNormalized()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
      break;
    }
  }

  Builder.CreateRetVoid();
  return RegisterKernelsFunc;
}

/// Creates a global constructor function for the module:
///
/// For CUDA:
/// \code
/// void __cuda_module_ctor() {
///   Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
///   __cuda_register_globals(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
/// void __hip_module_ctor() {
///   if (__hip_gpubin_handle == 0) {
///     __hip_gpubin_handle = __hipRegisterFatBinary(GpuBinaryBlob);
///     __hip_register_globals(__hip_gpubin_handle);
///   }
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
  bool IsHIP = CGM.getLangOpts().HIP;
  bool IsCUDA = CGM.getLangOpts().CUDA;
  // No need to generate ctors/dtors if there is no GPU binary.
  StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
  if (CudaGpuBinaryFileName.empty() && !IsHIP)
    return nullptr;
  if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() &&
      DeviceVars.empty())
    return nullptr;

  // void __{cuda|hip}_register_globals(void* handle);
  llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
  // We always need a function to pass in as callback. Create a dummy
  // implementation if we don't need to register anything.
  if (RelocatableDeviceCode && !RegisterGlobalsFunc)
    RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());

  // void ** __{cuda|hip}RegisterFatBinary(void *);
  llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(PtrTy, PtrTy, false),
      addUnderscoredPrefixToName("RegisterFatBinary"));
  // struct { int magic; int version; void *gpu_binary; void *dont_care; };
  llvm::StructType *FatbinWrapperTy =
      llvm::StructType::get(IntTy, IntTy, PtrTy, PtrTy);

  // Register GPU binary with the CUDA runtime, store returned handle in a
  // global variable and save a reference in GpuBinaryHandle to be cleaned up
  // in destructor on exit. Then associate all known kernels with the GPU binary
  // handle so CUDA runtime can figure out what to call on the GPU side.
  std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr;
  if (!CudaGpuBinaryFileName.empty()) {
    auto VFS = CGM.getFileSystem();
    auto CudaGpuBinaryOrErr =
        VFS->getBufferForFile(CudaGpuBinaryFileName, -1, false);
    if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
      CGM.getDiags().Report(diag::err_cannot_open_file)
          << CudaGpuBinaryFileName << EC.message();
      return nullptr;
    }
    CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
  }

  llvm::Function *ModuleCtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_ctor"), &TheModule);
  llvm::BasicBlock *CtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc);
  CGBuilderTy CtorBuilder(CGM, Context);

  CtorBuilder.SetInsertPoint(CtorEntryBB);

  const char *FatbinConstantName;
  const char *FatbinSectionName;
  const char *ModuleIDSectionName;
  StringRef ModuleIDPrefix;
  llvm::Constant *FatBinStr;
  unsigned FatMagic;
  if (IsHIP) {
    FatbinConstantName = ".hip_fatbin";
    FatbinSectionName = ".hipFatBinSegment";

    ModuleIDSectionName = "__hip_module_id";
    ModuleIDPrefix = "__hip_";

    if (CudaGpuBinary) {
      // If fatbin is available from early finalization, create a string
      // literal containing the fat binary loaded from the given file.
      const unsigned HIPCodeObjectAlign = 4096;
      FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()), "",
                                    FatbinConstantName, HIPCodeObjectAlign);
    } else {
      // If fatbin is not available, create an external symbol
      // __hip_fatbin in section .hip_fatbin. The external symbol is supposed
      // to contain the fat binary but will be populated somewhere else,
      // e.g. by lld through link script.
      FatBinStr = new llvm::GlobalVariable(
          CGM.getModule(), CGM.Int8Ty,
          /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
          "__hip_fatbin_" + CGM.getContext().getCUIDHash(), nullptr,
          llvm::GlobalVariable::NotThreadLocal);
      cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
    }

    FatMagic = HIPFatMagic;
  } else {
    if (RelocatableDeviceCode)
      FatbinConstantName = CGM.getTriple().isMacOSX()
                               ? "__NV_CUDA,__nv_relfatbin"
                               : "__nv_relfatbin";
    else
      FatbinConstantName =
          CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
    // NVIDIA's cuobjdump looks for fatbins in this section.
    FatbinSectionName =
        CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";

    ModuleIDSectionName = CGM.getTriple().isMacOSX()
                              ? "__NV_CUDA,__nv_module_id"
                              : "__nv_module_id";
    ModuleIDPrefix = "__nv_";

    // For CUDA, create a string literal containing the fat binary loaded from
    // the given file.
    FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()), "",
                                  FatbinConstantName, 8);
    FatMagic = CudaFatMagic;
  }

  // Create initialized wrapper structure that points to the loaded GPU binary
  ConstantInitBuilder Builder(CGM);
  auto Values = Builder.beginStruct(FatbinWrapperTy);
  // Fatbin wrapper magic.
  Values.addInt(IntTy, FatMagic);
  // Fatbin version.
  Values.addInt(IntTy, 1);
  // Data.
  Values.add(FatBinStr);
  // Unused in fatbin v1.
  Values.add(llvm::ConstantPointerNull::get(PtrTy));
  llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
      addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(),
      /*constant*/ true);
  FatbinWrapper->setSection(FatbinSectionName);

  // There is only one HIP fat binary per linked module, however there are
  // multiple constructor functions. Make sure the fat binary is registered
  // only once. The constructor functions are executed by the dynamic loader
  // before the program gains control. The dynamic loader cannot execute the
  // constructor functions concurrently since doing that would not guarantee
  // thread safety of the loaded program. Therefore we can assume sequential
  // execution of constructor functions here.
  if (IsHIP) {
    auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage
                                 : llvm::GlobalValue::ExternalLinkage;
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc);
    // The name, size, and initialization pattern of this variable is part
    // of HIP ABI.
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, PtrTy, /*isConstant=*/false, Linkage,
        /*Initializer=*/
        CudaGpuBinary ? llvm::ConstantPointerNull::get(PtrTy) : nullptr,
        CudaGpuBinary
            ? "__hip_gpubin_handle"
            : "__hip_gpubin_handle_" + CGM.getContext().getCUIDHash());
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
    // Prevent the weak symbol in different shared libraries being merged.
    if (Linkage != llvm::GlobalValue::InternalLinkage)
      GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
    Address GpuBinaryAddr(
        GpuBinaryHandle, PtrTy,
        CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
    {
      auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
      llvm::Constant *Zero =
          llvm::Constant::getNullValue(HandleValue->getType());
      llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero);
      CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(IfBlock);
      // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper);
      llvm::CallInst *RegisterFatbinCall =
          CtorBuilder.CreateCall(RegisterFatbinFunc, FatbinWrapper);
      CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
      CtorBuilder.CreateBr(ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(ExitBlock);
      // Call __hip_register_globals(GpuBinaryHandle);
      if (RegisterGlobalsFunc) {
        auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
        CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
      }
    }
  } else if (!RelocatableDeviceCode) {
    // Register binary with CUDA runtime. This is substantially different in
    // default mode vs. separate compilation!
    // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
    llvm::CallInst *RegisterFatbinCall =
        CtorBuilder.CreateCall(RegisterFatbinFunc, FatbinWrapper);
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, PtrTy, false, llvm::GlobalValue::InternalLinkage,
        llvm::ConstantPointerNull::get(PtrTy), "__cuda_gpubin_handle");
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
    CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
                                   CGM.getPointerAlign());

    // Call __cuda_register_globals(GpuBinaryHandle);
    if (RegisterGlobalsFunc)
      CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);

    // Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it.
    if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                           CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
      // void __cudaRegisterFatBinaryEnd(void **);
      llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction(
          llvm::FunctionType::get(VoidTy, PtrTy, false),
          "__cudaRegisterFatBinaryEnd");
      CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
    }
  } else {
    // Generate a unique module ID.
    SmallString<64> ModuleID;
    llvm::raw_svector_ostream OS(ModuleID);
    OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID());
    llvm::Constant *ModuleIDConstant = makeConstantArray(
        std::string(ModuleID), "", ModuleIDSectionName, 32, /*AddNull=*/true);

    // Create an alias for the FatbinWrapper that nvcc will look for.
    llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
                              Twine("__fatbinwrap") + ModuleID, FatbinWrapper);

    // void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
    //                                           void *, void (*)(void **))
    SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary");
    RegisterLinkedBinaryName += ModuleID;
    llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
        getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);

    assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
    llvm::Value *Args[] = {RegisterGlobalsFunc, FatbinWrapper, ModuleIDConstant,
                           makeDummyFunction(getCallbackFnTy())};
    CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
  }

  // Create destructor and register it with atexit() the way NVCC does it. Doing
  // it during regular destructor phase worked in CUDA before 9.2 but results in
  // double-free in 9.2.
  if (llvm::Function *CleanupFn = makeModuleDtorFunction()) {
    // extern "C" int atexit(void (*f)(void));
    llvm::FunctionType *AtExitTy =
        llvm::FunctionType::get(IntTy, CleanupFn->getType(), false);
    llvm::FunctionCallee AtExitFunc =
        CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(),
                                  /*Local=*/true);
    CtorBuilder.CreateCall(AtExitFunc, CleanupFn);
  }

  CtorBuilder.CreateRetVoid();
  return ModuleCtorFunc;
}

/// Creates a global destructor function that unregisters the GPU code blob
/// registered by constructor.
///
/// For CUDA:
/// \code
/// void __cuda_module_dtor() {
///   __cudaUnregisterFatBinary(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
/// void __hip_module_dtor() {
///   if (__hip_gpubin_handle) {
///     __hipUnregisterFatBinary(__hip_gpubin_handle);
///     __hip_gpubin_handle = 0;
///   }
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
  // No need for destructor if we don't have a handle to unregister.
  if (!GpuBinaryHandle)
    return nullptr;

  // void __cudaUnregisterFatBinary(void ** handle);
  llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, PtrTy, false),
      addUnderscoredPrefixToName("UnregisterFatBinary"));

  llvm::Function *ModuleDtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_dtor"), &TheModule);

  llvm::BasicBlock *DtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc);
  CGBuilderTy DtorBuilder(CGM, Context);
  DtorBuilder.SetInsertPoint(DtorEntryBB);

  Address GpuBinaryAddr(
      GpuBinaryHandle, GpuBinaryHandle->getValueType(),
      CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
  auto *HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
  // There is only one HIP fat binary per linked module, however there are
  // multiple destructor functions. Make sure the fat binary is unregistered
  // only once.
  if (CGM.getLangOpts().HIP) {
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc);
    llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType());
    llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero);
    DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);

    DtorBuilder.SetInsertPoint(IfBlock);
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
    DtorBuilder.CreateStore(Zero, GpuBinaryAddr);
    DtorBuilder.CreateBr(ExitBlock);

    DtorBuilder.SetInsertPoint(ExitBlock);
  } else {
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
  }
  DtorBuilder.CreateRetVoid();
  return ModuleDtorFunc;
}

CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
  return new CGNVCUDARuntime(CGM);
}

void CGNVCUDARuntime::internalizeDeviceSideVar(
    const VarDecl *D, llvm::GlobalValue::LinkageTypes &Linkage) {
  // For -fno-gpu-rdc, host-side shadows of external declarations of device-side
  // global variables become internal definitions. These have to be internal in
  // order to prevent name conflicts with global host variables with the same
  // name in different TUs.
  //
  // For -fgpu-rdc, the shadow variables should not be internalized because
  // they may be accessed by a different TU.
  if (CGM.getLangOpts().GPURelocatableDeviceCode)
    return;

  // __shared__ variables are odd. Shadows do get created, but
  // they are not registered with the CUDA runtime, so they
  // can't really be used to access their device-side
  // counterparts. It's not clear yet whether it's nvcc's bug or
  // a feature, but we've got to do the same for compatibility.
  if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
      D->hasAttr<CUDASharedAttr>() ||
      D->getType()->isCUDADeviceBuiltinSurfaceType() ||
      D->getType()->isCUDADeviceBuiltinTextureType()) {
    Linkage = llvm::GlobalValue::InternalLinkage;
  }
}

void CGNVCUDARuntime::handleVarRegistration(const VarDecl *D,
                                            llvm::GlobalVariable &GV) {
  if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>()) {
    // Shadow variables and their properties must be registered with CUDA
    // runtime. Skip Extern global variables, which will be registered in
    // the TU where they are defined.
    //
    // Don't register a C++17 inline variable. The local symbol can be
    // discarded and referencing a discarded local symbol from outside the
    // comdat (__cuda_register_globals) is disallowed by the ELF spec.
    //
    // HIP managed variables need to be always recorded in device and host
    // compilations for transformation.
    //
    // HIP managed variables and variables in CUDADeviceVarODRUsedByHost are
    // added to llvm.compiler-used, therefore they are safe to be registered.
    if ((!D->hasExternalStorage() && !D->isInline()) ||
        CGM.getContext().CUDADeviceVarODRUsedByHost.contains(D) ||
        D->hasAttr<HIPManagedAttr>()) {
      registerDeviceVar(D, GV, !D->hasDefinition(),
                        D->hasAttr<CUDAConstantAttr>());
    }
  } else if (D->getType()->isCUDADeviceBuiltinSurfaceType() ||
             D->getType()->isCUDADeviceBuiltinTextureType()) {
    // Builtin surfaces and textures and their template arguments are
    // also registered with CUDA runtime.
    const auto *TD = cast<ClassTemplateSpecializationDecl>(
        D->getType()->castAs<RecordType>()->getDecl());
    const TemplateArgumentList &Args = TD->getTemplateArgs();
    if (TD->hasAttr<CUDADeviceBuiltinSurfaceTypeAttr>()) {
      assert(Args.size() == 2 &&
             "Unexpected number of template arguments of CUDA device "
             "builtin surface type.");
      auto SurfType = Args[1].getAsIntegral();
      if (!D->hasExternalStorage())
        registerDeviceSurf(D, GV, !D->hasDefinition(), SurfType.getSExtValue());
    } else {
      assert(Args.size() == 3 &&
             "Unexpected number of template arguments of CUDA device "
             "builtin texture type.");
      auto TexType = Args[1].getAsIntegral();
      auto Normalized = Args[2].getAsIntegral();
      if (!D->hasExternalStorage())
        registerDeviceTex(D, GV, !D->hasDefinition(), TexType.getSExtValue(),
                          Normalized.getZExtValue());
    }
  }
}

// Transform managed variables to pointers to managed variables in device code.
// Each use of the original managed variable is replaced by a load from the
// transformed managed variable. The transformed managed variable contains
// the address of managed memory which will be allocated by the runtime.
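//
// As a rough sketch in HIP source terms (illustrative only; the actual
// transformation happens on LLVM IR):
// \code
// __managed__ int x;      // becomes: int *x; (runtime fills in the address;
//                         //          the original definition lives on as
//                         //          "x.managed")
// int f() { return x; }   // becomes: int f() { return *x; }
// \endcode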
void CGNVCUDARuntime::transformManagedVars() {
  for (auto &&Info : DeviceVars) {
    llvm::GlobalVariable *Var = Info.Var;
    if (Info.Flags.getKind() == DeviceVarFlags::Variable &&
        Info.Flags.isManaged()) {
      auto *ManagedVar = new llvm::GlobalVariable(
          CGM.getModule(), Var->getType(),
          /*isConstant=*/false, Var->getLinkage(),
          /*Init=*/Var->isDeclaration()
              ? nullptr
              : llvm::ConstantPointerNull::get(Var->getType()),
          /*Name=*/"", /*InsertBefore=*/nullptr,
          llvm::GlobalVariable::NotThreadLocal,
          CGM.getContext().getTargetAddressSpace(CGM.getLangOpts().CUDAIsDevice
                                                     ? LangAS::cuda_device
                                                     : LangAS::Default));
      ManagedVar->setDSOLocal(Var->isDSOLocal());
      ManagedVar->setVisibility(Var->getVisibility());
      ManagedVar->setExternallyInitialized(true);
      replaceManagedVar(Var, ManagedVar);
      ManagedVar->takeName(Var);
      Var->setName(Twine(ManagedVar->getName()) + ".managed");
      // Keep managed variables even if they are not used in device code since
      // they need to be allocated by the runtime.
      if (CGM.getLangOpts().CUDAIsDevice && !Var->isDeclaration()) {
        assert(!ManagedVar->isDeclaration());
        CGM.addCompilerUsedGlobal(Var);
        CGM.addCompilerUsedGlobal(ManagedVar);
      }
    }
  }
}

// Creates offloading entries for all the kernels and globals that must be
// registered. The linker will provide a pointer to this section so we can
// register the symbols with the linked device image.
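//
// Each emitted entry is a descriptor laid out roughly as follows (a sketch of
// the offloading entry record consumed by the offload runtime; field names
// here are for illustration and may differ from the exact struct in
// llvm/Frontend/Offloading):
// \code
// struct __tgt_offload_entry {
//   void *addr;    // Address of the kernel handle or global variable.
//   char *name;    // Device-side (mangled) symbol name.
//   size_t size;   // Size of the global in bytes; 0 for kernels.
//   int32_t flags; // OffloadGlobal* flags (extern, constant, managed, ...).
//   int32_t data;  // Extra data, e.g. the surface/texture type.
// };
// \endcode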
void CGNVCUDARuntime::createOffloadingEntries() {
  StringRef Section = CGM.getLangOpts().HIP ? "hip_offloading_entries"
                                            : "cuda_offloading_entries";
  llvm::Module &M = CGM.getModule();
  for (KernelInfo &I : EmittedKernels)
    llvm::offloading::emitOffloadingEntry(
        M, KernelHandles[I.Kernel->getName()],
        getDeviceSideName(cast<NamedDecl>(I.D)), /*Flags=*/0, /*Data=*/0,
        llvm::offloading::OffloadGlobalEntry, Section);

  for (VarInfo &I : DeviceVars) {
    uint64_t VarSize =
        CGM.getDataLayout().getTypeAllocSize(I.Var->getValueType());
    int32_t Flags =
        (I.Flags.isExtern()
             ? static_cast<int32_t>(llvm::offloading::OffloadGlobalExtern)
             : 0) |
        (I.Flags.isConstant()
             ? static_cast<int32_t>(llvm::offloading::OffloadGlobalConstant)
             : 0) |
        (I.Flags.isNormalized()
             ? static_cast<int32_t>(llvm::offloading::OffloadGlobalNormalized)
             : 0);
    if (I.Flags.getKind() == DeviceVarFlags::Variable) {
      llvm::offloading::emitOffloadingEntry(
          M, I.Var, getDeviceSideName(I.D), VarSize,
          (I.Flags.isManaged() ? llvm::offloading::OffloadGlobalManagedEntry
                               : llvm::offloading::OffloadGlobalEntry) |
              Flags,
          /*Data=*/0, Section);
    } else if (I.Flags.getKind() == DeviceVarFlags::Surface) {
      llvm::offloading::emitOffloadingEntry(
          M, I.Var, getDeviceSideName(I.D), VarSize,
          llvm::offloading::OffloadGlobalSurfaceEntry | Flags,
          I.Flags.getSurfTexType(), Section);
    } else if (I.Flags.getKind() == DeviceVarFlags::Texture) {
      llvm::offloading::emitOffloadingEntry(
          M, I.Var, getDeviceSideName(I.D), VarSize,
          llvm::offloading::OffloadGlobalTextureEntry | Flags,
          I.Flags.getSurfTexType(), Section);
    }
  }
}

// Returns module constructor to be added.
llvm::Function *CGNVCUDARuntime::finalizeModule() {
  transformManagedVars();
  if (CGM.getLangOpts().CUDAIsDevice) {
    // Mark ODR-used device variables as compiler used to prevent them from
    // being eliminated by optimization. This is necessary for device variables
    // ODR-used by host functions. Sema correctly marks them as ODR-used no
    // matter whether they are ODR-used by device or host functions.
    //
    // We do not need to do this if the variable has used attribute since it
    // has already been added.
    //
    // Static device variables have been externalized at this point, therefore
    // variables with LLVM private or internal linkage need not be added.
    for (auto &&Info : DeviceVars) {
      auto Kind = Info.Flags.getKind();
      if (!Info.Var->isDeclaration() &&
          !llvm::GlobalValue::isLocalLinkage(Info.Var->getLinkage()) &&
          (Kind == DeviceVarFlags::Variable ||
           Kind == DeviceVarFlags::Surface ||
           Kind == DeviceVarFlags::Texture) &&
          Info.D->isUsed() && !Info.D->hasAttr<UsedAttr>()) {
        CGM.addCompilerUsedGlobal(Info.Var);
      }
    }
    return nullptr;
  }
  if (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode)
    createOffloadingEntries();
  else
    return makeModuleCtorFunction();

  return nullptr;
}

llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F,
                                                    GlobalDecl GD) {
  auto Loc = KernelHandles.find(F->getName());
  if (Loc != KernelHandles.end()) {
    auto OldHandle = Loc->second;
    if (KernelStubs[OldHandle] == F)
      return OldHandle;

    // We've found the function name, but F itself has changed, so we need to
    // update the references.
    if (CGM.getLangOpts().HIP) {
      // For HIP compilation the handle itself does not change, so we only need
      // to update the Stub value.
      KernelStubs[OldHandle] = F;
      return OldHandle;
    }
    // For non-HIP compilation, erase the old Stub and fall-through to creating
    // new entries.
    KernelStubs.erase(OldHandle);
  }

  if (!CGM.getLangOpts().HIP) {
    KernelHandles[F->getName()] = F;
    KernelStubs[F] = F;
    return F;
  }

  auto *Var = new llvm::GlobalVariable(
      TheModule, F->getType(), /*isConstant=*/true, F->getLinkage(),
      /*Initializer=*/nullptr,
      CGM.getMangledName(
          GD.getWithKernelReferenceKind(KernelReferenceKind::Kernel)));
  Var->setAlignment(CGM.getPointerAlign().getAsAlign());
  Var->setDSOLocal(F->isDSOLocal());
  Var->setVisibility(F->getVisibility());
  auto *FD = cast<FunctionDecl>(GD.getDecl());
  auto *FT = FD->getPrimaryTemplate();
  if (!FT || FT->isThisDeclarationADefinition())
    CGM.maybeSetTrivialComdat(*FD, *Var);
  KernelHandles[F->getName()] = Var;
  KernelStubs[Var] = F;
  return Var;
}