22 #include "llvm/Frontend/Offloading/Utility.h"
23 #include "llvm/IR/BasicBlock.h"
24 #include "llvm/IR/Constants.h"
25 #include "llvm/IR/DerivedTypes.h"
26 #include "llvm/IR/ReplaceConstant.h"
27 #include "llvm/Support/Format.h"
28 #include "llvm/Support/VirtualFileSystem.h"
30 using namespace clang;
31 using namespace CodeGen;
// Magic number stored in the first field of the fatbin wrapper struct for
// CUDA compilations (see makeModuleCtorFunction).
34 constexpr
unsigned CudaFatMagic = 0x466243b1;
// Magic number stored in the same wrapper field for HIP compilations; its
// bytes spell "HIPF" in ASCII.
35 constexpr
unsigned HIPFatMagic = 0x48495046;
40 llvm::IntegerType *IntTy, *SizeTy;
42 llvm::PointerType *PtrTy;
45 llvm::LLVMContext &Context;
47 llvm::Module &TheModule;
57 llvm::DenseMap<StringRef, llvm::GlobalValue *> KernelHandles;
59 llvm::DenseMap<llvm::GlobalValue *, llvm::Function *> KernelStubs;
61 llvm::GlobalVariable *Var;
69 llvm::GlobalVariable *GpuBinaryHandle =
nullptr;
71 bool RelocatableDeviceCode;
73 std::unique_ptr<MangleContext> DeviceMC;
75 llvm::Constant *Zeros[2];
77 llvm::FunctionCallee getSetupArgumentFn()
const;
78 llvm::FunctionCallee getLaunchFn()
const;
80 llvm::FunctionType *getRegisterGlobalsFnTy()
const;
81 llvm::FunctionType *getCallbackFnTy()
const;
82 llvm::FunctionType *getRegisterLinkedBinaryFnTy()
const;
83 std::string addPrefixToName(StringRef FuncName)
const;
84 std::string addUnderscoredPrefixToName(StringRef FuncName)
const;
87 llvm::Function *makeRegisterGlobalsFn();
/// Emits \p Str as a constant C string through the CodeGenModule and returns
/// a constant GEP (indexed by Zeros) into that string. \p Name is passed on
/// as the name for the emitted constant and defaults to the empty string.
92 llvm::Constant *makeConstantString(
const std::string &Str,
93 const std::string &Name =
"") {
94 auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str());
95 return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(),
96 ConstStr.getPointer(), Zeros);
/// Wraps \p Str in a private-linkage global holding a ConstantDataArray and
/// returns a constant GEP (indexed by Zeros) into it.
///
/// \p SectionName, when non-empty, is applied to the global via setSection.
/// \p Alignment is applied via setAlignment. \p AddNull is forwarded to
/// ConstantDataArray::getString.
/// NOTE(review): some lines of this helper (including the \p Name parameter
/// declaration and whatever guards the alignment call) are not visible in
/// this excerpt; confirm against the full source.
102 llvm::Constant *makeConstantArray(StringRef Str,
104 StringRef SectionName =
"",
105 unsigned Alignment = 0,
106 bool AddNull =
false) {
107 llvm::Constant *
Value =
108 llvm::ConstantDataArray::getString(Context, Str, AddNull);
109 auto *GV =
new llvm::GlobalVariable(
111 llvm::GlobalValue::PrivateLinkage,
Value, Name);
112 if (!SectionName.empty()) {
113 GV->setSection(SectionName);
119 GV->setAlignment(llvm::Align(Alignment));
120 return llvm::ConstantExpr::getGetElementPtr(GV->getValueType(), GV, Zeros);
/// Creates an internal-linkage function named "dummy" of type \p FnTy in the
/// module, whose single basic block contains only a `ret void`.
/// \p FnTy must return void (asserted).
124 llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
125 assert(FnTy->getReturnType()->isVoidTy() &&
126 "Can only generate dummy functions returning void!");
127 llvm::Function *DummyFunc = llvm::Function::Create(
128 FnTy, llvm::GlobalValue::InternalLinkage,
"dummy", &TheModule);
130 llvm::BasicBlock *DummyBlock =
131 llvm::BasicBlock::Create(Context,
"", DummyFunc);
133 FuncBuilder.SetInsertPoint(DummyBlock);
134 FuncBuilder.CreateRetVoid();
141 std::string getDeviceSideName(
const NamedDecl *ND)
override;
/// Appends an entry for \p Var to DeviceVars.
/// NOTE(review): the remaining fields of the pushed entry are not visible in
/// this excerpt; presumably they carry \p VD and flags built from \p Extern
/// and \p Constant — confirm against the full source.
143 void registerDeviceVar(
const VarDecl *VD, llvm::GlobalVariable &Var,
144 bool Extern,
bool Constant) {
145 DeviceVars.push_back({&Var,
/// Appends an entry for \p Var to DeviceVars, flagged as
/// DeviceVarFlags::Surface with the given \p Extern setting.
/// NOTE(review): the tail of the flag initializer is not visible in this
/// excerpt; presumably \p Type is stored as the surface type — confirm.
151 void registerDeviceSurf(
const VarDecl *VD, llvm::GlobalVariable &Var,
152 bool Extern,
int Type) {
153 DeviceVars.push_back({&Var,
155 {DeviceVarFlags::Surface, Extern,
false,
/// Appends an entry for \p Var to DeviceVars, flagged as
/// DeviceVarFlags::Texture. The flags are initialized from \p Extern, two
/// literal `false` values, \p Normalized and \p Type, in that order.
159 void registerDeviceTex(
const VarDecl *VD, llvm::GlobalVariable &Var,
160 bool Extern,
int Type,
bool Normalized) {
161 DeviceVars.push_back({&Var,
163 {DeviceVarFlags::Texture, Extern,
false,
164 false, Normalized,
Type}});
168 llvm::Function *makeModuleCtorFunction();
170 llvm::Function *makeModuleDtorFunction();
172 void transformManagedVars();
174 void createOffloadingEntries();
179 llvm::GlobalValue *getKernelHandle(llvm::Function *F,
GlobalDecl GD)
override;
/// Looks up \p Handle in the KernelStubs map. The handle is asserted to be
/// present.
180 llvm::Function *getKernelStub(llvm::GlobalValue *Handle)
override {
181 auto Loc = KernelStubs.find(Handle);
182 assert(
Loc != KernelStubs.end());
186 void handleVarRegistration(
const VarDecl *VD,
187 llvm::GlobalVariable &Var)
override;
189 internalizeDeviceSideVar(
const VarDecl *D,
190 llvm::GlobalValue::LinkageTypes &
Linkage)
override;
192 llvm::Function *finalizeModule()
override;
/// Returns \p FuncName prefixed with "hip" when compiling HIP, and with
/// "cuda" otherwise.
197 std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName)
const {
198 if (CGM.getLangOpts().HIP)
199 return ((Twine(
"hip") + Twine(FuncName)).str());
200 return ((Twine(
"cuda") + Twine(FuncName)).str());
/// Returns \p FuncName prefixed with "__hip" when compiling HIP, and with
/// "__cuda" otherwise.
203 CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName)
const {
204 if (CGM.getLangOpts().HIP)
205 return ((Twine(
"__hip") + Twine(FuncName)).str());
206 return ((Twine(
"__cuda") + Twine(FuncName)).str());
216 return std::unique_ptr<MangleContext>(
227 TheModule(CGM.getModule()),
228 RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
233 Zeros[0] = llvm::ConstantInt::get(SizeTy, 0);
/// Returns the runtime function `{cuda,hip}SetupArgument`, declared as a
/// non-variadic function taking (PtrTy, SizeTy, SizeTy) and returning IntTy.
238 llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn()
const {
240 llvm::Type *Params[] = {PtrTy, SizeTy, SizeTy};
241 return CGM.CreateRuntimeFunction(
242 llvm::FunctionType::get(IntTy, Params,
false),
243 addPrefixToName(
"SetupArgument"));
/// Returns the runtime launch function, declared as a non-variadic function
/// taking a single PtrTy and returning IntTy. For HIP this is
/// "hipLaunchByPtr".
/// NOTE(review): the name used on the non-HIP path is on a line not visible
/// in this excerpt.
246 llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn()
const {
247 if (CGM.getLangOpts().HIP) {
249 return CGM.CreateRuntimeFunction(
250 llvm::FunctionType::get(IntTy, PtrTy,
false),
"hipLaunchByPtr");
253 return CGM.CreateRuntimeFunction(llvm::FunctionType::get(IntTy, PtrTy,
false),
/// Returns the type of the register-globals function: a non-variadic
/// function taking a single PtrTy and returning VoidTy.
257 llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy()
const {
258 return llvm::FunctionType::get(VoidTy, PtrTy,
false);
/// Returns the type of the callback passed to the linked-binary registration
/// function: a non-variadic function taking a single PtrTy and returning
/// VoidTy.
261 llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy()
const {
262 return llvm::FunctionType::get(VoidTy, PtrTy,
false);
/// Returns the type of the linked-binary registration function: a
/// non-variadic function returning VoidTy whose parameters are, in order, a
/// pointer to a register-globals function, two PtrTy values, and a pointer
/// to a callback function.
265 llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy()
const {
266 auto *CallbackFnTy = getCallbackFnTy();
267 auto *RegisterGlobalsFnTy = getRegisterGlobalsFnTy();
268 llvm::Type *Params[] = {RegisterGlobalsFnTy->getPointerTo(), PtrTy,
269 PtrTy, CallbackFnTy->getPointerTo()};
270 return llvm::FunctionType::get(VoidTy, Params,
false);
/// Computes the device-side name for \p ND and returns it as a std::string.
///
/// Function declarations are wrapped in a GlobalDecl with
/// KernelReferenceKind::Kernel. When the declaration should be externalized
/// and relocatable device code is enabled, the postfix produced by
/// printPostfixForExternalizedDecl is appended to the name.
/// NOTE(review): several lines (choice of mangle context on the host side,
/// the mangling call itself) are not visible in this excerpt.
273 std::string CGNVCUDARuntime::getDeviceSideName(
const NamedDecl *ND) {
276 if (
auto *FD = dyn_cast<FunctionDecl>(ND))
277 GD =
GlobalDecl(FD, KernelReferenceKind::Kernel);
280 std::string DeviceSideName;
// On the device side, the regular C++ ABI mangle context is used.
282 if (CGM.getLangOpts().CUDAIsDevice)
283 MC = &CGM.getCXXABI().getMangleContext();
288 llvm::raw_svector_ostream Out(Buffer);
290 DeviceSideName = std::string(Out.str());
// Externalized declarations in RDC mode get an additional postfix.
295 if (CGM.getContext().shouldExternalize(ND) &&
296 CGM.getLangOpts().GPURelocatableDeviceCode) {
298 llvm::raw_svector_ostream Out(Buffer);
299 Out << DeviceSideName;
300 CGM.printPostfixForExternalizedDecl(Out, ND);
301 DeviceSideName = std::string(Out.str());
303 return DeviceSideName;
310 dyn_cast<llvm::GlobalVariable>(KernelHandles[CGF.
CurFn->getName()])) {
311 GV->setLinkage(CGF.
CurFn->getLinkage());
312 GV->setInitializer(CGF.
CurFn);
315 CudaFeature::CUDA_USES_NEW_LAUNCH) ||
317 emitDeviceStubBodyNew(CGF, Args);
319 emitDeviceStubBodyLegacy(CGF, Args);
332 PtrTy, CharUnits::fromQuantity(16),
"kernel_args",
333 llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
335 for (
unsigned i = 0; i < Args.size(); ++i) {
337 llvm::Value *VoidVarPtr = CGF.
Builder.CreatePointerCast(VarPtr, PtrTy);
339 VoidVarPtr, CGF.
Builder.CreateConstGEP1_32(
356 DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
357 std::string KernelLaunchAPI =
"LaunchKernel";
359 LangOptions::GPUDefaultStreamKind::PerThread) {
361 KernelLaunchAPI = KernelLaunchAPI +
"_spt";
363 KernelLaunchAPI = KernelLaunchAPI +
"_ptsz";
365 auto LaunchKernelName = addPrefixToName(KernelLaunchAPI);
367 CGM.getContext().Idents.get(LaunchKernelName);
369 for (
auto *Result : DC->
lookup(&cudaLaunchKernelII)) {
371 cudaLaunchKernelFD = FD;
374 if (cudaLaunchKernelFD ==
nullptr) {
376 "Can't find declaration for " + LaunchKernelName);
383 CGF.
CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8),
"grid_dim");
385 CGF.
CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8),
"block_dim");
389 llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction(
390 llvm::FunctionType::get(IntTy,
396 addUnderscoredPrefixToName(
"PopCallConfiguration"));
405 CGF.
Builder.CreatePointerCast(KernelHandles[CGF.
CurFn->getName()], PtrTy);
407 LaunchKernelArgs.
add(RValue::get(Kernel),
409 LaunchKernelArgs.
add(RValue::getAggregate(GridDim), Dim3Ty);
410 LaunchKernelArgs.
add(RValue::getAggregate(BlockDim), Dim3Ty);
411 LaunchKernelArgs.
add(RValue::get(KernelArgs, CGF),
420 llvm::Type *Ty = CGM.getTypes().ConvertType(CQT);
421 llvm::FunctionType *FTy = cast<llvm::FunctionType>(Ty);
424 CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
425 llvm::FunctionCallee cudaLaunchKernelFn =
426 CGM.CreateRuntimeFunction(FTy, LaunchKernelName);
433 if (CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() &&
435 llvm::Function *KernelFunction = llvm::cast<llvm::Function>(Kernel);
436 std::string GlobalVarName = (KernelFunction->getName() +
".id").str();
438 llvm::GlobalVariable *HandleVar =
439 CGM.getModule().getNamedGlobal(GlobalVarName);
441 HandleVar =
new llvm::GlobalVariable(
442 CGM.getModule(), CGM.Int8Ty,
443 false, KernelFunction->getLinkage(),
444 llvm::ConstantInt::get(CGM.Int8Ty, 0), GlobalVarName);
445 HandleVar->setDSOLocal(KernelFunction->isDSOLocal());
446 HandleVar->setVisibility(KernelFunction->getVisibility());
447 if (KernelFunction->hasComdat())
448 HandleVar->setComdat(CGM.getModule().getOrInsertComdat(GlobalVarName));
452 HandleVar, CharUnits::One(),
464 llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
467 for (
const VarDecl *A : Args) {
468 auto TInfo = CGM.getContext().getTypeInfoInChars(A->getType());
470 llvm::Value *Args[] = {
473 llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()),
474 llvm::ConstantInt::get(SizeTy,
Offset.getQuantity()),
477 llvm::Constant *
Zero = llvm::ConstantInt::get(IntTy, 0);
478 llvm::Value *CBZero = CGF.
Builder.CreateICmpEQ(CB,
Zero);
480 CGF.
Builder.CreateCondBr(CBZero, NextBlock, EndBlock);
486 llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
488 CGF.
Builder.CreatePointerCast(KernelHandles[CGF.
CurFn->getName()], PtrTy);
498 llvm::GlobalVariable *ManagedVar) {
500 for (
auto &&VarUse : Var->uses()) {
501 WorkList.push_back({VarUse.getUser()});
503 while (!WorkList.empty()) {
504 auto &&WorkItem = WorkList.pop_back_val();
505 auto *
U = WorkItem.back();
506 if (isa<llvm::ConstantExpr>(
U)) {
507 for (
auto &&UU :
U->uses()) {
508 WorkItem.push_back(UU.getUser());
509 WorkList.push_back(WorkItem);
514 if (
auto *I = dyn_cast<llvm::Instruction>(
U)) {
515 llvm::Value *OldV = Var;
516 llvm::Instruction *NewV =
517 new llvm::LoadInst(Var->getType(), ManagedVar,
"ld.managed",
false,
518 llvm::Align(Var->getAlignment()), I);
522 for (
auto &&Op : WorkItem) {
523 auto *CE = cast<llvm::ConstantExpr>(Op);
524 auto *NewInst = CE->getAsInstruction();
525 NewInst->insertBefore(*I->getParent(), I->getIterator());
526 NewInst->replaceUsesOfWith(OldV, NewV);
530 I->replaceUsesOfWith(OldV, NewV);
532 llvm_unreachable(
"Invalid use of managed variable");
/// Builds the internal function `__{cuda,hip}_register_globals`, which emits
/// one runtime registration call per emitted kernel and per recorded device
/// variable, and returns that function.
///
/// The generated function has the type returned by getRegisterGlobalsFnTy();
/// its first argument is passed through as the GPU binary handle to the
/// surface/texture registration calls.
/// NOTE(review): the early-exit taken when there are neither kernels nor
/// device variables is on a line not visible here; callers test the result
/// for null, so it presumably returns nullptr — confirm.
551 llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
553 if (EmittedKernels.empty() && DeviceVars.empty())
556 llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
557 getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
558 addUnderscoredPrefixToName(
"_register_globals"), &TheModule);
559 llvm::BasicBlock *EntryBB =
560 llvm::BasicBlock::Create(Context,
"entry", RegisterKernelsFunc);
562 Builder.SetInsertPoint(EntryBB);
// Declare the runtime's __{cuda,hip}RegisterFunction entry point.
566 llvm::Type *RegisterFuncParams[] = {
567 PtrTy, PtrTy, PtrTy, PtrTy, IntTy,
568 PtrTy, PtrTy, PtrTy, PtrTy, IntTy->getPointerTo()};
569 llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction(
570 llvm::FunctionType::get(IntTy, RegisterFuncParams,
false),
571 addUnderscoredPrefixToName(
"RegisterFunction"));
// Emit one RegisterFunction call per emitted kernel.
576 llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
577 for (
auto &&I : EmittedKernels) {
578 llvm::Constant *KernelName =
579 makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D)));
580 llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(PtrTy);
581 llvm::Value *Args[] = {
583 KernelHandles[I.Kernel->getName()],
586 llvm::ConstantInt::get(IntTy, -1),
591 llvm::ConstantPointerNull::get(IntTy->getPointerTo())};
592 Builder.CreateCall(RegisterFunc, Args);
// The variable-size parameter type starts as IntTy; the HIP / CUDA >= 9.0
// check below selects a different type on a line not visible here.
595 llvm::Type *VarSizeTy = IntTy;
597 if (CGM.getLangOpts().HIP ||
598 ToCudaVersion(CGM.getTarget().getSDKVersion()) >= CudaVersion::CUDA_90)
// Declare the variable, managed-variable, surface and texture registration
// entry points.
603 llvm::Type *RegisterVarParams[] = {PtrTy, PtrTy, PtrTy, PtrTy,
604 IntTy, VarSizeTy, IntTy, IntTy};
605 llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction(
606 llvm::FunctionType::get(VoidTy, RegisterVarParams,
false),
607 addUnderscoredPrefixToName(
"RegisterVar"));
610 llvm::Type *RegisterManagedVarParams[] = {PtrTy, PtrTy, PtrTy,
611 PtrTy, VarSizeTy, IntTy};
612 llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction(
613 llvm::FunctionType::get(VoidTy, RegisterManagedVarParams,
false),
614 addUnderscoredPrefixToName(
"RegisterManagedVar"));
617 llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction(
618 llvm::FunctionType::get(
619 VoidTy, {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy},
false),
620 addUnderscoredPrefixToName(
"RegisterSurface"));
623 llvm::FunctionCallee RegisterTex = CGM.CreateRuntimeFunction(
624 llvm::FunctionType::get(
625 VoidTy, {PtrTy, PtrTy, PtrTy, PtrTy, IntTy, IntTy, IntTy},
false),
626 addUnderscoredPrefixToName(
"RegisterTexture"));
// Emit the registration call matching each recorded device variable's kind.
627 for (
auto &&Info : DeviceVars) {
628 llvm::GlobalVariable *Var = Info.Var;
629 assert((!Var->isDeclaration() || Info.Flags.isManaged()) &&
630 "External variables should not show up here, except HIP managed "
632 llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D));
633 switch (Info.Flags.getKind()) {
636 CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
637 if (Info.Flags.isManaged()) {
// The companion global is found by stripping the ".managed" suffix from the
// variable's name.
638 assert(Var->getName().ends_with(
".managed") &&
639 "HIP managed variables not transformed");
640 auto *ManagedVar = CGM.getModule().getNamedGlobal(
641 Var->getName().drop_back(StringRef(
".managed").size()));
642 llvm::Value *Args[] = {
647 llvm::ConstantInt::get(VarSizeTy, VarSize),
648 llvm::ConstantInt::get(IntTy, Var->getAlignment())};
// Managed variables that are only declared here are not registered.
649 if (!Var->isDeclaration())
650 Builder.CreateCall(RegisterManagedVar, Args);
652 llvm::Value *Args[] = {
657 llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
658 llvm::ConstantInt::get(VarSizeTy, VarSize),
659 llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()),
660 llvm::ConstantInt::get(IntTy, 0)};
661 Builder.CreateCall(RegisterVar, Args);
665 case DeviceVarFlags::Surface:
668 {&GpuBinaryHandlePtr, Var, VarName, VarName,
669 llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
670 llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
672 case DeviceVarFlags::Texture:
675 {&GpuBinaryHandlePtr, Var, VarName, VarName,
676 llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
677 llvm::ConstantInt::get(IntTy, Info.Flags.isNormalized()),
678 llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
683 Builder.CreateRetVoid();
684 return RegisterKernelsFunc;
/// Builds the internal module constructor `__{cuda,hip}_module_ctor` and
/// returns it.
///
/// The generated constructor registers the GPU fat binary with the runtime
/// through `__{cuda,hip}RegisterFatBinary`, passing a fatbin wrapper global,
/// calls the register-globals function when one exists, and, if
/// makeModuleDtorFunction() yields a function, registers it with `atexit`.
/// NOTE(review): many lines of this function (early returns, the branch
/// conditions separating the HIP, non-RDC CUDA and RDC paths) are not visible
/// in this excerpt; the inline notes below only describe visible statements.
706 llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
707 bool IsHIP = CGM.getLangOpts().HIP;
708 bool IsCUDA = CGM.getLangOpts().CUDA;
710 StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
711 if (CudaGpuBinaryFileName.empty() && !IsHIP)
713 if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() &&
718 llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
// In RDC mode a register-globals function is always passed along, so fall
// back to an empty dummy when there is nothing to register.
721 if (RelocatableDeviceCode && !RegisterGlobalsFunc)
722 RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());
725 llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction(
726 llvm::FunctionType::get(PtrTy, PtrTy,
false),
727 addUnderscoredPrefixToName(
"RegisterFatBinary"));
// Wrapper layout, as filled in further down: magic number, the constant 1,
// pointer to the fatbin data, and a null pointer.
729 llvm::StructType *FatbinWrapperTy =
730 llvm::StructType::get(IntTy, IntTy, PtrTy, PtrTy);
// Load the GPU binary from the virtual file system when a file name was
// given; a read failure is reported as err_cannot_open_file.
736 std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary =
nullptr;
737 if (!CudaGpuBinaryFileName.empty()) {
738 auto VFS = CGM.getFileSystem();
739 auto CudaGpuBinaryOrErr =
740 VFS->getBufferForFile(CudaGpuBinaryFileName, -1,
false);
741 if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
742 CGM.getDiags().Report(diag::err_cannot_open_file)
743 << CudaGpuBinaryFileName << EC.message();
746 CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
749 llvm::Function *ModuleCtorFunc = llvm::Function::Create(
750 llvm::FunctionType::get(VoidTy,
false),
751 llvm::GlobalValue::InternalLinkage,
752 addUnderscoredPrefixToName(
"_module_ctor"), &TheModule);
753 llvm::BasicBlock *CtorEntryBB =
754 llvm::BasicBlock::Create(Context,
"entry", ModuleCtorFunc);
757 CtorBuilder.SetInsertPoint(CtorEntryBB);
759 const char *FatbinConstantName;
760 const char *FatbinSectionName;
761 const char *ModuleIDSectionName;
762 StringRef ModuleIDPrefix;
763 llvm::Constant *FatBinStr;
// HIP section names and fatbin data.
766 FatbinConstantName =
".hip_fatbin";
767 FatbinSectionName =
".hipFatBinSegment";
769 ModuleIDSectionName =
"__hip_module_id";
770 ModuleIDPrefix =
"__hip_";
775 const unsigned HIPCodeObjectAlign = 4096;
776 FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()),
"",
777 FatbinConstantName, HIPCodeObjectAlign);
// Otherwise the HIP fatbin is referenced as an external constant global
// named "__hip_fatbin_" + CUID hash, with no initializer.
783 FatBinStr =
new llvm::GlobalVariable(
784 CGM.getModule(), CGM.Int8Ty,
785 true, llvm::GlobalValue::ExternalLinkage,
nullptr,
786 "__hip_fatbin_" + CGM.getContext().getCUIDHash(),
nullptr,
787 llvm::GlobalVariable::NotThreadLocal);
788 cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
791 FatMagic = HIPFatMagic;
// CUDA section names (macOS uses segment,section spellings) and fatbin data.
793 if (RelocatableDeviceCode)
794 FatbinConstantName = CGM.getTriple().isMacOSX()
795 ?
"__NV_CUDA,__nv_relfatbin"
799 CGM.getTriple().isMacOSX() ?
"__NV_CUDA,__nv_fatbin" :
".nv_fatbin";
802 CGM.getTriple().isMacOSX() ?
"__NV_CUDA,__fatbin" :
".nvFatBinSegment";
804 ModuleIDSectionName = CGM.getTriple().isMacOSX()
805 ?
"__NV_CUDA,__nv_module_id"
807 ModuleIDPrefix =
"__nv_";
811 FatBinStr = makeConstantArray(std::string(CudaGpuBinary->getBuffer()),
"",
812 FatbinConstantName, 8);
813 FatMagic = CudaFatMagic;
// Populate the fatbin wrapper global and place it in its section.
818 auto Values = Builder.beginStruct(FatbinWrapperTy);
820 Values.addInt(IntTy, FatMagic);
822 Values.addInt(IntTy, 1);
824 Values.add(FatBinStr);
826 Values.add(llvm::ConstantPointerNull::get(PtrTy));
827 llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
828 addUnderscoredPrefixToName(
"_fatbin_wrapper"), CGM.getPointerAlign(),
830 FatbinWrapper->setSection(FatbinSectionName);
// The handle global is internal when the GPU binary was loaded here and
// external otherwise.
840 auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage
841 : llvm::GlobalValue::ExternalLinkage;
842 llvm::BasicBlock *IfBlock =
843 llvm::BasicBlock::Create(Context,
"if", ModuleCtorFunc);
844 llvm::BasicBlock *ExitBlock =
845 llvm::BasicBlock::Create(Context,
"exit", ModuleCtorFunc);
848 GpuBinaryHandle =
new llvm::GlobalVariable(
849 TheModule, PtrTy,
false,
Linkage,
851 CudaGpuBinary ? llvm::ConstantPointerNull::get(PtrTy) :
nullptr,
853 ?
"__hip_gpubin_handle"
854 :
"__hip_gpubin_handle_" + CGM.getContext().getCUIDHash());
855 GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
857 if (
Linkage != llvm::GlobalValue::InternalLinkage)
860 GpuBinaryHandle, PtrTy,
861 CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
// Only register the fat binary when the stored handle is still null.
863 auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
864 llvm::Constant *
Zero =
865 llvm::Constant::getNullValue(HandleValue->getType());
866 llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue,
Zero);
867 CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
870 CtorBuilder.SetInsertPoint(IfBlock);
872 llvm::CallInst *RegisterFatbinCall =
873 CtorBuilder.CreateCall(RegisterFatbinFunc, FatbinWrapper);
874 CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
875 CtorBuilder.CreateBr(ExitBlock);
878 CtorBuilder.SetInsertPoint(ExitBlock);
// Reload the handle in the exit block and pass it to register-globals.
880 if (RegisterGlobalsFunc) {
881 auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
882 CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
885 }
else if (!RelocatableDeviceCode) {
// Non-RDC path: register the fat binary unconditionally and keep the handle
// in the internal global "__cuda_gpubin_handle".
889 llvm::CallInst *RegisterFatbinCall =
890 CtorBuilder.CreateCall(RegisterFatbinFunc, FatbinWrapper);
891 GpuBinaryHandle =
new llvm::GlobalVariable(
892 TheModule, PtrTy,
false, llvm::GlobalValue::InternalLinkage,
893 llvm::ConstantPointerNull::get(PtrTy),
"__cuda_gpubin_handle");
894 GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
895 CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
896 CGM.getPointerAlign());
899 if (RegisterGlobalsFunc)
900 CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);
// When the CUDA_USES_FATBIN_REGISTER_END feature applies, also call
// __cudaRegisterFatBinaryEnd with the handle.
904 CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
906 llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction(
907 llvm::FunctionType::get(VoidTy, PtrTy,
false),
908 "__cudaRegisterFatBinaryEnd");
909 CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
// Remaining path: build a module ID from the prefix and the wrapper's GUID
// (hex), then call the linked-binary registration function with the
// register-globals function, the wrapper, the module ID and a dummy callback.
914 llvm::raw_svector_ostream
OS(ModuleID);
915 OS << ModuleIDPrefix << llvm::format(
"%" PRIx64, FatbinWrapper->getGUID());
916 llvm::Constant *ModuleIDConstant = makeConstantArray(
917 std::string(ModuleID),
"", ModuleIDSectionName, 32,
true);
921 Twine(
"__fatbinwrap") + ModuleID, FatbinWrapper);
926 RegisterLinkedBinaryName += ModuleID;
927 llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
928 getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);
930 assert(RegisterGlobalsFunc &&
"Expecting at least dummy function!");
931 llvm::Value *Args[] = {RegisterGlobalsFunc, FatbinWrapper, ModuleIDConstant,
932 makeDummyFunction(getCallbackFnTy())};
933 CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
// If a module destructor was created, register it with atexit.
939 if (llvm::Function *CleanupFn = makeModuleDtorFunction()) {
941 llvm::FunctionType *AtExitTy =
942 llvm::FunctionType::get(IntTy, CleanupFn->getType(),
false);
943 llvm::FunctionCallee AtExitFunc =
944 CGM.CreateRuntimeFunction(AtExitTy,
"atexit", llvm::AttributeList(),
946 CtorBuilder.CreateCall(AtExitFunc, CleanupFn);
949 CtorBuilder.CreateRetVoid();
950 return ModuleCtorFunc;
/// Builds the internal module destructor `__{cuda,hip}_module_dtor` and
/// returns it.
///
/// The generated function loads the value stored in GpuBinaryHandle and
/// passes it to `__{cuda,hip}UnregisterFatBinary`. For HIP the call is
/// guarded by a non-null check on the loaded handle, and the stored handle is
/// reset to null afterwards.
/// NOTE(review): the action taken when GpuBinaryHandle is null is on a line
/// not visible here; the caller tests the result for null, so it presumably
/// returns nullptr — confirm.
972 llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
974 if (!GpuBinaryHandle)
978 llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
979 llvm::FunctionType::get(VoidTy, PtrTy,
false),
980 addUnderscoredPrefixToName(
"UnregisterFatBinary"));
982 llvm::Function *ModuleDtorFunc = llvm::Function::Create(
983 llvm::FunctionType::get(VoidTy,
false),
984 llvm::GlobalValue::InternalLinkage,
985 addUnderscoredPrefixToName(
"_module_dtor"), &TheModule);
987 llvm::BasicBlock *DtorEntryBB =
988 llvm::BasicBlock::Create(Context,
"entry", ModuleDtorFunc);
990 DtorBuilder.SetInsertPoint(DtorEntryBB);
993 GpuBinaryHandle, GpuBinaryHandle->getValueType(),
994 CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
995 auto *HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
// HIP: unregister only when the handle is non-null, then clear it.
999 if (CGM.getLangOpts().HIP) {
1000 llvm::BasicBlock *IfBlock =
1001 llvm::BasicBlock::Create(Context,
"if", ModuleDtorFunc);
1002 llvm::BasicBlock *ExitBlock =
1003 llvm::BasicBlock::Create(Context,
"exit", ModuleDtorFunc);
1004 llvm::Constant *
Zero = llvm::Constant::getNullValue(HandleValue->getType());
1005 llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue,
Zero);
1006 DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);
1008 DtorBuilder.SetInsertPoint(IfBlock);
1009 DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
1010 DtorBuilder.CreateStore(
Zero, GpuBinaryAddr);
1011 DtorBuilder.CreateBr(ExitBlock);
1013 DtorBuilder.SetInsertPoint(ExitBlock);
// Non-HIP: unregister without a null check.
1015 DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
1017 DtorBuilder.CreateRetVoid();
1018 return ModuleDtorFunc;
1022 return new CGNVCUDARuntime(CGM);
/// May change \p Linkage to InternalLinkage for a device-side variable.
///
/// The visible condition tests for the CUDADeviceAttr, CUDAConstantAttr and
/// CUDASharedAttr attributes on the declaration; relocatable device code is
/// checked beforehand.
/// NOTE(review): the parameter list, the consequence of the RDC check and the
/// remainder of the attribute condition are not visible in this excerpt.
1025 void CGNVCUDARuntime::internalizeDeviceSideVar(
1034 if (CGM.getLangOpts().GPURelocatableDeviceCode)
1042 if (D->
hasAttr<CUDADeviceAttr>() || D->
hasAttr<CUDAConstantAttr>() ||
1043 D->
hasAttr<CUDASharedAttr>() ||
1046 Linkage = llvm::GlobalValue::InternalLinkage;
/// Records the global \p GV emitted for declaration \p D so it can later be
/// registered with the runtime.
///
/// Variables with CUDADeviceAttr or CUDAConstantAttr take the first branch.
/// Builtin surface types are recorded via registerDeviceSurf and the
/// remaining builtin type via registerDeviceTex, with the type and
/// normalization values read from the class template specialization's
/// template arguments.
/// NOTE(review): parts of the conditions and the registerDeviceVar call are
/// on lines not visible in this excerpt.
1050 void CGNVCUDARuntime::handleVarRegistration(
const VarDecl *D,
1051 llvm::GlobalVariable &GV) {
1052 if (D->
hasAttr<CUDADeviceAttr>() || D->
hasAttr<CUDAConstantAttr>()) {
1067 CGM.getContext().CUDADeviceVarODRUsedByHost.contains(D) ||
1068 D->
hasAttr<HIPManagedAttr>()) {
1070 D->
hasAttr<CUDAConstantAttr>());
1076 const auto *TD = cast<ClassTemplateSpecializationDecl>(
// Surface types carry two template arguments; the second is the surface type.
1079 if (TD->hasAttr<CUDADeviceBuiltinSurfaceTypeAttr>()) {
1080 assert(Args.
size() == 2 &&
1081 "Unexpected number of template arguments of CUDA device "
1082 "builtin surface type.");
1083 auto SurfType = Args[1].getAsIntegral();
1085 registerDeviceSurf(D, GV, !D->
hasDefinition(), SurfType.getSExtValue());
// Texture types carry three template arguments; the second is the texture
// type and the third the normalization value.
1087 assert(Args.
size() == 3 &&
1088 "Unexpected number of template arguments of CUDA device "
1089 "builtin texture type.");
1090 auto TexType = Args[1].getAsIntegral();
1091 auto Normalized = Args[2].getAsIntegral();
1093 registerDeviceTex(D, GV, !D->
hasDefinition(), TexType.getSExtValue(),
1094 Normalized.getZExtValue());
/// For each managed entry in DeviceVars, creates a companion global whose
/// value type is the original variable's pointer type.
///
/// The new global copies the original's linkage, DSO-locality and
/// visibility, is marked externally initialized, and takes over the
/// original's name; the original is renamed to that name plus ".managed".
/// On device compilations, defined variables and their companions are added
/// to the compiler-used list.
/// NOTE(review): part of the selection condition, the initializer used for
/// declarations and the name argument are on lines not visible here.
1103 void CGNVCUDARuntime::transformManagedVars() {
1104 for (
auto &&Info : DeviceVars) {
1105 llvm::GlobalVariable *Var = Info.Var;
1107 Info.Flags.isManaged()) {
1108 auto *ManagedVar =
new llvm::GlobalVariable(
1109 CGM.getModule(), Var->getType(),
1110 false, Var->getLinkage(),
1111 Var->isDeclaration()
1113 : llvm::ConstantPointerNull::get(Var->getType()),
1115 llvm::GlobalVariable::NotThreadLocal,
1116 CGM.getContext().getTargetAddressSpace(CGM.getLangOpts().CUDAIsDevice
1117 ? LangAS::cuda_device
1118 : LangAS::Default));
1119 ManagedVar->setDSOLocal(Var->isDSOLocal());
1120 ManagedVar->setVisibility(Var->getVisibility());
1121 ManagedVar->setExternallyInitialized(
true);
// Swap names: the companion takes the original name, the original gets the
// ".managed" suffix.
1123 ManagedVar->takeName(Var);
1124 Var->setName(Twine(ManagedVar->getName()) +
".managed");
1127 if (CGM.getLangOpts().CUDAIsDevice && !Var->isDeclaration()) {
1128 assert(!ManagedVar->isDeclaration());
1129 CGM.addCompilerUsedGlobal(Var);
1130 CGM.addCompilerUsedGlobal(ManagedVar);
/// Emits offloading entries for every emitted kernel and every recorded
/// device variable into the section "hip_offloading_entries" (HIP) or
/// "cuda_offloading_entries" (otherwise).
///
/// Kernels are emitted as OffloadGlobalEntry with zero size and data.
/// Variables are emitted with their alloc size and a kind-specific entry
/// type (managed or plain global entry, surface entry, texture entry)
/// combined with flag bits for extern, constant and normalized.
/// NOTE(review): the declaration of the Flags value and the condition
/// opening the first variable branch are partly on lines not visible here.
1139 void CGNVCUDARuntime::createOffloadingEntries() {
1140 StringRef Section = CGM.getLangOpts().HIP ?
"hip_offloading_entries"
1141 :
"cuda_offloading_entries";
1142 llvm::Module &M = CGM.getModule();
// One entry per kernel, keyed by its handle and device-side name.
1143 for (KernelInfo &I : EmittedKernels)
1144 llvm::offloading::emitOffloadingEntry(
1145 M, KernelHandles[I.Kernel->getName()],
1146 getDeviceSideName(cast<NamedDecl>(I.D)), 0, 0,
1147 llvm::offloading::OffloadGlobalEntry, Section);
// One entry per device variable, shaped by the variable's kind.
1149 for (VarInfo &I : DeviceVars) {
1151 CGM.getDataLayout().getTypeAllocSize(I.Var->getValueType());
1154 ?
static_cast<int32_t
>(llvm::offloading::OffloadGlobalExtern)
1156 (I.Flags.isConstant()
1157 ?
static_cast<int32_t
>(llvm::offloading::OffloadGlobalConstant)
1159 (I.Flags.isNormalized()
1160 ?
static_cast<int32_t
>(llvm::offloading::OffloadGlobalNormalized)
1163 llvm::offloading::emitOffloadingEntry(
1164 M, I.Var, getDeviceSideName(I.D), VarSize,
1165 (I.Flags.isManaged() ? llvm::offloading::OffloadGlobalManagedEntry
1166 : llvm::offloading::OffloadGlobalEntry) |
1169 }
else if (I.Flags.getKind() == DeviceVarFlags::Surface) {
1170 llvm::offloading::emitOffloadingEntry(
1171 M, I.Var, getDeviceSideName(I.D), VarSize,
1172 llvm::offloading::OffloadGlobalSurfaceEntry | Flags,
1173 I.Flags.getSurfTexType(), Section);
1174 }
else if (I.Flags.getKind() == DeviceVarFlags::Texture) {
1175 llvm::offloading::emitOffloadingEntry(
1176 M, I.Var, getDeviceSideName(I.D), VarSize,
1177 llvm::offloading::OffloadGlobalTextureEntry | Flags,
1178 I.Flags.getSurfTexType(), Section);
/// Finishes CUDA/HIP-specific module processing.
///
/// Managed variables are transformed first. On device compilations, defined,
/// non-local-linkage device variables that are used and lack UsedAttr are
/// added to the compiler-used list. When the new offloading driver and
/// relocatable device code are both enabled, offloading entries are created;
/// the visible final return yields the result of makeModuleCtorFunction().
/// NOTE(review): the end of the device-side branch and part of the kind
/// condition are on lines not visible in this excerpt.
1184 llvm::Function *CGNVCUDARuntime::finalizeModule() {
1185 transformManagedVars();
1186 if (CGM.getLangOpts().CUDAIsDevice) {
1197 for (
auto &&Info : DeviceVars) {
1198 auto Kind = Info.Flags.getKind();
1199 if (!Info.Var->isDeclaration() &&
1200 !llvm::GlobalValue::isLocalLinkage(Info.Var->getLinkage()) &&
1202 Kind == DeviceVarFlags::Surface ||
1203 Kind == DeviceVarFlags::Texture) &&
1204 Info.D->isUsed() && !Info.D->hasAttr<UsedAttr>()) {
1205 CGM.addCompilerUsedGlobal(Info.Var);
1210 if (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode)
1211 createOffloadingEntries();
1213 return makeModuleCtorFunction();
/// Maintains the KernelHandles (name -> handle) and KernelStubs
/// (handle -> stub function) maps for kernel stub \p F.
///
/// An existing handle for F's name is looked up first; for HIP its stub
/// mapping is updated to \p F, and on the other visible path the old stub
/// mapping is erased. For non-HIP compilations the function itself is stored
/// as the handle. For HIP a new constant global of F's pointer type is
/// created with F's linkage, DSO-locality and visibility, and recorded in
/// both maps.
/// NOTE(review): the return statements, the second parameter line and the
/// new global's initializer/name are on lines not visible in this excerpt.
1218 llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F,
1220 auto Loc = KernelHandles.find(F->getName());
1221 if (
Loc != KernelHandles.end()) {
1222 auto OldHandle =
Loc->second;
1223 if (KernelStubs[OldHandle] == F)
1228 if (CGM.getLangOpts().HIP) {
1231 KernelStubs[OldHandle] = F;
1236 KernelStubs.erase(OldHandle);
// Non-HIP: the stub function doubles as its own handle.
1239 if (!CGM.getLangOpts().HIP) {
1240 KernelHandles[F->getName()] = F;
// HIP: create a separate global variable to serve as the handle.
1245 auto *Var =
new llvm::GlobalVariable(
1246 TheModule, F->getType(),
true, F->getLinkage(),
1250 Var->setAlignment(CGM.getPointerAlign().getAsAlign());
1251 Var->setDSOLocal(F->isDSOLocal());
1252 Var->setVisibility(F->getVisibility());
// Comdat is set unless the function comes from a primary template that is
// only declared.
1253 auto *FD = cast<FunctionDecl>(GD.
getDecl());
1254 auto *FT = FD->getPrimaryTemplate();
1255 if (!FT || FT->isThisDeclarationADefinition())
1256 CGM.maybeSetTrivialComdat(*FD, *Var);
1257 KernelHandles[F->getName()] = Var;
1258 KernelStubs[Var] = F;
static void replaceManagedVar(llvm::GlobalVariable *Var, llvm::GlobalVariable *ManagedVar)
static std::unique_ptr< MangleContext > InitDeviceMC(CodeGenModule &CGM)
MangleContext * createMangleContext(const TargetInfo *T=nullptr)
If T is null pointer, assume the target in ASTContext.
MangleContext * createDeviceMangleContext(const TargetInfo &T)
Creates a device mangle context to correctly mangle lambdas in a mixed architecture compile by settin...
const TargetInfo & getTargetInfo() const
const TargetInfo * getAuxTargetInfo() const
CharUnits - This is an opaque type for sizes expressed in character units.
Like RawAddress, an abstract representation of an aligned address, but the pointer contained in this ...
llvm::Value * emitRawPointer(CodeGenFunction &CGF) const
Return the pointer contained in this class after authenticating it and adding offset to it if necessa...
llvm::PointerType * getType() const
Return the type of the pointer value.
llvm::StoreInst * CreateAlignedStore(llvm::Value *Val, llvm::Value *Addr, CharUnits Align, bool IsVolatile=false)
llvm::StoreInst * CreateDefaultAlignedStore(llvm::Value *Val, llvm::Value *Addr, bool IsVolatile=false)
llvm::LoadInst * CreateLoad(Address Addr, const llvm::Twine &Name="")
CGFunctionInfo - Class to encapsulate the information about a function definition.
CallArgList - Type for representing both the value and type of arguments in a call.
void add(RValue rvalue, QualType type)
CodeGenFunction - This class organizes the per-function state that is used while generating LLVM code...
llvm::CallBase * EmitRuntimeCallOrInvoke(llvm::FunctionCallee callee, ArrayRef< llvm::Value * > args, const Twine &name="")
Emits a call or invoke instruction to the given runtime function.
llvm::BasicBlock * createBasicBlock(const Twine &name="", llvm::Function *parent=nullptr, llvm::BasicBlock *before=nullptr)
createBasicBlock - Create an LLVM basic block.
RValue EmitCall(const CGFunctionInfo &CallInfo, const CGCallee &Callee, ReturnValueSlot ReturnValue, const CallArgList &Args, llvm::CallBase **callOrInvoke, bool IsMustTail, SourceLocation Loc)
EmitCall - Generate a call of the given function, expecting the given result type,...
llvm::AllocaInst * CreateTempAlloca(llvm::Type *Ty, const Twine &Name="tmp", llvm::Value *ArraySize=nullptr)
CreateTempAlloca - This creates an alloca and inserts it into the entry block if ArraySize is nullptr...
const Decl * CurFuncDecl
CurFuncDecl - Holds the Decl for the current outermost non-closure context.
void EmitBranch(llvm::BasicBlock *Block)
EmitBranch - Emit a branch to the specified basic block from the current insert block,...
RawAddress CreateMemTemp(QualType T, const Twine &Name="tmp", RawAddress *Alloca=nullptr)
CreateMemTemp - Create a temporary memory object of the given type, with appropriate alignment and cas...
Address GetAddrOfLocalVar(const VarDecl *VD)
GetAddrOfLocalVar - Return the address of a local variable.
void EmitBlock(llvm::BasicBlock *BB, bool IsFinished=false)
EmitBlock - Emit the given block.
const LangOptions & getLangOpts() const
This class organizes the cross-function state that is used while generating LLVM code.
ASTContext & getContext() const
The standard implementation of ConstantInitBuilder used in Clang.
FunctionArgList - Type for representing both the decl and type of parameters to a function.
ReturnValueSlot - Contains the address where the return value of a function can be stored,...
DeclContext - This is used only as base class of specific decl types that can act as declaration cont...
lookup_result lookup(DeclarationName Name) const
lookup - Find the declarations (if any) with the given Name in this context.
Decl - This represents one declaration (or definition), e.g.
SourceLocation getLocation() const
TranslationUnitDecl * getTranslationUnitDecl()
Represents a function declaration or definition.
const ParmVarDecl * getParamDecl(unsigned i) const
GlobalDecl - represents a global declaration.
GlobalDecl getWithKernelReferenceKind(KernelReferenceKind Kind)
const Decl * getDecl() const
One of these records is kept for each identifier that is lexed.
StringRef getName() const
Return the actual identifier string.
GPUDefaultStreamKind GPUDefaultStream
The default stream kind used for HIP kernel launching.
MangleContext - Context for tracking state which persists across multiple calls to the C++ name mangl...
bool shouldMangleDeclName(const NamedDecl *D)
void mangleName(GlobalDecl GD, raw_ostream &)
This represents a decl that may have a name.
IdentifierInfo * getIdentifier() const
Get the identifier that names this declaration, if there is one.
Represents a parameter to a function.
A (possibly-)qualified type.
QualType getCanonicalType() const
A helper class that allows the use of isa/cast/dyncast to detect TagType objects of structs/unions/cl...
RecordDecl * getDecl() const
bool isMicrosoft() const
Is this ABI an MSVC-compatible ABI?
bool isItaniumFamily() const
Does this ABI generally fall into the Itanium family of ABIs?
TargetCXXABI getCXXABI() const
Get the C++ ABI currently in use.
A template argument list.
unsigned size() const
Retrieve the number of template arguments in this template argument list.
The top declaration context.
The base class of the type hierarchy.
const T * castAs() const
Member-template castAs<specific type>.
bool isCUDADeviceBuiltinSurfaceType() const
Check if the type is the CUDA device builtin surface type.
bool isCUDADeviceBuiltinTextureType() const
Check if the type is the CUDA device builtin texture type.
Represents a variable declaration or definition.
bool isInline() const
Whether this variable is (C++1z) inline.
bool hasExternalStorage() const
Returns true if a variable has extern or private_extern storage.
DefinitionKind hasDefinition(ASTContext &) const
Check whether this variable is defined in this translation unit.
CGCUDARuntime * CreateNVCUDARuntime(CodeGenModule &CGM)
Creates an instance of a CUDA runtime class.
constexpr XRayInstrMask None
bool Zero(InterpState &S, CodePtr OpPC)
std::unique_ptr< DiagnosticConsumer > create(StringRef OutputFile, DiagnosticOptions *Diags, bool MergeChildRecords=false)
Returns a DiagnosticConsumer that serializes diagnostics to a bitcode file.
The JSON file list parser is used to communicate input to InstallAPI.
CudaVersion ToCudaVersion(llvm::VersionTuple)
bool CudaFeatureEnabled(llvm::VersionTuple, CudaFeature)
Linkage
Describes the different kinds of linkage (C++ [basic.link], C99 6.2.2) that an entity may have.
@ HiddenVisibility
Objects with "hidden" visibility are not seen by the dynamic linker.
llvm::IntegerType * SizeTy
llvm::IntegerType * IntTy
int
llvm::PointerType * UnqualPtrTy