clang  19.0.0git
AMDGPU.cpp
1 //===- AMDGPU.cpp ---------------------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "ABIInfoImpl.h"
10 #include "TargetInfo.h"
11 #include "clang/Basic/TargetOptions.h"
12 
13 using namespace clang;
14 using namespace clang::CodeGen;
15 
16 //===----------------------------------------------------------------------===//
17 // AMDGPU ABI Implementation
18 //===----------------------------------------------------------------------===//
19 
20 namespace {
21 
22 class AMDGPUABIInfo final : public DefaultABIInfo {
23 private:
24  static const unsigned MaxNumRegsForArgsRet = 16;
25 
26  unsigned numRegsForType(QualType Ty) const;
27 
28  bool isHomogeneousAggregateBaseType(QualType Ty) const override;
29  bool isHomogeneousAggregateSmallEnough(const Type *Base,
30  uint64_t Members) const override;
31 
32  // Coerce HIP scalar pointer arguments from generic pointers to global ones.
33  llvm::Type *coerceKernelArgumentType(llvm::Type *Ty, unsigned FromAS,
34  unsigned ToAS) const {
35  // Single value types.
36  auto *PtrTy = llvm::dyn_cast<llvm::PointerType>(Ty);
37  if (PtrTy && PtrTy->getAddressSpace() == FromAS)
38  return llvm::PointerType::get(Ty->getContext(), ToAS);
39  return Ty;
40  }
41 
42 public:
43  explicit AMDGPUABIInfo(CodeGen::CodeGenTypes &CGT) :
44  DefaultABIInfo(CGT) {}
45 
46  ABIArgInfo classifyReturnType(QualType RetTy) const;
47  ABIArgInfo classifyKernelArgumentType(QualType Ty) const;
48  ABIArgInfo classifyArgumentType(QualType Ty, unsigned &NumRegsLeft) const;
49 
50  void computeInfo(CGFunctionInfo &FI) const override;
51  Address EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
52  QualType Ty) const override;
53 };
54 
55 bool AMDGPUABIInfo::isHomogeneousAggregateBaseType(QualType Ty) const {
56  return true;
57 }
58 
59 bool AMDGPUABIInfo::isHomogeneousAggregateSmallEnough(
60  const Type *Base, uint64_t Members) const {
61  uint32_t NumRegs = (getContext().getTypeSize(Base) + 31) / 32;
62 
63  // Homogeneous Aggregates may occupy at most 16 registers.
64  return Members * NumRegs <= MaxNumRegsForArgsRet;
65 }
66 
67 /// Estimate number of registers the type will use when passed in registers.
68 unsigned AMDGPUABIInfo::numRegsForType(QualType Ty) const {
69  unsigned NumRegs = 0;
70 
71  if (const VectorType *VT = Ty->getAs<VectorType>()) {
72  // Compute from the number of elements. The reported size is based on the
73  // in-memory size, which includes the padding 4th element for 3-vectors.
74  QualType EltTy = VT->getElementType();
75  unsigned EltSize = getContext().getTypeSize(EltTy);
76 
77  // 16-bit element vectors should be passed as packed.
78  if (EltSize == 16)
79  return (VT->getNumElements() + 1) / 2;
80 
81  unsigned EltNumRegs = (EltSize + 31) / 32;
82  return EltNumRegs * VT->getNumElements();
83  }
84 
85  if (const RecordType *RT = Ty->getAs<RecordType>()) {
86  const RecordDecl *RD = RT->getDecl();
87  assert(!RD->hasFlexibleArrayMember());
88 
89  for (const FieldDecl *Field : RD->fields()) {
90  QualType FieldTy = Field->getType();
91  NumRegs += numRegsForType(FieldTy);
92  }
93 
94  return NumRegs;
95  }
96 
97  return (getContext().getTypeSize(Ty) + 31) / 32;
98 }
99 
100 void AMDGPUABIInfo::computeInfo(CGFunctionInfo &FI) const {
101  llvm::CallingConv::ID CC = FI.getCallingConvention();
102 
103  if (!getCXXABI().classifyReturnType(FI))
104  FI.getReturnInfo() = classifyReturnType(FI.getReturnType());
105 
106  unsigned NumRegsLeft = MaxNumRegsForArgsRet;
107  for (auto &Arg : FI.arguments()) {
108  if (CC == llvm::CallingConv::AMDGPU_KERNEL) {
109  Arg.info = classifyKernelArgumentType(Arg.type);
110  } else {
111  Arg.info = classifyArgumentType(Arg.type, NumRegsLeft);
112  }
113  }
114 }
115 
116 Address AMDGPUABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
117  QualType Ty) const {
118  llvm_unreachable("AMDGPU does not support varargs");
119 }
120 
121 ABIArgInfo AMDGPUABIInfo::classifyReturnType(QualType RetTy) const {
122  if (isAggregateTypeForABI(RetTy)) {
123  // Records with non-trivial destructors/copy-constructors should not be
124  // returned by value.
125  if (!getRecordArgABI(RetTy, getCXXABI())) {
126  // Ignore empty structs/unions.
127  if (isEmptyRecord(getContext(), RetTy, true))
128  return ABIArgInfo::getIgnore();
129 
130  // Lower single-element structs to just return a regular value.
131  if (const Type *SeltTy = isSingleElementStruct(RetTy, getContext()))
132  return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));
133 
134  if (const RecordType *RT = RetTy->getAs<RecordType>()) {
135  const RecordDecl *RD = RT->getDecl();
136  if (RD->hasFlexibleArrayMember())
137  return DefaultABIInfo::classifyReturnType(RetTy);
138  }
139 
140  // Pack aggregates <= 8 bytes into single VGPR or pair.
141  uint64_t Size = getContext().getTypeSize(RetTy);
142  if (Size <= 16)
143  return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));
144 
145  if (Size <= 32)
146  return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));
147 
148  if (Size <= 64) {
149  llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
150  return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
151  }
152 
153  if (numRegsForType(RetTy) <= MaxNumRegsForArgsRet)
154  return ABIArgInfo::getDirect();
155  }
156  }
157 
158  // Otherwise just do the default thing.
159  return DefaultABIInfo::classifyReturnType(RetTy);
160 }
161 
162 /// For kernels all parameters are really passed in a special buffer. It doesn't
163 /// make sense to pass anything byval, so everything must be direct.
164 ABIArgInfo AMDGPUABIInfo::classifyKernelArgumentType(QualType Ty) const {
165  Ty = useFirstFieldIfTransparentUnion(Ty);
166 
167  // TODO: Can we omit empty structs?
168 
169  if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
170  Ty = QualType(SeltTy, 0);
171 
172  llvm::Type *OrigLTy = CGT.ConvertType(Ty);
173  llvm::Type *LTy = OrigLTy;
174  if (getContext().getLangOpts().HIP) {
175  LTy = coerceKernelArgumentType(
176  OrigLTy, /*FromAS=*/getContext().getTargetAddressSpace(LangAS::Default),
177  /*ToAS=*/getContext().getTargetAddressSpace(LangAS::cuda_device));
178  }
179 
180  // FIXME: Should also use this for OpenCL, but it requires addressing the
181  // problem of kernels being called.
182  //
183  // FIXME: This doesn't apply the optimization of coercing pointers in structs
184  // to global address space when using byref. This would require implementing a
185  // new kind of coercion of the in-memory type for indirect arguments.
186  if (!getContext().getLangOpts().OpenCL && LTy == OrigLTy &&
187  isAggregateTypeForABI(Ty)) {
188  return ABIArgInfo::getIndirectAliased(
189  getContext().getTypeAlignInChars(Ty),
190  getContext().getTargetAddressSpace(LangAS::opencl_constant),
191  false /*Realign*/, nullptr /*Padding*/);
192  }
193 
194  // If we set CanBeFlattened to true, CodeGen will expand the struct to its
195  // individual elements, which confuses the Clover OpenCL backend; therefore we
196  // have to set it to false here. Other args of getDirect() are just defaults.
197  return ABIArgInfo::getDirect(LTy, 0, nullptr, false);
198 }
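// --- Illustrative example (editor's addition, not part of AMDGPU.cpp) ---
// A minimal sketch of what the HIP coercion above does: a generic (addrspace 0)
// scalar pointer kernel argument is rewritten to a global (addrspace 1) pointer
// in the kernel's IR signature. The kernel name and parameter are hypothetical,
// and name mangling is omitted:
//
//   __global__ void scale(float *p, float s) { p[threadIdx.x] *= s; }
//
// is lowered roughly to (assuming the usual AMDGPU address-space mapping):
//
//   define amdgpu_kernel void @scale(ptr addrspace(1) %p, float %s)
//
// whereas the same parameter in a non-kernel device function keeps its generic
// pointer type.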
199 
200 ABIArgInfo AMDGPUABIInfo::classifyArgumentType(QualType Ty,
201  unsigned &NumRegsLeft) const {
202  assert(NumRegsLeft <= MaxNumRegsForArgsRet && "register estimate underflow");
203 
204  Ty = useFirstFieldIfTransparentUnion(Ty);
205 
206  if (isAggregateTypeForABI(Ty)) {
207  // Records with non-trivial destructors/copy-constructors should not be
208  // passed by value.
209  if (auto RAA = getRecordArgABI(Ty, getCXXABI()))
210  return getNaturalAlignIndirect(Ty, RAA == CGCXXABI::RAA_DirectInMemory);
211 
212  // Ignore empty structs/unions.
213  if (isEmptyRecord(getContext(), Ty, true))
214  return ABIArgInfo::getIgnore();
215 
216  // Lower single-element structs to just pass a regular value. TODO: We
217  // could do reasonable-size multiple-element structs too, using getExpand(),
218  // though watch out for things like bitfields.
219  if (const Type *SeltTy = isSingleElementStruct(Ty, getContext()))
220  return ABIArgInfo::getDirect(CGT.ConvertType(QualType(SeltTy, 0)));
221 
222  if (const RecordType *RT = Ty->getAs<RecordType>()) {
223  const RecordDecl *RD = RT->getDecl();
224  if (RD->hasFlexibleArrayMember())
225  return DefaultABIInfo::classifyArgumentType(Ty);
226  }
227 
228  // Pack aggregates <= 8 bytes into single VGPR or pair.
229  uint64_t Size = getContext().getTypeSize(Ty);
230  if (Size <= 64) {
231  unsigned NumRegs = (Size + 31) / 32;
232  NumRegsLeft -= std::min(NumRegsLeft, NumRegs);
233 
234  if (Size <= 16)
235  return ABIArgInfo::getDirect(llvm::Type::getInt16Ty(getVMContext()));
236 
237  if (Size <= 32)
238  return ABIArgInfo::getDirect(llvm::Type::getInt32Ty(getVMContext()));
239 
240  // XXX: Should this be i64 instead, and should the limit increase?
241  llvm::Type *I32Ty = llvm::Type::getInt32Ty(getVMContext());
242  return ABIArgInfo::getDirect(llvm::ArrayType::get(I32Ty, 2));
243  }
244 
245  if (NumRegsLeft > 0) {
246  unsigned NumRegs = numRegsForType(Ty);
247  if (NumRegsLeft >= NumRegs) {
248  NumRegsLeft -= NumRegs;
249  return ABIArgInfo::getDirect();
250  }
251  }
252 
253  // Use pass-by-reference instead of pass-by-value for struct arguments in
254  // function ABI.
255  return ABIArgInfo::getIndirectAliased(
256  getContext().getTypeAlignInChars(Ty),
257  getContext().getTargetAddressSpace(LangAS::opencl_private));
258  }
259 
260  // Otherwise just do the default thing.
261  ABIArgInfo ArgInfo = DefaultABIInfo::classifyArgumentType(Ty);
262  if (!ArgInfo.isIndirect()) {
263  unsigned NumRegs = numRegsForType(Ty);
264  NumRegsLeft -= std::min(NumRegs, NumRegsLeft);
265  }
266 
267  return ArgInfo;
268 }
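// --- Illustrative example (editor's addition, not part of AMDGPU.cpp) ---
// Sketch of the packing rules above, with hypothetical types: a two-float
// struct (8 bytes) is passed directly as [2 x i32]; a single-int struct is
// lowered to a plain i32 via the single-element-struct rule; a struct larger
// than MaxNumRegsForArgsRet 32-bit registers falls back to the byref
// (indirect-aliased) path in the private address space (typically
// addrspace(5) on amdgcn).
//
//   struct Pair { float x, y; };  // -> direct, coerced to [2 x i32]
//   struct One  { int v; };       // -> direct, i32
//   struct Big  { int a[32]; };   // -> indirect (byref), private addrspace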
269 
270 class AMDGPUTargetCodeGenInfo : public TargetCodeGenInfo {
271 public:
272  AMDGPUTargetCodeGenInfo(CodeGenTypes &CGT)
273  : TargetCodeGenInfo(std::make_unique<AMDGPUABIInfo>(CGT)) {}
274 
275  void setFunctionDeclAttributes(const FunctionDecl *FD, llvm::Function *F,
276  CodeGenModule &CGM) const;
277 
278  void emitTargetGlobals(CodeGen::CodeGenModule &CGM) const override;
279 
280  void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
281  CodeGen::CodeGenModule &M) const override;
282  unsigned getOpenCLKernelCallingConv() const override;
283 
284  llvm::Constant *getNullPointer(const CodeGen::CodeGenModule &CGM,
285  llvm::PointerType *T, QualType QT) const override;
286 
287  LangAS getASTAllocaAddressSpace() const override {
288  return getLangASFromTargetAS(
289  getABIInfo().getDataLayout().getAllocaAddrSpace());
290  }
291  LangAS getGlobalVarAddressSpace(CodeGenModule &CGM,
292  const VarDecl *D) const override;
293  llvm::SyncScope::ID getLLVMSyncScopeID(const LangOptions &LangOpts,
294  SyncScope Scope,
295  llvm::AtomicOrdering Ordering,
296  llvm::LLVMContext &Ctx) const override;
297  llvm::Value *createEnqueuedBlockKernel(CodeGenFunction &CGF,
298  llvm::Function *BlockInvokeFunc,
299  llvm::Type *BlockTy) const override;
300  bool shouldEmitStaticExternCAliases() const override;
301  bool shouldEmitDWARFBitFieldSeparators() const override;
302  void setCUDAKernelCallingConvention(const FunctionType *&FT) const override;
303 
304 private:
305  // Adds a NamedMDNode with GV, Name, and Operand as operands, and adds the
306  // resulting MDNode to the amdgcn.annotations MDNode.
307  static void addAMDGCNMetadata(llvm::GlobalValue *GV, StringRef Name,
308  int Operand);
309 };
310 }
311 
312 static bool requiresAMDGPUProtectedVisibility(const Decl *D,
313  llvm::GlobalValue *GV) {
314  if (GV->getVisibility() != llvm::GlobalValue::HiddenVisibility)
315  return false;
316 
317  return !D->hasAttr<OMPDeclareTargetDeclAttr>() &&
318  (D->hasAttr<OpenCLKernelAttr>() ||
319  (isa<FunctionDecl>(D) && D->hasAttr<CUDAGlobalAttr>()) ||
320  (isa<VarDecl>(D) &&
321  (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
322  cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinSurfaceType() ||
323  cast<VarDecl>(D)->getType()->isCUDADeviceBuiltinTextureType())));
324 }
325 
326 void AMDGPUTargetCodeGenInfo::setFunctionDeclAttributes(
327  const FunctionDecl *FD, llvm::Function *F, CodeGenModule &M) const {
328  const auto *ReqdWGS =
329  M.getLangOpts().OpenCL ? FD->getAttr<ReqdWorkGroupSizeAttr>() : nullptr;
330  const bool IsOpenCLKernel =
331  M.getLangOpts().OpenCL && FD->hasAttr<OpenCLKernelAttr>();
332  const bool IsHIPKernel = M.getLangOpts().HIP && FD->hasAttr<CUDAGlobalAttr>();
333 
334  const auto *FlatWGS = FD->getAttr<AMDGPUFlatWorkGroupSizeAttr>();
335  if (ReqdWGS || FlatWGS) {
336  M.handleAMDGPUFlatWorkGroupSizeAttr(F, FlatWGS, ReqdWGS);
337  } else if (IsOpenCLKernel || IsHIPKernel) {
338  // By default, restrict the maximum size to a value specified by
339  // --gpu-max-threads-per-block=n or its default value for HIP.
340  const unsigned OpenCLDefaultMaxWorkGroupSize = 256;
341  const unsigned DefaultMaxWorkGroupSize =
342  IsOpenCLKernel ? OpenCLDefaultMaxWorkGroupSize
343  : M.getLangOpts().GPUMaxThreadsPerBlock;
344  std::string AttrVal =
345  std::string("1,") + llvm::utostr(DefaultMaxWorkGroupSize);
346  F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
347  }
348 
349  if (const auto *Attr = FD->getAttr<AMDGPUWavesPerEUAttr>())
350  M.handleAMDGPUWavesPerEUAttr(F, Attr);
351 
352  if (const auto *Attr = FD->getAttr<AMDGPUNumSGPRAttr>()) {
353  unsigned NumSGPR = Attr->getNumSGPR();
354 
355  if (NumSGPR != 0)
356  F->addFnAttr("amdgpu-num-sgpr", llvm::utostr(NumSGPR));
357  }
358 
359  if (const auto *Attr = FD->getAttr<AMDGPUNumVGPRAttr>()) {
360  uint32_t NumVGPR = Attr->getNumVGPR();
361 
362  if (NumVGPR != 0)
363  F->addFnAttr("amdgpu-num-vgpr", llvm::utostr(NumVGPR));
364  }
365 
366  if (const auto *Attr = FD->getAttr<AMDGPUMaxNumWorkGroupsAttr>()) {
367  uint32_t X = Attr->getMaxNumWorkGroupsX()
368  ->EvaluateKnownConstInt(M.getContext())
369  .getExtValue();
370  // Y and Z dimensions default to 1 if not specified
371  uint32_t Y = Attr->getMaxNumWorkGroupsY()
372  ? Attr->getMaxNumWorkGroupsY()
373  ->EvaluateKnownConstInt(M.getContext())
374  .getExtValue()
375  : 1;
376  uint32_t Z = Attr->getMaxNumWorkGroupsZ()
377  ? Attr->getMaxNumWorkGroupsZ()
378  ->EvaluateKnownConstInt(M.getContext())
379  .getExtValue()
380  : 1;
381 
382  llvm::SmallString<32> AttrVal;
383  llvm::raw_svector_ostream OS(AttrVal);
384  OS << X << ',' << Y << ',' << Z;
385 
386  F->addFnAttr("amdgpu-max-num-workgroups", AttrVal.str());
387  }
388 }
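// --- Illustrative example (editor's addition, not part of AMDGPU.cpp) ---
// Rough sketch of how the source-level attributes handled above surface as IR
// function attributes (the kernel and attribute values are hypothetical):
//
//   __attribute__((amdgpu_flat_work_group_size(64, 256)))
//   __attribute__((amdgpu_waves_per_eu(2, 4)))
//   __kernel void k(__global int *p) { ... }
//
// is expected to produce, roughly:
//
//   attributes #0 = { "amdgpu-flat-work-group-size"="64,256"
//                     "amdgpu-waves-per-eu"="2,4" ... }
//
// An OpenCL or HIP kernel without an explicit attribute instead gets the
// default "amdgpu-flat-work-group-size"="1,<max>" described in the code above.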
389 
390 /// Helper function for AMDGCN and NVVM targets, adds a NamedMDNode with GV,
391 /// Name, and Operand as operands, and adds the resulting MDNode to the
392 /// AnnotationName MDNode.
393 static void addAMDGCOrNVVMMetadata(const char *AnnotationName,
394  llvm::GlobalValue *GV, StringRef Name,
395  int Operand) {
396  llvm::Module *M = GV->getParent();
397  llvm::LLVMContext &Ctx = M->getContext();
398 
399  // Get annotations metadata node.
400  llvm::NamedMDNode *MD = M->getOrInsertNamedMetadata(AnnotationName);
401 
402  llvm::Metadata *MDVals[] = {
403  llvm::ConstantAsMetadata::get(GV), llvm::MDString::get(Ctx, Name),
404  llvm::ConstantAsMetadata::get(
405  llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), Operand))};
406  // Append metadata to annotations node.
407  MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
408 }
409 
410 
411 void AMDGPUTargetCodeGenInfo::addAMDGCNMetadata(llvm::GlobalValue *GV,
412  StringRef Name, int Operand) {
413  addAMDGCOrNVVMMetadata("amdgcn.annotations", GV, Name, Operand);
414 }
415 
416 
417 /// Emits control constants used to change per-architecture behaviour in the
418 /// AMDGPU ROCm device libraries.
419 void AMDGPUTargetCodeGenInfo::emitTargetGlobals(
420  CodeGen::CodeGenModule &CGM) const {
421  StringRef Name = "__oclc_ABI_version";
422  llvm::GlobalVariable *OriginalGV = CGM.getModule().getNamedGlobal(Name);
423  if (OriginalGV && !llvm::GlobalVariable::isExternalLinkage(OriginalGV->getLinkage()))
424  return;
425 
426  if (CGM.getTarget().getTargetOpts().CodeObjectVersion ==
427  llvm::CodeObjectVersionKind::COV_None)
428  return;
429 
430  auto *Type = llvm::IntegerType::getIntNTy(CGM.getModule().getContext(), 32);
431  llvm::Constant *COV = llvm::ConstantInt::get(
432  Type, CGM.getTarget().getTargetOpts().CodeObjectVersion);
433 
434  // It needs to be constant weak_odr without externally_initialized so that
435  // the load instruction can be eliminated by the IPSCCP.
436  auto *GV = new llvm::GlobalVariable(
437  CGM.getModule(), Type, true, llvm::GlobalValue::WeakODRLinkage, COV, Name,
438  nullptr, llvm::GlobalValue::ThreadLocalMode::NotThreadLocal,
439  CGM.getContext().getTargetAddressSpace(LangAS::opencl_constant));
440  GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Local);
441  GV->setVisibility(llvm::GlobalValue::VisibilityTypes::HiddenVisibility);
442 
443  // Replace any external references to this variable with the new global.
444  if (OriginalGV) {
445  OriginalGV->replaceAllUsesWith(GV);
446  GV->takeName(OriginalGV);
447  OriginalGV->eraseFromParent();
448  }
449 }
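// --- Illustrative example (editor's addition, not part of AMDGPU.cpp) ---
// Sketch of the control constant emitted above: when compiling for code object
// v5, the module is expected to end up with a hidden weak_odr constant that the
// ROCm device libraries read, roughly:
//
//   @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4)
//       constant i32 500
//
// Any pre-existing external declaration of the symbol is replaced with this
// definition, as the code above shows.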
450 
451 void AMDGPUTargetCodeGenInfo::setTargetAttributes(
452  const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
454  GV->setVisibility(llvm::GlobalValue::ProtectedVisibility);
455  GV->setDSOLocal(true);
456  }
457 
458  if (GV->isDeclaration())
459  return;
460 
461  llvm::Function *F = dyn_cast<llvm::Function>(GV);
462  if (!F)
463  return;
464 
465  const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D);
466  if (FD)
467  setFunctionDeclAttributes(FD, F, M);
468 
469  // Create !{<func-ref>, metadata !"kernel", i32 1} node for SYCL kernels.
470  const bool IsSYCLKernel =
471  FD && M.getLangOpts().SYCLIsDevice && FD->hasAttr<SYCLKernelAttr>();
472  if (IsSYCLKernel)
473  addAMDGCNMetadata(F, "kernel", 1);
474 
475  if (M.getContext().getTargetInfo().allowAMDGPUUnsafeFPAtomics())
476  F->addFnAttr("amdgpu-unsafe-fp-atomics", "true");
477 
478  if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts)
479  F->addFnAttr("amdgpu-ieee", "false");
480 }
481 
482 unsigned AMDGPUTargetCodeGenInfo::getOpenCLKernelCallingConv() const {
483  return llvm::CallingConv::AMDGPU_KERNEL;
484 }
485 
486 // Currently LLVM assumes null pointers always have value 0,
487 // which results in incorrectly transformed IR. Therefore, instead of
488 // emitting null pointers in private and local address spaces, a null
489  // pointer in generic address space is emitted which is cast to a
490 // pointer in local or private address space.
491 llvm::Constant *AMDGPUTargetCodeGenInfo::getNullPointer(
492  const CodeGen::CodeGenModule &CGM, llvm::PointerType *PT,
493  QualType QT) const {
494  if (CGM.getContext().getTargetNullPointerValue(QT) == 0)
495  return llvm::ConstantPointerNull::get(PT);
496 
497  auto &Ctx = CGM.getContext();
498  auto NPT = llvm::PointerType::get(
499  PT->getContext(), Ctx.getTargetAddressSpace(LangAS::opencl_generic));
500  return llvm::ConstantExpr::getAddrSpaceCast(
501  llvm::ConstantPointerNull::get(NPT), PT);
502 }
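// --- Illustrative example (editor's addition, not part of AMDGPU.cpp) ---
// Sketch of the null-pointer handling above: on AMDGPU the private and local
// address spaces use a non-zero (all-ones) null value, so a null pointer
// constant in those spaces is emitted as an addrspacecast of the generic null
// rather than a literal zero:
//
//   ; OpenCL source: __local int *p = 0;
//   ; stores addrspacecast (ptr null to ptr addrspace(3)) into %p
//
// For address spaces whose null value is 0 (e.g. generic or global), a plain
// ConstantPointerNull is used instead.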
503 
504 LangAS
505 AMDGPUTargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM,
506  const VarDecl *D) const {
507  assert(!CGM.getLangOpts().OpenCL &&
508  !(CGM.getLangOpts().CUDA && CGM.getLangOpts().CUDAIsDevice) &&
509  "Address space agnostic languages only");
510  LangAS DefaultGlobalAS = getLangASFromTargetAS(
511  CGM.getContext().getTargetAddressSpace(LangAS::Default));
512  if (!D)
513  return DefaultGlobalAS;
514 
515  LangAS AddrSpace = D->getType().getAddressSpace();
516  if (AddrSpace != LangAS::Default)
517  return AddrSpace;
518 
519  // Only promote to address space 4 if VarDecl has constant initialization.
520  if (D->getType().isConstantStorage(CGM.getContext(), false, false) &&
521  D->hasConstantInitialization()) {
522  if (auto ConstAS = CGM.getTarget().getConstantAddressSpace())
523  return *ConstAS;
524  }
525  return DefaultGlobalAS;
526 }
527 
528 llvm::SyncScope::ID
529 AMDGPUTargetCodeGenInfo::getLLVMSyncScopeID(const LangOptions &LangOpts,
530  SyncScope Scope,
531  llvm::AtomicOrdering Ordering,
532  llvm::LLVMContext &Ctx) const {
533  std::string Name;
534  switch (Scope) {
535  case SyncScope::HIPSingleThread:
536  case SyncScope::SingleScope:
537  Name = "singlethread";
538  break;
539  case SyncScope::HIPWavefront:
540  case SyncScope::OpenCLSubGroup:
541  case SyncScope::WavefrontScope:
542  Name = "wavefront";
543  break;
544  case SyncScope::HIPWorkgroup:
545  case SyncScope::OpenCLWorkGroup:
546  case SyncScope::WorkgroupScope:
547  Name = "workgroup";
548  break;
549  case SyncScope::HIPAgent:
550  case SyncScope::OpenCLDevice:
551  case SyncScope::DeviceScope:
552  Name = "agent";
553  break;
554  case SyncScope::SystemScope:
555  case SyncScope::HIPSystem:
556  case SyncScope::OpenCLAllSVMDevices:
557  Name = "";
558  break;
559  }
560 
561  if (Ordering != llvm::AtomicOrdering::SequentiallyConsistent) {
562  if (!Name.empty())
563  Name = Twine(Twine(Name) + Twine("-")).str();
564 
565  Name = Twine(Twine(Name) + Twine("one-as")).str();
566  }
567 
568  return Ctx.getOrInsertSyncScopeID(Name);
569 }
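// --- Illustrative example (editor's addition, not part of AMDGPU.cpp) ---
// Sketch of the sync-scope naming above: an OpenCL work_group-scope atomic with
// a relaxed (non-seq_cst) ordering maps to the "workgroup-one-as" LLVM sync
// scope, while the same atomic with memory_order_seq_cst maps to "workgroup":
//
//   %r = atomicrmw add ptr addrspace(3) %p, i32 1 syncscope("workgroup-one-as") monotonic
//   %r = atomicrmw add ptr addrspace(3) %p, i32 1 syncscope("workgroup") seq_cst
//
// A seq_cst system-scope atomic falls into the last case and uses the default
// (empty-named) sync scope.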
570 
571 bool AMDGPUTargetCodeGenInfo::shouldEmitStaticExternCAliases() const {
572  return false;
573 }
574 
575 bool AMDGPUTargetCodeGenInfo::shouldEmitDWARFBitFieldSeparators() const {
576  return true;
577 }
578 
579 void AMDGPUTargetCodeGenInfo::setCUDAKernelCallingConvention(
580  const FunctionType *&FT) const {
581  FT = getABIInfo().getContext().adjustFunctionType(
582  FT, FT->getExtInfo().withCallingConv(CC_OpenCLKernel));
583 }
584 
585 /// Create an OpenCL kernel for an enqueued block.
586 ///
587 /// The type of the first argument (the block literal) is the struct type
588 /// of the block literal instead of a pointer type. The first argument
589 /// (block literal) is passed directly by value to the kernel. The kernel
590 /// allocates the same type of struct on stack and stores the block literal
591 /// to it and passes its pointer to the block invoke function. The kernel
592 /// has "enqueued-block" function attribute and kernel argument metadata.
593 llvm::Value *AMDGPUTargetCodeGenInfo::createEnqueuedBlockKernel(
594  CodeGenFunction &CGF, llvm::Function *Invoke, llvm::Type *BlockTy) const {
595  auto &Builder = CGF.Builder;
596  auto &C = CGF.getLLVMContext();
597 
598  auto *InvokeFT = Invoke->getFunctionType();
599  llvm::SmallVector<llvm::Type *, 2> ArgTys;
600  llvm::SmallVector<llvm::Metadata *, 8> AddressQuals;
601  llvm::SmallVector<llvm::Metadata *, 8> AccessQuals;
602  llvm::SmallVector<llvm::Metadata *, 8> ArgBaseTypeNames;
603  llvm::SmallVector<llvm::Metadata *, 8> ArgTypeNames;
604  llvm::SmallVector<llvm::Metadata *, 8> ArgTypeQuals;
605  llvm::SmallVector<llvm::Metadata *, 8> ArgNames;
606 
607  ArgTys.push_back(BlockTy);
608  ArgTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
609  AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(0)));
610  ArgBaseTypeNames.push_back(llvm::MDString::get(C, "__block_literal"));
611  ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
612  AccessQuals.push_back(llvm::MDString::get(C, "none"));
613  ArgNames.push_back(llvm::MDString::get(C, "block_literal"));
614  for (unsigned I = 1, E = InvokeFT->getNumParams(); I < E; ++I) {
615  ArgTys.push_back(InvokeFT->getParamType(I));
616  ArgTypeNames.push_back(llvm::MDString::get(C, "void*"));
617  AddressQuals.push_back(llvm::ConstantAsMetadata::get(Builder.getInt32(3)));
618  AccessQuals.push_back(llvm::MDString::get(C, "none"));
619  ArgBaseTypeNames.push_back(llvm::MDString::get(C, "void*"));
620  ArgTypeQuals.push_back(llvm::MDString::get(C, ""));
621  ArgNames.push_back(
622  llvm::MDString::get(C, (Twine("local_arg") + Twine(I)).str()));
623  }
624  std::string Name = Invoke->getName().str() + "_kernel";
625  auto *FT = llvm::FunctionType::get(llvm::Type::getVoidTy(C), ArgTys, false);
626  auto *F = llvm::Function::Create(FT, llvm::GlobalValue::InternalLinkage, Name,
627  &CGF.CGM.getModule());
628  F->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
629 
630  llvm::AttrBuilder KernelAttrs(C);
631  // FIXME: The invoke isn't applying the right attributes either
632  // FIXME: This is missing setTargetAttributes
633  CGF.CGM.addDefaultFunctionDefinitionAttributes(KernelAttrs);
634  KernelAttrs.addAttribute("enqueued-block");
635  F->addFnAttrs(KernelAttrs);
636 
637  auto IP = CGF.Builder.saveIP();
638  auto *BB = llvm::BasicBlock::Create(C, "entry", F);
639  Builder.SetInsertPoint(BB);
640  const auto BlockAlign = CGF.CGM.getDataLayout().getPrefTypeAlign(BlockTy);
641  auto *BlockPtr = Builder.CreateAlloca(BlockTy, nullptr);
642  BlockPtr->setAlignment(BlockAlign);
643  Builder.CreateAlignedStore(F->arg_begin(), BlockPtr, BlockAlign);
644  auto *Cast = Builder.CreatePointerCast(BlockPtr, InvokeFT->getParamType(0));
645  llvm::SmallVector<llvm::Value *, 2> Args;
646  Args.push_back(Cast);
647  for (llvm::Argument &A : llvm::drop_begin(F->args()))
648  Args.push_back(&A);
649  llvm::CallInst *call = Builder.CreateCall(Invoke, Args);
650  call->setCallingConv(Invoke->getCallingConv());
651  Builder.CreateRetVoid();
652  Builder.restoreIP(IP);
653 
654  F->setMetadata("kernel_arg_addr_space", llvm::MDNode::get(C, AddressQuals));
655  F->setMetadata("kernel_arg_access_qual", llvm::MDNode::get(C, AccessQuals));
656  F->setMetadata("kernel_arg_type", llvm::MDNode::get(C, ArgTypeNames));
657  F->setMetadata("kernel_arg_base_type",
658  llvm::MDNode::get(C, ArgBaseTypeNames));
659  F->setMetadata("kernel_arg_type_qual", llvm::MDNode::get(C, ArgTypeQuals));
660  if (CGF.CGM.getCodeGenOpts().EmitOpenCLArgMetadata)
661  F->setMetadata("kernel_arg_name", llvm::MDNode::get(C, ArgNames));
662 
663  return F;
664 }
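// --- Illustrative example (editor's addition, not part of AMDGPU.cpp) ---
// Sketch of the wrapper this function builds, for a hypothetical block invoke
// function @__block_invoke taking the block literal pointer plus one local
// pointer argument (address-space casts and alignments omitted):
//
//   define internal amdgpu_kernel void @__block_invoke_kernel(
//       %block.literal %lit, ptr addrspace(3) %local_arg1) #0 {
//   entry:
//     %block = alloca %block.literal
//     store %block.literal %lit, ptr %block
//     call void @__block_invoke(ptr %block, ptr addrspace(3) %local_arg1)
//     ret void
//   }
//
//   attributes #0 = { "enqueued-block" ... }
//
// plus the kernel_arg_* metadata attached at the end of the function above.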
665 
666 void CodeGenModule::handleAMDGPUFlatWorkGroupSizeAttr(
667  llvm::Function *F, const AMDGPUFlatWorkGroupSizeAttr *FlatWGS,
668  const ReqdWorkGroupSizeAttr *ReqdWGS, int32_t *MinThreadsVal,
669  int32_t *MaxThreadsVal) {
670  unsigned Min = 0;
671  unsigned Max = 0;
672  if (FlatWGS) {
673  Min = FlatWGS->getMin()->EvaluateKnownConstInt(getContext()).getExtValue();
674  Max = FlatWGS->getMax()->EvaluateKnownConstInt(getContext()).getExtValue();
675  }
676  if (ReqdWGS && Min == 0 && Max == 0)
677  Min = Max = ReqdWGS->getXDim() * ReqdWGS->getYDim() * ReqdWGS->getZDim();
678 
679  if (Min != 0) {
680  assert(Min <= Max && "Min must be less than or equal Max");
681 
682  if (MinThreadsVal)
683  *MinThreadsVal = Min;
684  if (MaxThreadsVal)
685  *MaxThreadsVal = Max;
686  std::string AttrVal = llvm::utostr(Min) + "," + llvm::utostr(Max);
687  if (F)
688  F->addFnAttr("amdgpu-flat-work-group-size", AttrVal);
689  } else
690  assert(Max == 0 && "Max must be zero");
691 }
692 
693 void CodeGenModule::handleAMDGPUWavesPerEUAttr(
694  llvm::Function *F, const AMDGPUWavesPerEUAttr *Attr) {
695  unsigned Min =
696  Attr->getMin()->EvaluateKnownConstInt(getContext()).getExtValue();
697  unsigned Max =
698  Attr->getMax()
699  ? Attr->getMax()->EvaluateKnownConstInt(getContext()).getExtValue()
700  : 0;
701 
702  if (Min != 0) {
703  assert((Max == 0 || Min <= Max) && "Min must be less than or equal Max");
704 
705  std::string AttrVal = llvm::utostr(Min);
706  if (Max != 0)
707  AttrVal = AttrVal + "," + llvm::utostr(Max);
708  F->addFnAttr("amdgpu-waves-per-eu", AttrVal);
709  } else
710  assert(Max == 0 && "Max must be zero");
711 }
712 
713 std::unique_ptr<TargetCodeGenInfo>
714 CodeGen::createAMDGPUTargetCodeGenInfo(CodeGenModule &CGM) {
715  return std::make_unique<AMDGPUTargetCodeGenInfo>(CGM.getTypes());
716 }