/Users/buildslave/jenkins/workspace/coverage/llvm-project/clang/lib/CodeGen/CGCUDANV.cpp
Line | Count | Source |
1 | | //===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===// |
2 | | // |
3 | | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | | // See https://llvm.org/LICENSE.txt for license information. |
5 | | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | | // |
7 | | //===----------------------------------------------------------------------===// |
8 | | // |
9 | | // This provides a class for CUDA code generation targeting the NVIDIA CUDA |
10 | | // runtime library. |
11 | | // |
12 | | //===----------------------------------------------------------------------===// |
13 | | |
14 | | #include "CGCUDARuntime.h" |
15 | | #include "CGCXXABI.h" |
16 | | #include "CodeGenFunction.h" |
17 | | #include "CodeGenModule.h" |
18 | | #include "clang/AST/Decl.h" |
19 | | #include "clang/Basic/Cuda.h" |
20 | | #include "clang/CodeGen/CodeGenABITypes.h" |
21 | | #include "clang/CodeGen/ConstantInitBuilder.h" |
22 | | #include "llvm/IR/BasicBlock.h" |
23 | | #include "llvm/IR/Constants.h" |
24 | | #include "llvm/IR/DerivedTypes.h" |
25 | | #include "llvm/IR/ReplaceConstant.h" |
26 | | #include "llvm/Support/Format.h" |
27 | | |
28 | | using namespace clang; |
29 | | using namespace CodeGen; |
30 | | |
31 | | namespace { |
32 | | constexpr unsigned CudaFatMagic = 0x466243b1; |
33 | | constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF" |
34 | | |
35 | | class CGNVCUDARuntime : public CGCUDARuntime { |
36 | | |
37 | | private: |
38 | | llvm::IntegerType *IntTy, *SizeTy; |
39 | | llvm::Type *VoidTy; |
40 | | llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy; |
41 | | |
42 | | /// Convenience reference to LLVM Context |
43 | | llvm::LLVMContext &Context; |
44 | | /// Convenience reference to the current module |
45 | | llvm::Module &TheModule; |
46 | | /// Keeps track of kernel launch stubs and handles emitted in this module |
47 | | struct KernelInfo { |
48 | | llvm::Function *Kernel; // stub function to help launch kernel |
49 | | const Decl *D; |
50 | | }; |
51 | | llvm::SmallVector<KernelInfo, 16> EmittedKernels; |
52 | | // Map a device stub function to a symbol used to identify the kernel in
53 | | // host code. For CUDA, that symbol is the device stub function itself; for
54 | | // HIP, they are different.
55 | | llvm::DenseMap<llvm::Function *, llvm::GlobalValue *> KernelHandles; |
56 | | // Map a kernel handle to the kernel stub. |
57 | | llvm::DenseMap<llvm::GlobalValue *, llvm::Function *> KernelStubs; |
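| | // (For CUDA, getKernelHandle() below maps a stub F to itself, i.e.
| | // KernelHandles[F] == F; for HIP it creates a separate global variable
| | // to serve as the handle.)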
58 | | struct VarInfo { |
59 | | llvm::GlobalVariable *Var; |
60 | | const VarDecl *D; |
61 | | DeviceVarFlags Flags; |
62 | | }; |
63 | | llvm::SmallVector<VarInfo, 16> DeviceVars; |
64 | | /// Keeps track of the variable containing the handle of the GPU binary.
65 | | /// Populated by makeModuleCtorFunction() and used to create corresponding
66 | | /// cleanup calls in makeModuleDtorFunction().
67 | | llvm::GlobalVariable *GpuBinaryHandle = nullptr; |
68 | | /// Whether we generate relocatable device code. |
69 | | bool RelocatableDeviceCode; |
70 | | /// Mangle context for device. |
71 | | std::unique_ptr<MangleContext> DeviceMC; |
72 | | |
73 | | llvm::FunctionCallee getSetupArgumentFn() const; |
74 | | llvm::FunctionCallee getLaunchFn() const; |
75 | | |
76 | | llvm::FunctionType *getRegisterGlobalsFnTy() const; |
77 | | llvm::FunctionType *getCallbackFnTy() const; |
78 | | llvm::FunctionType *getRegisterLinkedBinaryFnTy() const; |
79 | | std::string addPrefixToName(StringRef FuncName) const; |
80 | | std::string addUnderscoredPrefixToName(StringRef FuncName) const; |
81 | | |
82 | | /// Creates a function to register all kernel stubs generated in this module. |
83 | | llvm::Function *makeRegisterGlobalsFn(); |
84 | | |
85 | | /// Helper function that generates a constant string and returns a pointer to |
86 | | /// the start of the string. The result of this function can be used anywhere |
87 | | /// the C code specifies a const char*.
88 | | llvm::Constant *makeConstantString(const std::string &Str, |
89 | | const std::string &Name = "", |
90 | | const std::string &SectionName = "", |
91 | 214 | unsigned Alignment = 0) { |
92 | 214 | llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0), |
93 | 214 | llvm::ConstantInt::get(SizeTy, 0)}; |
94 | 214 | auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str()); |
95 | 214 | llvm::GlobalVariable *GV = |
96 | 214 | cast<llvm::GlobalVariable>(ConstStr.getPointer()); |
97 | 214 | if (!SectionName.empty()) { |
98 | 24 | GV->setSection(SectionName); |
99 | | // Mark the address as used, which makes sure that this section isn't
100 | | // merged away and we will really have it in the object file.
101 | 24 | GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None); |
102 | 24 | } |
103 | 214 | if (Alignment) |
104 | 24 | GV->setAlignment(llvm::Align(Alignment)); |
105 | | |
106 | 214 | return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(), |
107 | 214 | ConstStr.getPointer(), Zeros); |
108 | 214 | } |
109 | | |
110 | | /// Helper function that generates an empty dummy function returning void. |
111 | 4 | llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) { |
112 | 4 | assert(FnTy->getReturnType()->isVoidTy() && |
113 | 4 | "Can only generate dummy functions returning void!"); |
114 | 0 | llvm::Function *DummyFunc = llvm::Function::Create( |
115 | 4 | FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule); |
116 | | |
117 | 4 | llvm::BasicBlock *DummyBlock = |
118 | 4 | llvm::BasicBlock::Create(Context, "", DummyFunc); |
119 | 4 | CGBuilderTy FuncBuilder(CGM, Context); |
120 | 4 | FuncBuilder.SetInsertPoint(DummyBlock); |
121 | 4 | FuncBuilder.CreateRetVoid(); |
122 | | |
123 | 4 | return DummyFunc; |
124 | 4 | } |
125 | | |
126 | | void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args); |
127 | | void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args); |
128 | | std::string getDeviceSideName(const NamedDecl *ND) override; |
129 | | |
130 | | void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var, |
131 | 302 | bool Extern, bool Constant) { |
132 | 302 | DeviceVars.push_back({&Var, |
133 | 302 | VD, |
134 | 302 | {DeviceVarFlags::Variable, Extern, Constant, |
135 | 302 | VD->hasAttr<HIPManagedAttr>(), |
136 | 302 | /*Normalized*/ false, 0}}); |
137 | 302 | } |
138 | | void registerDeviceSurf(const VarDecl *VD, llvm::GlobalVariable &Var, |
139 | 0 | bool Extern, int Type) { |
140 | 0 | DeviceVars.push_back({&Var, |
141 | 0 | VD, |
142 | 0 | {DeviceVarFlags::Surface, Extern, /*Constant*/ false, |
143 | 0 | /*Managed*/ false, |
144 | 0 | /*Normalized*/ false, Type}}); |
145 | 0 | } |
146 | | void registerDeviceTex(const VarDecl *VD, llvm::GlobalVariable &Var, |
147 | 0 | bool Extern, int Type, bool Normalized) { |
148 | 0 | DeviceVars.push_back({&Var, |
149 | 0 | VD, |
150 | 0 | {DeviceVarFlags::Texture, Extern, /*Constant*/ false, |
151 | 0 | /*Managed*/ false, Normalized, Type}}); |
152 | 0 | } |
153 | | |
154 | | /// Creates module constructor function |
155 | | llvm::Function *makeModuleCtorFunction(); |
156 | | /// Creates module destructor function |
157 | | llvm::Function *makeModuleDtorFunction(); |
158 | | /// Transform managed variables for device compilation. |
159 | | void transformManagedVars(); |
160 | | /// Create offloading entries to register globals in RDC mode. |
161 | | void createOffloadingEntries(); |
162 | | |
163 | | public: |
164 | | CGNVCUDARuntime(CodeGenModule &CGM); |
165 | | |
166 | | llvm::GlobalValue *getKernelHandle(llvm::Function *F, GlobalDecl GD) override; |
167 | 59 | llvm::Function *getKernelStub(llvm::GlobalValue *Handle) override { |
168 | 59 | auto Loc = KernelStubs.find(Handle); |
169 | 59 | assert(Loc != KernelStubs.end()); |
170 | 0 | return Loc->second; |
171 | 59 | } |
172 | | void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override; |
173 | | void handleVarRegistration(const VarDecl *VD, |
174 | | llvm::GlobalVariable &Var) override; |
175 | | void |
176 | | internalizeDeviceSideVar(const VarDecl *D, |
177 | | llvm::GlobalValue::LinkageTypes &Linkage) override; |
178 | | |
179 | | llvm::Function *finalizeModule() override; |
180 | | }; |
181 | | |
182 | | } // end anonymous namespace |
183 | | |
184 | 87 | std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const { |
185 | 87 | if (CGM.getLangOpts().HIP) |
186 | 62 | return ((Twine("hip") + Twine(FuncName)).str()); |
187 | 25 | return ((Twine("cuda") + Twine(FuncName)).str()); |
188 | 87 | } |
189 | | std::string |
190 | 562 | CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const { |
191 | 562 | if (CGM.getLangOpts().HIP) |
192 | 454 | return ((Twine("__hip") + Twine(FuncName)).str()); |
193 | 108 | return ((Twine("__cuda") + Twine(FuncName)).str()); |
194 | 562 | } |
195 | | |
196 | 216 | static std::unique_ptr<MangleContext> InitDeviceMC(CodeGenModule &CGM) { |
197 | | // If the host and device have different C++ ABIs, create a device mangle
198 | | // context so that mangling retrieves the device-side lambda mangling
199 | | // number instead of the regular host one.
200 | 216 | if (CGM.getContext().getAuxTargetInfo() && |
201 | 216 | CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() &&
202 | 216 | CGM.getContext().getAuxTargetInfo()->getCXXABI().isItaniumFamily()) {
203 | 8 | return std::unique_ptr<MangleContext>( |
204 | 8 | CGM.getContext().createDeviceMangleContext( |
205 | 8 | *CGM.getContext().getAuxTargetInfo())); |
206 | 8 | } |
207 | | |
208 | 208 | return std::unique_ptr<MangleContext>(CGM.getContext().createMangleContext( |
209 | 208 | CGM.getContext().getAuxTargetInfo())); |
210 | 216 | } |
211 | | |
212 | | CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM) |
213 | | : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()), |
214 | | TheModule(CGM.getModule()), |
215 | | RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode), |
216 | 216 | DeviceMC(InitDeviceMC(CGM)) { |
217 | 216 | CodeGen::CodeGenTypes &Types = CGM.getTypes(); |
218 | 216 | ASTContext &Ctx = CGM.getContext(); |
219 | | |
220 | 216 | IntTy = CGM.IntTy; |
221 | 216 | SizeTy = CGM.SizeTy; |
222 | 216 | VoidTy = CGM.VoidTy; |
223 | | |
224 | 216 | CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy)); |
225 | 216 | VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy)); |
226 | 216 | VoidPtrPtrTy = VoidPtrTy->getPointerTo(); |
227 | 216 | } |
228 | | |
229 | 78 | llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const { |
230 | | // cudaError_t cudaSetupArgument(void *, size_t, size_t) |
231 | 78 | llvm::Type *Params[] = {VoidPtrTy, SizeTy, SizeTy}; |
232 | 78 | return CGM.CreateRuntimeFunction( |
233 | 78 | llvm::FunctionType::get(IntTy, Params, false), |
234 | 78 | addPrefixToName("SetupArgument")); |
235 | 78 | } |
236 | | |
237 | 78 | llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const { |
238 | 78 | if (CGM.getLangOpts().HIP) { |
239 | | // hipError_t hipLaunchByPtr(char *); |
240 | 59 | return CGM.CreateRuntimeFunction( |
241 | 59 | llvm::FunctionType::get(IntTy, CharPtrTy, false), "hipLaunchByPtr"); |
242 | 59 | } |
243 | | // cudaError_t cudaLaunch(char *); |
244 | 19 | return CGM.CreateRuntimeFunction( |
245 | 19 | llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch"); |
246 | 78 | } |
247 | | |
248 | 55 | llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const { |
249 | 55 | return llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false); |
250 | 55 | } |
251 | | |
252 | 8 | llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const { |
253 | 8 | return llvm::FunctionType::get(VoidTy, VoidPtrTy, false); |
254 | 8 | } |
255 | | |
256 | 4 | llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const { |
257 | 4 | auto *CallbackFnTy = getCallbackFnTy(); |
258 | 4 | auto *RegisterGlobalsFnTy = getRegisterGlobalsFnTy(); |
259 | 4 | llvm::Type *Params[] = {RegisterGlobalsFnTy->getPointerTo(), VoidPtrTy, |
260 | 4 | VoidPtrTy, CallbackFnTy->getPointerTo()}; |
261 | 4 | return llvm::FunctionType::get(VoidTy, Params, false); |
262 | 4 | } |
263 | | |
264 | 310 | std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) { |
265 | 310 | GlobalDecl GD; |
266 | | // ND could be either a kernel or a variable.
267 | 310 | if (auto *FD = dyn_cast<FunctionDecl>(ND)) |
268 | 188 | GD = GlobalDecl(FD, KernelReferenceKind::Kernel); |
269 | 122 | else |
270 | 122 | GD = GlobalDecl(ND); |
271 | 310 | std::string DeviceSideName; |
272 | 310 | MangleContext *MC; |
273 | 310 | if (CGM.getLangOpts().CUDAIsDevice) |
274 | 0 | MC = &CGM.getCXXABI().getMangleContext(); |
275 | 310 | else |
276 | 310 | MC = DeviceMC.get(); |
277 | 310 | if (MC->shouldMangleDeclName(ND)) { |
278 | 213 | SmallString<256> Buffer; |
279 | 213 | llvm::raw_svector_ostream Out(Buffer); |
280 | 213 | MC->mangleName(GD, Out); |
281 | 213 | DeviceSideName = std::string(Out.str()); |
282 | 213 | } else |
283 | 97 | DeviceSideName = std::string(ND->getIdentifier()->getName()); |
284 | | |
285 | | // Make unique name for device side static file-scope variable for HIP. |
286 | 310 | if (CGM.getContext().shouldExternalize(ND) && |
287 | 310 | CGM.getLangOpts().GPURelocatableDeviceCode) {
288 | 19 | SmallString<256> Buffer; |
289 | 19 | llvm::raw_svector_ostream Out(Buffer); |
290 | 19 | Out << DeviceSideName; |
291 | 19 | CGM.printPostfixForExternalizedDecl(Out, ND); |
292 | 19 | DeviceSideName = std::string(Out.str()); |
293 | 19 | } |
294 | 310 | return DeviceSideName; |
295 | 310 | } |
296 | | |
297 | | void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF, |
298 | 87 | FunctionArgList &Args) { |
299 | 87 | EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl}); |
300 | 87 | if (auto *GV = dyn_cast<llvm::GlobalVariable>(KernelHandles[CGF.CurFn])) { |
301 | 62 | GV->setLinkage(CGF.CurFn->getLinkage()); |
302 | 62 | GV->setInitializer(CGF.CurFn); |
303 | 62 | } |
304 | 87 | if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(), |
305 | 87 | CudaFeature::CUDA_USES_NEW_LAUNCH) || |
306 | 87 | (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI))
307 | 9 | emitDeviceStubBodyNew(CGF, Args); |
308 | 78 | else |
309 | 78 | emitDeviceStubBodyLegacy(CGF, Args); |
310 | 87 | } |
311 | | |
312 | | // CUDA 9.0+ uses a new way to launch kernels. Parameters are packed in a
313 | | // local array and kernels are launched using cudaLaunchKernel().
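| | // In source terms, the emitted stub corresponds roughly to the following
| | // (an illustrative sketch; kernel `kern`, its parameters, and
| | // `kern_handle` -- the stub itself for CUDA, a separate global for HIP --
| | // are hypothetical names):
| | // \code
| | // void __device_stub__kern(int a, float *p) {
| | //   void *kernel_args[] = {&a, &p};
| | //   dim3 grid_dim, block_dim; size_t shmem_size; cudaStream_t stream;
| | //   __cudaPopCallConfiguration(&grid_dim, &block_dim, &shmem_size, &stream);
| | //   cudaLaunchKernel((const void *)kern_handle, grid_dim, block_dim,
| | //                    kernel_args, shmem_size, stream);
| | // }
| | // \endcode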
314 | | void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF, |
315 | 9 | FunctionArgList &Args) { |
316 | | // Build the shadow stack entry at the very start of the function. |
317 | | |
318 | | // Calculate the amount of space we will need for all arguments. If we have
319 | | // no args, allocate a single pointer so we still have a valid pointer to
320 | | // the argument array that we can pass to the runtime, even if unused.
321 | 9 | Address KernelArgs = CGF.CreateTempAlloca( |
322 | 9 | VoidPtrTy, CharUnits::fromQuantity(16), "kernel_args", |
323 | 9 | llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size()))); |
324 | | // Store pointers to the arguments in the locally allocated kernel_args.
325 | 28 | for (unsigned i = 0; i < Args.size(); ++i) {
326 | 19 | llvm::Value* VarPtr = CGF.GetAddrOfLocalVar(Args[i]).getPointer(); |
327 | 19 | llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, VoidPtrTy); |
328 | 19 | CGF.Builder.CreateDefaultAlignedStore( |
329 | 19 | VoidVarPtr, |
330 | 19 | CGF.Builder.CreateConstGEP1_32(VoidPtrTy, KernelArgs.getPointer(), i)); |
331 | 19 | } |
332 | | |
333 | 9 | llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); |
334 | | |
335 | | // Lookup cudaLaunchKernel/hipLaunchKernel function. |
336 | | // HIP kernel launching API name depends on -fgpu-default-stream option. For |
337 | | // the default value 'legacy', it is hipLaunchKernel. For 'per-thread', |
338 | | // it is hipLaunchKernel_spt. |
339 | | // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, |
340 | | // void **args, size_t sharedMem, |
341 | | // cudaStream_t stream); |
342 | | // hipError_t hipLaunchKernel[_spt](const void *func, dim3 gridDim, |
343 | | // dim3 blockDim, void **args, |
344 | | // size_t sharedMem, hipStream_t stream); |
345 | 9 | TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl(); |
346 | 9 | DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl); |
347 | 9 | std::string KernelLaunchAPI = "LaunchKernel"; |
348 | 9 | if (CGF.getLangOpts().HIP && CGF.getLangOpts().GPUDefaultStream == |
349 | 3 | LangOptions::GPUDefaultStreamKind::PerThread) |
350 | 1 | KernelLaunchAPI = KernelLaunchAPI + "_spt"; |
351 | 9 | auto LaunchKernelName = addPrefixToName(KernelLaunchAPI); |
352 | 9 | IdentifierInfo &cudaLaunchKernelII = |
353 | 9 | CGM.getContext().Idents.get(LaunchKernelName); |
354 | 9 | FunctionDecl *cudaLaunchKernelFD = nullptr; |
355 | 9 | for (auto *Result : DC->lookup(&cudaLaunchKernelII)) { |
356 | 9 | if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result)) |
357 | 9 | cudaLaunchKernelFD = FD; |
358 | 9 | } |
359 | | |
360 | 9 | if (cudaLaunchKernelFD == nullptr) { |
361 | 0 | CGM.Error(CGF.CurFuncDecl->getLocation(), |
362 | 0 | "Can't find declaration for " + LaunchKernelName); |
363 | 0 | return; |
364 | 0 | } |
365 | | // Create temporary dim3 grid_dim, block_dim. |
366 | 9 | ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1); |
367 | 9 | QualType Dim3Ty = GridDimParam->getType(); |
368 | 9 | Address GridDim = |
369 | 9 | CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim"); |
370 | 9 | Address BlockDim = |
371 | 9 | CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim"); |
372 | 9 | Address ShmemSize = |
373 | 9 | CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size"); |
374 | 9 | Address Stream = |
375 | 9 | CGF.CreateTempAlloca(VoidPtrTy, CGM.getPointerAlign(), "stream"); |
376 | 9 | llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction( |
377 | 9 | llvm::FunctionType::get(IntTy, |
378 | 9 | {/*gridDim=*/GridDim.getType(), |
379 | 9 | /*blockDim=*/BlockDim.getType(), |
380 | 9 | /*ShmemSize=*/ShmemSize.getType(), |
381 | 9 | /*Stream=*/Stream.getType()}, |
382 | 9 | /*isVarArg=*/false), |
383 | 9 | addUnderscoredPrefixToName("PopCallConfiguration")); |
384 | | |
385 | 9 | CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn, |
386 | 9 | {GridDim.getPointer(), BlockDim.getPointer(), |
387 | 9 | ShmemSize.getPointer(), Stream.getPointer()}); |
388 | | |
389 | | // Emit the call to cudaLaunch |
390 | 9 | llvm::Value *Kernel = |
391 | 9 | CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn], VoidPtrTy); |
392 | 9 | CallArgList LaunchKernelArgs; |
393 | 9 | LaunchKernelArgs.add(RValue::get(Kernel), |
394 | 9 | cudaLaunchKernelFD->getParamDecl(0)->getType()); |
395 | 9 | LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty); |
396 | 9 | LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty); |
397 | 9 | LaunchKernelArgs.add(RValue::get(KernelArgs.getPointer()), |
398 | 9 | cudaLaunchKernelFD->getParamDecl(3)->getType()); |
399 | 9 | LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)), |
400 | 9 | cudaLaunchKernelFD->getParamDecl(4)->getType()); |
401 | 9 | LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)), |
402 | 9 | cudaLaunchKernelFD->getParamDecl(5)->getType()); |
403 | | |
404 | 9 | QualType QT = cudaLaunchKernelFD->getType(); |
405 | 9 | QualType CQT = QT.getCanonicalType(); |
406 | 9 | llvm::Type *Ty = CGM.getTypes().ConvertType(CQT); |
407 | 9 | llvm::FunctionType *FTy = cast<llvm::FunctionType>(Ty); |
408 | | |
409 | 9 | const CGFunctionInfo &FI = |
410 | 9 | CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD); |
411 | 9 | llvm::FunctionCallee cudaLaunchKernelFn = |
412 | 9 | CGM.CreateRuntimeFunction(FTy, LaunchKernelName); |
413 | 9 | CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(), |
414 | 9 | LaunchKernelArgs); |
415 | 9 | CGF.EmitBranch(EndBlock); |
416 | | |
417 | 9 | CGF.EmitBlock(EndBlock); |
418 | 9 | } |
419 | | |
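| | // The legacy (pre-CUDA-9.0) stub pushes each argument onto the launch
| | // stack at its aligned offset and then launches through the stub address.
| | // An illustrative sketch (hypothetical kernel `kern`; `p_offset` stands
| | // for the aligned offset computed below):
| | // \code
| | // void __device_stub__kern(int a, float *p) {
| | //   if (cudaSetupArgument(&a, sizeof(a), 0) != 0) return;
| | //   if (cudaSetupArgument(&p, sizeof(p), p_offset) != 0) return;
| | //   cudaLaunch((char *)kern_handle);
| | // }
| | // \endcode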
420 | | void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF, |
421 | 78 | FunctionArgList &Args) { |
422 | | // Emit a call to cudaSetupArgument for each arg in Args. |
423 | 78 | llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn(); |
424 | 78 | llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end"); |
425 | 78 | CharUnits Offset = CharUnits::Zero(); |
426 | 82 | for (const VarDecl *A : Args) { |
427 | 82 | auto TInfo = CGM.getContext().getTypeInfoInChars(A->getType()); |
428 | 82 | Offset = Offset.alignTo(TInfo.Align); |
429 | 82 | llvm::Value *Args[] = { |
430 | 82 | CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(), |
431 | 82 | VoidPtrTy), |
432 | 82 | llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()), |
433 | 82 | llvm::ConstantInt::get(SizeTy, Offset.getQuantity()), |
434 | 82 | }; |
435 | 82 | llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args); |
436 | 82 | llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0); |
437 | 82 | llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero); |
438 | 82 | llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next"); |
439 | 82 | CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock); |
440 | 82 | CGF.EmitBlock(NextBlock); |
441 | 82 | Offset += TInfo.Width; |
442 | 82 | } |
443 | | |
444 | | // Emit the call to cudaLaunch |
445 | 78 | llvm::FunctionCallee cudaLaunchFn = getLaunchFn(); |
446 | 78 | llvm::Value *Arg = |
447 | 78 | CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn], CharPtrTy); |
448 | 78 | CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg); |
449 | 78 | CGF.EmitBranch(EndBlock); |
450 | | |
451 | 78 | CGF.EmitBlock(EndBlock); |
452 | 78 | } |
453 | | |
454 | | // Replace each use of the original variable Var with the address loaded
455 | | // from the variable ManagedVar, which is populated by the HIP runtime.
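| | // For instance, a direct use `%v = load i32, i32* @Var` becomes
| | // (illustrative IR for an i32 variable):
| | //   %ld.managed = load i32*, i32** @ManagedVar
| | //   %v = load i32, i32* %ld.managed
| | // Constant-expression users are first rebuilt as instructions so that
| | // they can refer to the loaded pointer.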
456 | | static void replaceManagedVar(llvm::GlobalVariable *Var, |
457 | 34 | llvm::GlobalVariable *ManagedVar) { |
458 | 34 | SmallVector<SmallVector<llvm::User *, 8>, 8> WorkList; |
459 | 40 | for (auto &&VarUse : Var->uses()) { |
460 | 40 | WorkList.push_back({VarUse.getUser()}); |
461 | 40 | } |
462 | 120 | while (!WorkList.empty()) { |
463 | 86 | auto &&WorkItem = WorkList.pop_back_val(); |
464 | 86 | auto *U = WorkItem.back(); |
465 | 86 | if (isa<llvm::ConstantExpr>(U)) { |
466 | 46 | for (auto &&UU : U->uses()) { |
467 | 46 | WorkItem.push_back(UU.getUser()); |
468 | 46 | WorkList.push_back(WorkItem); |
469 | 46 | WorkItem.pop_back(); |
470 | 46 | } |
471 | 40 | continue; |
472 | 40 | } |
473 | 46 | if (auto *I = dyn_cast<llvm::Instruction>(U)) { |
474 | 46 | llvm::Value *OldV = Var; |
475 | 46 | llvm::Instruction *NewV = |
476 | 46 | new llvm::LoadInst(Var->getType(), ManagedVar, "ld.managed", false, |
477 | 46 | llvm::Align(Var->getAlignment()), I); |
478 | 46 | WorkItem.pop_back(); |
479 | | // Replace constant expressions directly or indirectly using the managed |
480 | | // variable with instructions. |
481 | 46 | for (auto &&Op : WorkItem) { |
482 | 44 | auto *CE = cast<llvm::ConstantExpr>(Op); |
483 | 44 | auto *NewInst = CE->getAsInstruction(I); |
484 | 44 | NewInst->replaceUsesOfWith(OldV, NewV); |
485 | 44 | OldV = CE; |
486 | 44 | NewV = NewInst; |
487 | 44 | } |
488 | 46 | I->replaceUsesOfWith(OldV, NewV); |
489 | 46 | } else { |
490 | 0 | llvm_unreachable("Invalid use of managed variable"); |
491 | 0 | } |
492 | 46 | } |
493 | 34 | } |
494 | | |
495 | | /// Creates a function that sets up state on the host side for CUDA objects that |
496 | | /// have a presence on both the host and device sides. Specifically, registers |
497 | | /// the host side of kernel functions and device global variables with the CUDA |
498 | | /// runtime. |
499 | | /// \code |
500 | | /// void __cuda_register_globals(void** GpuBinaryHandle) { |
501 | | /// __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...); |
502 | | /// ... |
503 | | /// __cudaRegisterFunction(GpuBinaryHandle,KernelM,...); |
504 | | /// __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...); |
505 | | /// ... |
506 | | /// __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...); |
507 | | /// } |
508 | | /// \endcode |
509 | 51 | llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() { |
510 | | // No need to register anything |
511 | 51 | if (EmittedKernels.empty() && DeviceVars.empty())
512 | 0 | return nullptr; |
513 | | |
514 | 51 | llvm::Function *RegisterKernelsFunc = llvm::Function::Create( |
515 | 51 | getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage, |
516 | 51 | addUnderscoredPrefixToName("_register_globals"), &TheModule); |
517 | 51 | llvm::BasicBlock *EntryBB = |
518 | 51 | llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc); |
519 | 51 | CGBuilderTy Builder(CGM, Context); |
520 | 51 | Builder.SetInsertPoint(EntryBB); |
521 | | |
522 | | // void __cudaRegisterFunction(void **, const char *, char *, const char *, |
523 | | // int, uint3*, uint3*, dim3*, dim3*, int*) |
524 | 51 | llvm::Type *RegisterFuncParams[] = { |
525 | 51 | VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy, |
526 | 51 | VoidPtrTy, VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()}; |
527 | 51 | llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction( |
528 | 51 | llvm::FunctionType::get(IntTy, RegisterFuncParams, false), |
529 | 51 | addUnderscoredPrefixToName("RegisterFunction")); |
530 | | |
531 | | // Extract the GpuBinaryHandle passed as the first argument to
532 | | // __cuda_register_globals() and generate a __cudaRegisterFunction() call
533 | | // for each emitted kernel.
534 | 51 | llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin(); |
535 | 72 | for (auto &&I : EmittedKernels) { |
536 | 72 | llvm::Constant *KernelName = |
537 | 72 | makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D))); |
538 | 72 | llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy); |
539 | 72 | llvm::Value *Args[] = { |
540 | 72 | &GpuBinaryHandlePtr, |
541 | 72 | Builder.CreateBitCast(KernelHandles[I.Kernel], VoidPtrTy), |
542 | 72 | KernelName, |
543 | 72 | KernelName, |
544 | 72 | llvm::ConstantInt::get(IntTy, -1), |
545 | 72 | NullPtr, |
546 | 72 | NullPtr, |
547 | 72 | NullPtr, |
548 | 72 | NullPtr, |
549 | 72 | llvm::ConstantPointerNull::get(IntTy->getPointerTo())}; |
550 | 72 | Builder.CreateCall(RegisterFunc, Args); |
551 | 72 | } |
552 | | |
553 | 51 | llvm::Type *VarSizeTy = IntTy; |
554 | | // For HIP or CUDA 9.0+, the device variable size parameter has type `size_t`.
555 | 51 | if (CGM.getLangOpts().HIP || |
556 | 51 | ToCudaVersion(CGM.getTarget().getSDKVersion()) >= CudaVersion::CUDA_90)
557 | 45 | VarSizeTy = SizeTy; |
558 | | |
559 | | // void __cudaRegisterVar(void **, char *, char *, const char *, |
560 | | // int, int, int, int) |
561 | 51 | llvm::Type *RegisterVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy, |
562 | 51 | CharPtrTy, IntTy, VarSizeTy, |
563 | 51 | IntTy, IntTy}; |
564 | 51 | llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction( |
565 | 51 | llvm::FunctionType::get(VoidTy, RegisterVarParams, false), |
566 | 51 | addUnderscoredPrefixToName("RegisterVar")); |
567 | | // void __hipRegisterManagedVar(void **, char *, char *, const char *, |
568 | | // size_t, unsigned) |
569 | 51 | llvm::Type *RegisterManagedVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy, |
570 | 51 | CharPtrTy, VarSizeTy, IntTy}; |
571 | 51 | llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction( |
572 | 51 | llvm::FunctionType::get(VoidTy, RegisterManagedVarParams, false), |
573 | 51 | addUnderscoredPrefixToName("RegisterManagedVar")); |
574 | | // void __cudaRegisterSurface(void **, const struct surfaceReference *, |
575 | | // const void **, const char *, int, int); |
576 | 51 | llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction( |
577 | 51 | llvm::FunctionType::get( |
578 | 51 | VoidTy, {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy}, |
579 | 51 | false), |
580 | 51 | addUnderscoredPrefixToName("RegisterSurface")); |
581 | | // void __cudaRegisterTexture(void **, const struct textureReference *, |
582 | | // const void **, const char *, int, int, int) |
583 | 51 | llvm::FunctionCallee RegisterTex = CGM.CreateRuntimeFunction( |
584 | 51 | llvm::FunctionType::get( |
585 | 51 | VoidTy, |
586 | 51 | {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy, IntTy}, |
587 | 51 | false), |
588 | 51 | addUnderscoredPrefixToName("RegisterTexture")); |
589 | 118 | for (auto &&Info : DeviceVars) { |
590 | 118 | llvm::GlobalVariable *Var = Info.Var; |
591 | 118 | assert((!Var->isDeclaration() || Info.Flags.isManaged()) && |
592 | 118 | "External variables should not show up here, except HIP managed " |
593 | 118 | "variables"); |
594 | 0 | llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D)); |
595 | 118 | switch (Info.Flags.getKind()) { |
596 | 118 | case DeviceVarFlags::Variable: { |
597 | 118 | uint64_t VarSize = |
598 | 118 | CGM.getDataLayout().getTypeAllocSize(Var->getValueType()); |
599 | 118 | if (Info.Flags.isManaged()) { |
600 | 16 | auto *ManagedVar = new llvm::GlobalVariable( |
601 | 16 | CGM.getModule(), Var->getType(), |
602 | 16 | /*isConstant=*/false, Var->getLinkage(), |
603 | 16 | /*Init=*/Var->isDeclaration() |
604 | 16 | ? nullptr
605 | 16 | : llvm::ConstantPointerNull::get(Var->getType()),
606 | 16 | /*Name=*/"", /*InsertBefore=*/nullptr, |
607 | 16 | llvm::GlobalVariable::NotThreadLocal); |
608 | 16 | ManagedVar->setDSOLocal(Var->isDSOLocal()); |
609 | 16 | ManagedVar->setVisibility(Var->getVisibility()); |
610 | 16 | ManagedVar->setExternallyInitialized(true); |
611 | 16 | ManagedVar->takeName(Var); |
612 | 16 | Var->setName(Twine(ManagedVar->getName() + ".managed")); |
613 | 16 | replaceManagedVar(Var, ManagedVar); |
614 | 16 | llvm::Value *Args[] = { |
615 | 16 | &GpuBinaryHandlePtr, |
616 | 16 | Builder.CreateBitCast(ManagedVar, VoidPtrTy), |
617 | 16 | Builder.CreateBitCast(Var, VoidPtrTy), |
618 | 16 | VarName, |
619 | 16 | llvm::ConstantInt::get(VarSizeTy, VarSize), |
620 | 16 | llvm::ConstantInt::get(IntTy, Var->getAlignment())}; |
621 | 16 | if (!Var->isDeclaration()) |
622 | 12 | Builder.CreateCall(RegisterManagedVar, Args); |
623 | 102 | } else { |
624 | 102 | llvm::Value *Args[] = { |
625 | 102 | &GpuBinaryHandlePtr, |
626 | 102 | Builder.CreateBitCast(Var, VoidPtrTy), |
627 | 102 | VarName, |
628 | 102 | VarName, |
629 | 102 | llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()), |
630 | 102 | llvm::ConstantInt::get(VarSizeTy, VarSize), |
631 | 102 | llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()), |
632 | 102 | llvm::ConstantInt::get(IntTy, 0)}; |
633 | 102 | Builder.CreateCall(RegisterVar, Args); |
634 | 102 | } |
635 | 118 | break; |
636 | 0 | } |
637 | 0 | case DeviceVarFlags::Surface: |
638 | 0 | Builder.CreateCall( |
639 | 0 | RegisterSurf, |
640 | 0 | {&GpuBinaryHandlePtr, Builder.CreateBitCast(Var, VoidPtrTy), VarName, |
641 | 0 | VarName, llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()), |
642 | 0 | llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())}); |
643 | 0 | break; |
644 | 0 | case DeviceVarFlags::Texture: |
645 | 0 | Builder.CreateCall( |
646 | 0 | RegisterTex, |
647 | 0 | {&GpuBinaryHandlePtr, Builder.CreateBitCast(Var, VoidPtrTy), VarName, |
648 | 0 | VarName, llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()), |
649 | 0 | llvm::ConstantInt::get(IntTy, Info.Flags.isNormalized()), |
650 | 0 | llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())}); |
651 | 0 | break; |
652 | 118 | } |
653 | 118 | } |
654 | | |
655 | 51 | Builder.CreateRetVoid(); |
656 | 51 | return RegisterKernelsFunc; |
657 | 51 | } |
658 | | |
659 | | /// Creates a global constructor function for the module: |
660 | | /// |
661 | | /// For CUDA: |
662 | | /// \code |
663 | | /// void __cuda_module_ctor() { |
664 | | /// Handle = __cudaRegisterFatBinary(GpuBinaryBlob); |
665 | | /// __cuda_register_globals(Handle); |
666 | | /// } |
667 | | /// \endcode |
668 | | /// |
669 | | /// For HIP: |
670 | | /// \code |
671 | | /// void __hip_module_ctor() { |
672 | | /// if (__hip_gpubin_handle == 0) { |
673 | | /// __hip_gpubin_handle = __hipRegisterFatBinary(GpuBinaryBlob); |
674 | | /// __hip_register_globals(__hip_gpubin_handle); |
675 | | /// } |
676 | | /// } |
677 | | /// \endcode |
678 | 71 | llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { |
679 | 71 | bool IsHIP = CGM.getLangOpts().HIP; |
680 | 71 | bool IsCUDA = CGM.getLangOpts().CUDA; |
681 | | // No need to generate ctors/dtors if there is no GPU binary. |
682 | 71 | StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName; |
683 | 71 | if (CudaGpuBinaryFileName.empty() && !IsHIP)
684 | 13 | return nullptr; |
685 | 58 | if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() &&
686 | 58 | DeviceVars.empty())
687 | 7 | return nullptr; |
688 | | |
689 | | // void __{cuda|hip}_register_globals(void* handle); |
690 | 51 | llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn(); |
691 | | // We always need a function to pass in as callback. Create a dummy |
692 | | // implementation if we don't need to register anything. |
693 | 51 | if (RelocatableDeviceCode && !RegisterGlobalsFunc)
694 | 0 | RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy()); |
695 | | |
696 | | // void ** __{cuda|hip}RegisterFatBinary(void *); |
697 | 51 | llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction( |
698 | 51 | llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false), |
699 | 51 | addUnderscoredPrefixToName("RegisterFatBinary")); |
700 | | // struct { int magic; int version; void *gpu_binary; void *dont_care; };
701 | 51 | llvm::StructType *FatbinWrapperTy = |
702 | 51 | llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy); |
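| | // For HIP, the wrapper global built below looks roughly like this
| | // (illustrative; the CUDA variant differs only in names and magic):
| | //   @__hip_fatbin_wrapper = constant { i32, i32, i8*, i8* }
| | //     { i32 0x48495046, i32 1, i8* @__hip_fatbin, i8* null },
| | //     section ".hipFatBinSegment"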
703 | | |
704 | | // Register GPU binary with the CUDA runtime, store returned handle in a |
705 | | // global variable and save a reference in GpuBinaryHandle to be cleaned up |
706 | | // in destructor on exit. Then associate all known kernels with the GPU binary |
707 | | // handle so CUDA runtime can figure out what to call on the GPU side. |
708 | 51 | std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr; |
709 | 51 | if (!CudaGpuBinaryFileName.empty()) { |
710 | 20 | llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CudaGpuBinaryOrErr = |
711 | 20 | llvm::MemoryBuffer::getFileOrSTDIN(CudaGpuBinaryFileName); |
712 | 20 | if (std::error_code EC = CudaGpuBinaryOrErr.getError()) { |
713 | 0 | CGM.getDiags().Report(diag::err_cannot_open_file) |
714 | 0 | << CudaGpuBinaryFileName << EC.message(); |
715 | 0 | return nullptr; |
716 | 0 | } |
717 | 20 | CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get()); |
718 | 20 | } |
719 | | |
720 | 51 | llvm::Function *ModuleCtorFunc = llvm::Function::Create( |
721 | 51 | llvm::FunctionType::get(VoidTy, false), |
722 | 51 | llvm::GlobalValue::InternalLinkage, |
723 | 51 | addUnderscoredPrefixToName("_module_ctor"), &TheModule); |
724 | 51 | llvm::BasicBlock *CtorEntryBB = |
725 | 51 | llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc); |
726 | 51 | CGBuilderTy CtorBuilder(CGM, Context); |
727 | | |
728 | 51 | CtorBuilder.SetInsertPoint(CtorEntryBB); |
729 | | |
730 | 51 | const char *FatbinConstantName; |
731 | 51 | const char *FatbinSectionName; |
732 | 51 | const char *ModuleIDSectionName; |
733 | 51 | StringRef ModuleIDPrefix; |
734 | 51 | llvm::Constant *FatBinStr; |
735 | 51 | unsigned FatMagic; |
736 | 51 | if (IsHIP) { |
737 | 41 | FatbinConstantName = ".hip_fatbin"; |
738 | 41 | FatbinSectionName = ".hipFatBinSegment"; |
739 | | |
740 | 41 | ModuleIDSectionName = "__hip_module_id"; |
741 | 41 | ModuleIDPrefix = "__hip_"; |
742 | | |
743 | 41 | if (CudaGpuBinary) { |
744 | | // If fatbin is available from early finalization, create a string |
745 | | // literal containing the fat binary loaded from the given file. |
746 | 10 | const unsigned HIPCodeObjectAlign = 4096; |
747 | 10 | FatBinStr = |
748 | 10 | makeConstantString(std::string(CudaGpuBinary->getBuffer()), "", |
749 | 10 | FatbinConstantName, HIPCodeObjectAlign); |
750 | 31 | } else { |
751 | | // If fatbin is not available, create an external symbol |
752 | | // __hip_fatbin in section .hip_fatbin. The external symbol is supposed |
753 | | // to contain the fat binary but will be populated somewhere else, |
754 | | // e.g. by lld through a linker script.
755 | 31 | FatBinStr = new llvm::GlobalVariable( |
756 | 31 | CGM.getModule(), CGM.Int8Ty, |
757 | 31 | /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr, |
758 | 31 | "__hip_fatbin", nullptr, |
759 | 31 | llvm::GlobalVariable::NotThreadLocal); |
760 | 31 | cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName); |
761 | 31 | } |
762 | | |
763 | 41 | FatMagic = HIPFatMagic; |
764 | 41 | } else { |
765 | 10 | if (RelocatableDeviceCode) |
766 | 4 | FatbinConstantName = CGM.getTriple().isMacOSX() |
767 | 4 | ? "__NV_CUDA,__nv_relfatbin"0 |
768 | 4 | : "__nv_relfatbin"; |
769 | 6 | else |
770 | 6 | FatbinConstantName = |
771 | 6 | CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
772 | | // NVIDIA's cuobjdump looks for fatbins in this section. |
773 | 10 | FatbinSectionName = |
774 | 10 | CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";
775 | | |
776 | 10 | ModuleIDSectionName = CGM.getTriple().isMacOSX() |
777 | 10 | ? "__NV_CUDA,__nv_module_id"0 |
778 | 10 | : "__nv_module_id"; |
779 | 10 | ModuleIDPrefix = "__nv_"; |
780 | | |
781 | | // For CUDA, create a string literal containing the fat binary loaded from |
782 | | // the given file. |
783 | 10 | FatBinStr = makeConstantString(std::string(CudaGpuBinary->getBuffer()), "", |
784 | 10 | FatbinConstantName, 8); |
785 | 10 | FatMagic = CudaFatMagic; |
786 | 10 | } |
787 | | |
788 | | // Create initialized wrapper structure that points to the loaded GPU binary |
789 | 51 | ConstantInitBuilder Builder(CGM); |
790 | 51 | auto Values = Builder.beginStruct(FatbinWrapperTy); |
791 | | // Fatbin wrapper magic. |
792 | 51 | Values.addInt(IntTy, FatMagic); |
793 | | // Fatbin version. |
794 | 51 | Values.addInt(IntTy, 1); |
795 | | // Data. |
796 | 51 | Values.add(FatBinStr); |
797 | | // Unused in fatbin v1. |
798 | 51 | Values.add(llvm::ConstantPointerNull::get(VoidPtrTy)); |
799 | 51 | llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal( |
800 | 51 | addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(), |
801 | 51 | /*constant*/ true); |
802 | 51 | FatbinWrapper->setSection(FatbinSectionName); |
803 | | |
804 | | // There is only one HIP fat binary per linked module; however, there are
805 | | // multiple constructor functions. Make sure the fat binary is registered |
806 | | // only once. The constructor functions are executed by the dynamic loader |
807 | | // before the program gains control. The dynamic loader cannot execute the |
808 | | // constructor functions concurrently since doing that would not guarantee |
809 | | // thread safety of the loaded program. Therefore we can assume sequential |
810 | | // execution of constructor functions here. |
811 | 51 | if (IsHIP) { |
812 | 41 | auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage :
813 | 41 | llvm::GlobalValue::LinkOnceAnyLinkage;
814 | 41 | llvm::BasicBlock *IfBlock = |
815 | 41 | llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc); |
816 | 41 | llvm::BasicBlock *ExitBlock = |
817 | 41 | llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc); |
818 | | // The name, size, and initialization pattern of this variable are part
819 | | // of the HIP ABI.
820 | 41 | GpuBinaryHandle = new llvm::GlobalVariable( |
821 | 41 | TheModule, VoidPtrPtrTy, /*isConstant=*/false, |
822 | 41 | Linkage, |
823 | 41 | /*Initializer=*/llvm::ConstantPointerNull::get(VoidPtrPtrTy), |
824 | 41 | "__hip_gpubin_handle"); |
825 | 41 | if (Linkage == llvm::GlobalValue::LinkOnceAnyLinkage) |
826 | 31 | GpuBinaryHandle->setComdat( |
827 | 31 | CGM.getModule().getOrInsertComdat(GpuBinaryHandle->getName())); |
828 | 41 | GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign()); |
829 | | // Prevent the weak symbol in different shared libraries from being merged.
830 | 41 | if (Linkage != llvm::GlobalValue::InternalLinkage) |
831 | 31 | GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility); |
832 | 41 | Address GpuBinaryAddr( |
833 | 41 | GpuBinaryHandle, VoidPtrPtrTy, |
834 | 41 | CharUnits::fromQuantity(GpuBinaryHandle->getAlignment())); |
835 | 41 | { |
836 | 41 | auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr); |
837 | 41 | llvm::Constant *Zero = |
838 | 41 | llvm::Constant::getNullValue(HandleValue->getType()); |
839 | 41 | llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero); |
840 | 41 | CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock); |
841 | 41 | } |
842 | 41 | { |
843 | 41 | CtorBuilder.SetInsertPoint(IfBlock); |
844 | | // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper); |
845 | 41 | llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall( |
846 | 41 | RegisterFatbinFunc, |
847 | 41 | CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy)); |
848 | 41 | CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr); |
849 | 41 | CtorBuilder.CreateBr(ExitBlock); |
850 | 41 | } |
851 | 41 | { |
852 | 41 | CtorBuilder.SetInsertPoint(ExitBlock); |
853 | | // Call __hip_register_globals(GpuBinaryHandle); |
854 | 41 | if (RegisterGlobalsFunc) { |
855 | 41 | auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr); |
856 | 41 | CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue); |
857 | 41 | } |
858 | 41 | } |
859 | 41 | } else if (!RelocatableDeviceCode) {
860 | | // Register binary with CUDA runtime. This is substantially different in |
861 | | // default mode vs. separate compilation! |
862 | | // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper); |
863 | 6 | llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall( |
864 | 6 | RegisterFatbinFunc, |
865 | 6 | CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy)); |
866 | 6 | GpuBinaryHandle = new llvm::GlobalVariable( |
867 | 6 | TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage, |
868 | 6 | llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle"); |
869 | 6 | GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign()); |
870 | 6 | CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle, |
871 | 6 | CGM.getPointerAlign()); |
872 | | |
873 | | // Call __cuda_register_globals(GpuBinaryHandle); |
874 | 6 | if (RegisterGlobalsFunc) |
875 | 6 | CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall); |
876 | | |
877 | | // Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it. |
878 | 6 | if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(), |
879 | 6 | CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) { |
880 | | // void __cudaRegisterFatBinaryEnd(void **); |
881 | 0 | llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction( |
882 | 0 | llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false), |
883 | 0 | "__cudaRegisterFatBinaryEnd"); |
884 | 0 | CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall); |
885 | 0 | } |
886 | 6 | } else { |
887 | | // Generate a unique module ID. |
888 | 4 | SmallString<64> ModuleID; |
889 | 4 | llvm::raw_svector_ostream OS(ModuleID); |
890 | 4 | OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID()); |
891 | 4 | llvm::Constant *ModuleIDConstant = makeConstantString( |
892 | 4 | std::string(ModuleID.str()), "", ModuleIDSectionName, 32); |
893 | | |
894 | | // Create an alias for the FatbinWrapper that nvcc will look for. |
895 | 4 | llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage, |
896 | 4 | Twine("__fatbinwrap") + ModuleID, FatbinWrapper); |
897 | | |
898 | | // void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *, |
899 | | // void *, void (*)(void **)) |
900 | 4 | SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary"); |
901 | 4 | RegisterLinkedBinaryName += ModuleID; |
902 | 4 | llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction( |
903 | 4 | getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName); |
904 | | |
905 | 4 | assert(RegisterGlobalsFunc && "Expecting at least dummy function!"); |
906 | 0 | llvm::Value *Args[] = {RegisterGlobalsFunc, |
907 | 4 | CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy), |
908 | 4 | ModuleIDConstant, |
909 | 4 | makeDummyFunction(getCallbackFnTy())}; |
910 | 4 | CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args); |
911 | 4 | } |
912 | | |
913 | | // Create the destructor and register it with atexit() the way NVCC does.
914 | | // Running it during the regular destructor phase worked in CUDA before 9.2
915 | | // but results in a double-free in 9.2.
916 | 51 | if (llvm::Function *CleanupFn = makeModuleDtorFunction()) { |
917 | | // extern "C" int atexit(void (*f)(void)); |
918 | 47 | llvm::FunctionType *AtExitTy = |
919 | 47 | llvm::FunctionType::get(IntTy, CleanupFn->getType(), false); |
920 | 47 | llvm::FunctionCallee AtExitFunc = |
921 | 47 | CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(), |
922 | 47 | /*Local=*/true); |
923 | 47 | CtorBuilder.CreateCall(AtExitFunc, CleanupFn); |
924 | 47 | } |
925 | | |
926 | 51 | CtorBuilder.CreateRetVoid(); |
927 | 51 | return ModuleCtorFunc; |
928 | 51 | } |
929 | | |
930 | | /// Creates a global destructor function that unregisters the GPU code blob |
931 | | /// registered by the constructor.
932 | | /// |
933 | | /// For CUDA: |
934 | | /// \code |
935 | | /// void __cuda_module_dtor() { |
936 | | /// __cudaUnregisterFatBinary(Handle); |
937 | | /// } |
938 | | /// \endcode |
939 | | /// |
940 | | /// For HIP: |
941 | | /// \code |
942 | | /// void __hip_module_dtor() { |
943 | | /// if (__hip_gpubin_handle) { |
944 | | /// __hipUnregisterFatBinary(__hip_gpubin_handle); |
945 | | /// __hip_gpubin_handle = 0; |
946 | | /// } |
947 | | /// } |
948 | | /// \endcode |
949 | 51 | llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() { |
950 | | // No need for a destructor if we don't have a handle to unregister.
951 | 51 | if (!GpuBinaryHandle) |
952 | 4 | return nullptr; |
953 | | |
954 | | // void __cudaUnregisterFatBinary(void ** handle); |
955 | 47 | llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction( |
956 | 47 | llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false), |
957 | 47 | addUnderscoredPrefixToName("UnregisterFatBinary")); |
958 | | |
959 | 47 | llvm::Function *ModuleDtorFunc = llvm::Function::Create( |
960 | 47 | llvm::FunctionType::get(VoidTy, false), |
961 | 47 | llvm::GlobalValue::InternalLinkage, |
962 | 47 | addUnderscoredPrefixToName("_module_dtor"), &TheModule); |
963 | | |
964 | 47 | llvm::BasicBlock *DtorEntryBB = |
965 | 47 | llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc); |
966 | 47 | CGBuilderTy DtorBuilder(CGM, Context); |
967 | 47 | DtorBuilder.SetInsertPoint(DtorEntryBB); |
968 | | |
969 | 47 | Address GpuBinaryAddr( |
970 | 47 | GpuBinaryHandle, GpuBinaryHandle->getValueType(), |
971 | 47 | CharUnits::fromQuantity(GpuBinaryHandle->getAlignment())); |
972 | 47 | auto *HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr); |
973 | | // There is only one HIP fat binary per linked module; however, there are
974 | | // multiple destructor functions. Make sure the fat binary is unregistered |
975 | | // only once. |
976 | 47 | if (CGM.getLangOpts().HIP) { |
977 | 41 | llvm::BasicBlock *IfBlock = |
978 | 41 | llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc); |
979 | 41 | llvm::BasicBlock *ExitBlock = |
980 | 41 | llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc); |
981 | 41 | llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType()); |
982 | 41 | llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero); |
983 | 41 | DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock); |
984 | | |
985 | 41 | DtorBuilder.SetInsertPoint(IfBlock); |
986 | 41 | DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue); |
987 | 41 | DtorBuilder.CreateStore(Zero, GpuBinaryAddr); |
988 | 41 | DtorBuilder.CreateBr(ExitBlock); |
989 | | |
990 | 41 | DtorBuilder.SetInsertPoint(ExitBlock); |
991 | 41 | } else { |
992 | 6 | DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue); |
993 | 6 | } |
994 | 47 | DtorBuilder.CreateRetVoid(); |
995 | 47 | return ModuleDtorFunc; |
996 | 51 | } |
997 | | |
998 | 216 | CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) { |
999 | 216 | return new CGNVCUDARuntime(CGM); |
1000 | 216 | } |
1001 | | |
1002 | | void CGNVCUDARuntime::internalizeDeviceSideVar( |
1003 | 186 | const VarDecl *D, llvm::GlobalValue::LinkageTypes &Linkage) { |
1004 | | // For -fno-gpu-rdc, host-side shadows of external declarations of device-side |
1005 | | // global variables become internal definitions. These have to be internal in |
1006 | | // order to prevent name conflicts with global host variables with the same |
1007 | | // name in different TUs.
1008 | | // |
1009 | | // For -fgpu-rdc, the shadow variables should not be internalized because |
1010 | | // they may be accessed by a different TU.
1011 | 186 | if (CGM.getLangOpts().GPURelocatableDeviceCode) |
1012 | 54 | return; |
1013 | | |
1014 | | // __shared__ variables are odd. Shadows do get created, but |
1015 | | // they are not registered with the CUDA runtime, so they |
1016 | | // can't really be used to access their device-side |
1017 | | // counterparts. It's not clear yet whether this is an nvcc bug or a
1018 | | // feature, but we've got to do the same for compatibility.
1019 | 132 | if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
1020 | 132 | D->hasAttr<CUDASharedAttr>() ||
1021 | 132 | D->getType()->isCUDADeviceBuiltinSurfaceType() ||
1022 | 132 | D->getType()->isCUDADeviceBuiltinTextureType()) {
1023 | 99 | Linkage = llvm::GlobalValue::InternalLinkage; |
1024 | 99 | } |
1025 | 132 | } |
1026 | | |
1027 | | void CGNVCUDARuntime::handleVarRegistration(const VarDecl *D, |
1028 | 371 | llvm::GlobalVariable &GV) { |
1029 | 371 | if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>()) {
1030 | | // Shadow variables and their properties must be registered with CUDA |
1031 | | // runtime. Skip Extern global variables, which will be registered in |
1032 | | // the TU where they are defined. |
1033 | | // |
1034 | | // Don't register a C++17 inline variable. The local symbol can be |
1035 | | // discarded and referencing a discarded local symbol from outside the |
1036 | | // comdat (__cuda_register_globals) is disallowed by the ELF spec. |
1037 | | // |
1038 | | // HIP managed variables need to be always recorded in device and host |
1039 | | // compilations for transformation. |
1040 | | // |
1041 | | // HIP managed variables and variables in CUDADeviceVarODRUsedByHost are |
1042 | | // added to llvm.compiler.used, therefore they are safe to be registered.
1043 | 306 | if ((!D->hasExternalStorage() && !D->isInline()) ||
1044 | 306 | CGM.getContext().CUDADeviceVarODRUsedByHost.contains(D) ||
1045 | 306 | D->hasAttr<HIPManagedAttr>()) {
1046 | 302 | registerDeviceVar(D, GV, !D->hasDefinition(), |
1047 | 302 | D->hasAttr<CUDAConstantAttr>()); |
1048 | 302 | } |
1049 | 306 | } else if (D->getType()->isCUDADeviceBuiltinSurfaceType() ||
1050 | 65 | D->getType()->isCUDADeviceBuiltinTextureType()) { |
1051 | | // Builtin surfaces and textures and their template arguments are |
1052 | | // also registered with CUDA runtime. |
1053 | 0 | const auto *TD = cast<ClassTemplateSpecializationDecl>( |
1054 | 0 | D->getType()->castAs<RecordType>()->getDecl()); |
1055 | 0 | const TemplateArgumentList &Args = TD->getTemplateArgs(); |
1056 | 0 | if (TD->hasAttr<CUDADeviceBuiltinSurfaceTypeAttr>()) { |
1057 | 0 | assert(Args.size() == 2 && |
1058 | 0 | "Unexpected number of template arguments of CUDA device " |
1059 | 0 | "builtin surface type."); |
1060 | 0 | auto SurfType = Args[1].getAsIntegral(); |
1061 | 0 | if (!D->hasExternalStorage()) |
1062 | 0 | registerDeviceSurf(D, GV, !D->hasDefinition(), SurfType.getSExtValue()); |
1063 | 0 | } else { |
1064 | 0 | assert(Args.size() == 3 && |
1065 | 0 | "Unexpected number of template arguments of CUDA device " |
1066 | 0 | "builtin texture type."); |
1067 | 0 | auto TexType = Args[1].getAsIntegral(); |
1068 | 0 | auto Normalized = Args[2].getAsIntegral(); |
1069 | 0 | if (!D->hasExternalStorage()) |
1070 | 0 | registerDeviceTex(D, GV, !D->hasDefinition(), TexType.getSExtValue(), |
1071 | 0 | Normalized.getZExtValue()); |
1072 | 0 | } |
1073 | 0 | } |
1074 | 371 | } |
1075 | | |
1076 | | // Transform managed variables to pointers to managed variables in device code. |
1077 | | // Each use of the original managed variable is replaced by a load from the |
1078 | | // transformed managed variable. The transformed managed variable contains |
1079 | | // the address of managed memory which will be allocated by the runtime. |
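| | // E.g. for `__managed__ int x;` the device-side IR becomes, roughly
| | // (illustrative; address spaces elided):
| | //   @x.managed = global i32 0                     ; original storage
| | //   @x = externally_initialized global i32* null  ; set by the runtime
| | // and every use of x first loads the pointer from @x.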
1080 | 128 | void CGNVCUDARuntime::transformManagedVars() { |
1081 | 171 | for (auto &&Info : DeviceVars) { |
1082 | 171 | llvm::GlobalVariable *Var = Info.Var; |
1083 | 171 | if (Info.Flags.getKind() == DeviceVarFlags::Variable && |
1084 | 171 | Info.Flags.isManaged()) { |
1085 | 18 | auto *ManagedVar = new llvm::GlobalVariable( |
1086 | 18 | CGM.getModule(), Var->getType(), |
1087 | 18 | /*isConstant=*/false, Var->getLinkage(), |
1088 | 18 | /*Init=*/Var->isDeclaration() |
1089 | 18 | ? nullptr
1090 | 18 | : llvm::ConstantPointerNull::get(Var->getType()),
1091 | 18 | /*Name=*/"", /*InsertBefore=*/nullptr, |
1092 | 18 | llvm::GlobalVariable::NotThreadLocal, |
1093 | 18 | CGM.getContext().getTargetAddressSpace(LangAS::cuda_device)); |
1094 | 18 | ManagedVar->setDSOLocal(Var->isDSOLocal()); |
1095 | 18 | ManagedVar->setVisibility(Var->getVisibility()); |
1096 | 18 | ManagedVar->setExternallyInitialized(true); |
1097 | 18 | replaceManagedVar(Var, ManagedVar); |
1098 | 18 | ManagedVar->takeName(Var); |
1099 | 18 | Var->setName(Twine(ManagedVar->getName()) + ".managed"); |
1100 | | // Keep managed variables even if they are not used in device code since |
1101 | | // they need to be allocated by the runtime. |
1102 | 18 | if (!Var->isDeclaration()) { |
1103 | 14 | assert(!ManagedVar->isDeclaration()); |
1104 | 0 | CGM.addCompilerUsedGlobal(Var); |
1105 | 14 | CGM.addCompilerUsedGlobal(ManagedVar); |
1106 | 14 | } |
1107 | 18 | } |
1108 | 171 | } |
1109 | 128 | } |
1110 | | |
1111 | | // Creates offloading entries for all the kernels and globals that must be |
1112 | | // registered. The linker will provide a pointer to this section so we can |
1113 | | // register the symbols with the linked device image. |
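| | // Each entry is a small descriptor (address, device-side name, size,
| | // flags) emitted into the section chosen below; the exact record layout
| | // is defined by OMPBuilder.emitOffloadingEntry().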
1114 | 2 | void CGNVCUDARuntime::createOffloadingEntries() { |
1115 | 2 | llvm::OpenMPIRBuilder OMPBuilder(CGM.getModule()); |
1116 | 2 | OMPBuilder.initialize(); |
1117 | | |
1118 | 2 | StringRef Section = CGM.getLangOpts().HIP ? "hip_offloading_entries"
1119 | 2 | : "cuda_offloading_entries";
1120 | 2 | for (KernelInfo &I : EmittedKernels) |
1121 | 4 | OMPBuilder.emitOffloadingEntry(KernelHandles[I.Kernel], |
1122 | 4 | getDeviceSideName(cast<NamedDecl>(I.D)), 0, |
1123 | 4 | DeviceVarFlags::OffloadGlobalEntry, Section); |
1124 | | |
1125 | 2 | for (VarInfo &I : DeviceVars) { |
1126 | 2 | uint64_t VarSize = |
1127 | 2 | CGM.getDataLayout().getTypeAllocSize(I.Var->getValueType()); |
1128 | 2 | if (I.Flags.getKind() == DeviceVarFlags::Variable) { |
1129 | 2 | OMPBuilder.emitOffloadingEntry( |
1130 | 2 | I.Var, getDeviceSideName(I.D), VarSize, |
1131 | 2 | I.Flags.isManaged() ? DeviceVarFlags::OffloadGlobalManagedEntry
1132 | 2 | : DeviceVarFlags::OffloadGlobalEntry, |
1133 | 2 | Section); |
1134 | 2 | } else if (I.Flags.getKind() == DeviceVarFlags::Surface) {
1135 | 0 | OMPBuilder.emitOffloadingEntry(I.Var, getDeviceSideName(I.D), VarSize, |
1136 | 0 | DeviceVarFlags::OffloadGlobalSurfaceEntry, |
1137 | 0 | Section); |
1138 | 0 | } else if (I.Flags.getKind() == DeviceVarFlags::Texture) { |
1139 | 0 | OMPBuilder.emitOffloadingEntry(I.Var, getDeviceSideName(I.D), VarSize, |
1140 | 0 | DeviceVarFlags::OffloadGlobalTextureEntry, |
1141 | 0 | Section); |
1142 | 0 | } |
1143 | 2 | } |
1144 | 2 | } |
1145 | | |
1146 | | // Returns module constructor to be added. |
1147 | 201 | llvm::Function *CGNVCUDARuntime::finalizeModule() { |
1148 | 201 | if (CGM.getLangOpts().CUDAIsDevice) { |
1149 | 128 | transformManagedVars(); |
1150 | | |
1151 | | // Mark ODR-used device variables as compiler-used to prevent them from being
1152 | | // eliminated by optimization. This is necessary for device variables |
1153 | | // ODR-used by host functions. Sema correctly marks them as ODR-used no |
1154 | | // matter whether they are ODR-used by device or host functions. |
1155 | | // |
1156 | | // We do not need to do this if the variable has the used attribute, since it
1157 | | // has already been added. |
1158 | | // |
1159 | | // Static device variables have been externalized at this point, therefore |
1160 | | // variables with LLVM private or internal linkage need not be added. |
1161 | 171 | for (auto &&Info : DeviceVars) { |
1162 | 171 | auto Kind = Info.Flags.getKind(); |
1163 | 171 | if (!Info.Var->isDeclaration() && |
1164 | 171 | !llvm::GlobalValue::isLocalLinkage(Info.Var->getLinkage()) &&
1165 | 171 | (Kind == DeviceVarFlags::Variable ||
1166 | 132 | Kind == DeviceVarFlags::Surface ||
1167 | 132 | Kind == DeviceVarFlags::Texture) &&
1168 | 171 | Info.D->isUsed() && !Info.D->hasAttr<UsedAttr>()) {
1169 | 82 | CGM.addCompilerUsedGlobal(Info.Var); |
1170 | 82 | } |
1171 | 171 | } |
1172 | 128 | return nullptr; |
1173 | 128 | } |
1174 | 73 | if (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode)
1175 | 2 | createOffloadingEntries(); |
1176 | 71 | else |
1177 | 71 | return makeModuleCtorFunction(); |
1178 | | |
1179 | 2 | return nullptr; |
1180 | 73 | } |
1181 | | |
1182 | | llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F, |
1183 | 186 | GlobalDecl GD) { |
1184 | 186 | auto Loc = KernelHandles.find(F); |
1185 | 186 | if (Loc != KernelHandles.end()) |
1186 | 95 | return Loc->second; |
1187 | | |
1188 | 91 | if (!CGM.getLangOpts().HIP) { |
1189 | 25 | KernelHandles[F] = F; |
1190 | 25 | KernelStubs[F] = F; |
1191 | 25 | return F; |
1192 | 25 | } |
1193 | | |
1194 | 66 | auto *Var = new llvm::GlobalVariable( |
1195 | 66 | TheModule, F->getType(), /*isConstant=*/true, F->getLinkage(), |
1196 | 66 | /*Initializer=*/nullptr, |
1197 | 66 | CGM.getMangledName( |
1198 | 66 | GD.getWithKernelReferenceKind(KernelReferenceKind::Kernel))); |
1199 | 66 | Var->setAlignment(CGM.getPointerAlign().getAsAlign()); |
1200 | 66 | Var->setDSOLocal(F->isDSOLocal()); |
1201 | 66 | Var->setVisibility(F->getVisibility()); |
1202 | 66 | CGM.maybeSetTrivialComdat(*GD.getDecl(), *Var); |
1203 | 66 | KernelHandles[F] = Var; |
1204 | 66 | KernelStubs[Var] = F; |
1205 | 66 | return Var; |
1206 | 91 | } |