Coverage Report

Created: 2021-01-26 06:56

/Users/buildslave/jenkins/workspace/coverage/llvm-project/clang/lib/CodeGen/CGCUDANV.cpp
Line
Count
Source (jump to first uncovered line)
1
//===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
// This provides a class for CUDA code generation targeting the NVIDIA CUDA
10
// runtime library.
11
//
12
//===----------------------------------------------------------------------===//
13
14
#include "CGCUDARuntime.h"
15
#include "CodeGenFunction.h"
16
#include "CodeGenModule.h"
17
#include "clang/AST/Decl.h"
18
#include "clang/Basic/Cuda.h"
19
#include "clang/CodeGen/CodeGenABITypes.h"
20
#include "clang/CodeGen/ConstantInitBuilder.h"
21
#include "llvm/IR/BasicBlock.h"
22
#include "llvm/IR/Constants.h"
23
#include "llvm/IR/DerivedTypes.h"
24
#include "llvm/IR/ReplaceConstant.h"
25
#include "llvm/Support/Format.h"
26
27
using namespace clang;
28
using namespace CodeGen;
29
30
namespace {
31
constexpr unsigned CudaFatMagic = 0x466243b1;
32
constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"
33
34
class CGNVCUDARuntime : public CGCUDARuntime {
35
36
private:
37
  llvm::IntegerType *IntTy, *SizeTy;
38
  llvm::Type *VoidTy;
39
  llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy;
40
41
  /// Convenience reference to LLVM Context
42
  llvm::LLVMContext &Context;
43
  /// Convenience reference to the current module
44
  llvm::Module &TheModule;
45
  /// Keeps track of kernel launch stubs emitted in this module
46
  struct KernelInfo {
47
    llvm::Function *Kernel;
48
    const Decl *D;
49
  };
50
  llvm::SmallVector<KernelInfo, 16> EmittedKernels;
51
  struct VarInfo {
52
    llvm::GlobalVariable *Var;
53
    const VarDecl *D;
54
    DeviceVarFlags Flags;
55
  };
56
  llvm::SmallVector<VarInfo, 16> DeviceVars;
57
  /// Keeps track of variable containing handle of GPU binary. Populated by
58
  /// ModuleCtorFunction() and used to create corresponding cleanup calls in
59
  /// ModuleDtorFunction()
60
  llvm::GlobalVariable *GpuBinaryHandle = nullptr;
61
  /// Whether we generate relocatable device code.
62
  bool RelocatableDeviceCode;
63
  /// Mangle context for device.
64
  std::unique_ptr<MangleContext> DeviceMC;
65
66
  llvm::FunctionCallee getSetupArgumentFn() const;
67
  llvm::FunctionCallee getLaunchFn() const;
68
69
  llvm::FunctionType *getRegisterGlobalsFnTy() const;
70
  llvm::FunctionType *getCallbackFnTy() const;
71
  llvm::FunctionType *getRegisterLinkedBinaryFnTy() const;
72
  std::string addPrefixToName(StringRef FuncName) const;
73
  std::string addUnderscoredPrefixToName(StringRef FuncName) const;
74
75
  /// Creates a function to register all kernel stubs generated in this module.
76
  llvm::Function *makeRegisterGlobalsFn();
77
78
  /// Helper function that generates a constant string and returns a pointer to
79
  /// the start of the string.  The result of this function can be used anywhere
80
  /// where the C code specifies const char*.
81
  llvm::Constant *makeConstantString(const std::string &Str,
82
                                     const std::string &Name = "",
83
                                     const std::string &SectionName = "",
84
113
                                     unsigned Alignment = 0) {
85
113
    llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0),
86
113
                               llvm::ConstantInt::get(SizeTy, 0)};
87
113
    auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str());
88
113
    llvm::GlobalVariable *GV =
89
113
        cast<llvm::GlobalVariable>(ConstStr.getPointer());
90
113
    if (!SectionName.empty()) {
91
20
      GV->setSection(SectionName);
92
      // Mark the address as used which make sure that this section isn't
93
      // merged and we will really have it in the object file.
94
20
      GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
95
20
    }
96
113
    if (Alignment)
97
20
      GV->setAlignment(llvm::Align(Alignment));
98
99
113
    return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(),
100
113
                                                ConstStr.getPointer(), Zeros);
101
113
  }
102
103
  /// Helper function that generates an empty dummy function returning void.
104
3
  llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
105
3
    assert(FnTy->getReturnType()->isVoidTy() &&
106
3
           "Can only generate dummy functions returning void!");
107
3
    llvm::Function *DummyFunc = llvm::Function::Create(
108
3
        FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule);
109
110
3
    llvm::BasicBlock *DummyBlock =
111
3
        llvm::BasicBlock::Create(Context, "", DummyFunc);
112
3
    CGBuilderTy FuncBuilder(CGM, Context);
113
3
    FuncBuilder.SetInsertPoint(DummyBlock);
114
3
    FuncBuilder.CreateRetVoid();
115
116
3
    return DummyFunc;
117
3
  }
118
119
  void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
120
  void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
121
  std::string getDeviceSideName(const NamedDecl *ND) override;
122
123
public:
124
  CGNVCUDARuntime(CodeGenModule &CGM);
125
126
  void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
127
  void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var,
128
86
                         bool Extern, bool Constant) override {
129
86
    DeviceVars.push_back({&Var,
130
86
                          VD,
131
86
                          {DeviceVarFlags::Variable, Extern, Constant,
132
86
                           VD->hasAttr<HIPManagedAttr>(),
133
86
                           /*Normalized*/ false, 0}});
134
86
  }
135
  void registerDeviceSurf(const VarDecl *VD, llvm::GlobalVariable &Var,
136
1
                          bool Extern, int Type) override {
137
1
    DeviceVars.push_back({&Var,
138
1
                          VD,
139
1
                          {DeviceVarFlags::Surface, Extern, /*Constant*/ false,
140
1
                           /*Managed*/ false,
141
1
                           /*Normalized*/ false, Type}});
142
1
  }
143
  void registerDeviceTex(const VarDecl *VD, llvm::GlobalVariable &Var,
144
2
                         bool Extern, int Type, bool Normalized) override {
145
2
    DeviceVars.push_back({&Var,
146
2
                          VD,
147
2
                          {DeviceVarFlags::Texture, Extern, /*Constant*/ false,
148
2
                           /*Managed*/ false, Normalized, Type}});
149
2
  }
150
151
  /// Creates module constructor function
152
  llvm::Function *makeModuleCtorFunction() override;
153
  /// Creates module destructor function
154
  llvm::Function *makeModuleDtorFunction() override;
155
};
156
157
}
158
159
51
std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
160
51
  if (CGM.getLangOpts().HIP)
161
32
    return ((Twine("hip") + Twine(FuncName)).str());
162
19
  return ((Twine("cuda") + Twine(FuncName)).str());
163
19
}
164
std::string
165
308
CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
166
308
  if (CGM.getLangOpts().HIP)
167
199
    return ((Twine("__hip") + Twine(FuncName)).str());
168
109
  return ((Twine("__cuda") + Twine(FuncName)).str());
169
109
}
170
171
CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
172
    : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
173
      TheModule(CGM.getModule()),
174
      RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
175
      DeviceMC(CGM.getContext().createMangleContext(
176
181
          CGM.getContext().getAuxTargetInfo())) {
177
181
  CodeGen::CodeGenTypes &Types = CGM.getTypes();
178
181
  ASTContext &Ctx = CGM.getContext();
179
180
181
  IntTy = CGM.IntTy;
181
181
  SizeTy = CGM.SizeTy;
182
181
  VoidTy = CGM.VoidTy;
183
184
181
  CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy));
185
181
  VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy));
186
181
  VoidPtrPtrTy = VoidPtrTy->getPointerTo();
187
181
}
188
189
45
llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const {
190
  // cudaError_t cudaSetupArgument(void *, size_t, size_t)
191
45
  llvm::Type *Params[] = {VoidPtrTy, SizeTy, SizeTy};
192
45
  return CGM.CreateRuntimeFunction(
193
45
      llvm::FunctionType::get(IntTy, Params, false),
194
45
      addPrefixToName("SetupArgument"));
195
45
}
196
197
45
llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const {
198
45
  if (CGM.getLangOpts().HIP) {
199
    // hipError_t hipLaunchByPtr(char *);
200
31
    return CGM.CreateRuntimeFunction(
201
31
        llvm::FunctionType::get(IntTy, CharPtrTy, false), "hipLaunchByPtr");
202
14
  } else {
203
    // cudaError_t cudaLaunch(char *);
204
14
    return CGM.CreateRuntimeFunction(
205
14
        llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch");
206
14
  }
207
45
}
208
209
31
llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const {
210
31
  return llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false);
211
31
}
212
213
6
llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const {
214
6
  return llvm::FunctionType::get(VoidTy, VoidPtrTy, false);
215
6
}
216
217
3
llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const {
218
3
  auto CallbackFnTy = getCallbackFnTy();
219
3
  auto RegisterGlobalsFnTy = getRegisterGlobalsFnTy();
220
3
  llvm::Type *Params[] = {RegisterGlobalsFnTy->getPointerTo(), VoidPtrTy,
221
3
                          VoidPtrTy, CallbackFnTy->getPointerTo()};
222
3
  return llvm::FunctionType::get(VoidTy, Params, false);
223
3
}
224
225
146
std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) {
226
146
  GlobalDecl GD;
227
  // D could be either a kernel or a variable.
228
146
  if (auto *FD = dyn_cast<FunctionDecl>(ND))
229
93
    GD = GlobalDecl(FD, KernelReferenceKind::Kernel);
230
53
  else
231
53
    GD = GlobalDecl(ND);
232
146
  std::string DeviceSideName;
233
146
  if (DeviceMC->shouldMangleDeclName(ND)) {
234
91
    SmallString<256> Buffer;
235
91
    llvm::raw_svector_ostream Out(Buffer);
236
91
    DeviceMC->mangleName(GD, Out);
237
91
    DeviceSideName = std::string(Out.str());
238
91
  } else
239
55
    DeviceSideName = std::string(ND->getIdentifier()->getName());
240
146
  return DeviceSideName;
241
146
}
242
243
void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
244
51
                                     FunctionArgList &Args) {
245
51
  EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl});
246
51
  if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
247
51
                         CudaFeature::CUDA_USES_NEW_LAUNCH) ||
248
46
      (CGF.getLangOpts().HIP &&
32
       CGF.getLangOpts().HIPUseNewLaunchAPI
))
249
6
    emitDeviceStubBodyNew(CGF, Args);
250
45
  else
251
45
    emitDeviceStubBodyLegacy(CGF, Args);
252
51
}
253
254
// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local
255
// array and kernels are launched using cudaLaunchKernel().
256
void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
257
6
                                            FunctionArgList &Args) {
258
  // Build the shadow stack entry at the very start of the function.
259
260
  // Calculate amount of space we will need for all arguments.  If we have no
261
  // args, allocate a single pointer so we still have a valid pointer to the
262
  // argument array that we can pass to runtime, even if it will be unused.
263
6
  Address KernelArgs = CGF.CreateTempAlloca(
264
6
      VoidPtrTy, CharUnits::fromQuantity(16), "kernel_args",
265
6
      llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
266
  // Store pointers to the arguments in a locally allocated launch_args.
267
20
  for (unsigned i = 0; i < Args.size();
14
       ++i
) {
268
14
    llvm::Value* VarPtr = CGF.GetAddrOfLocalVar(Args[i]).getPointer();
269
14
    llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, VoidPtrTy);
270
14
    CGF.Builder.CreateDefaultAlignedStore(
271
14
        VoidVarPtr, CGF.Builder.CreateConstGEP1_32(KernelArgs.getPointer(), i));
272
14
  }
273
274
6
  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
275
276
  // Lookup cudaLaunchKernel/hipLaunchKernel function.
277
  // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
278
  //                              void **args, size_t sharedMem,
279
  //                              cudaStream_t stream);
280
  // hipError_t hipLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
281
  //                            void **args, size_t sharedMem,
282
  //                            hipStream_t stream);
283
6
  TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
284
6
  DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
285
6
  auto LaunchKernelName = addPrefixToName("LaunchKernel");
286
6
  IdentifierInfo &cudaLaunchKernelII =
287
6
      CGM.getContext().Idents.get(LaunchKernelName);
288
6
  FunctionDecl *cudaLaunchKernelFD = nullptr;
289
6
  for (const auto &Result : DC->lookup(&cudaLaunchKernelII)) {
290
6
    if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result))
291
6
      cudaLaunchKernelFD = FD;
292
6
  }
293
294
6
  if (cudaLaunchKernelFD == nullptr) {
295
0
    CGM.Error(CGF.CurFuncDecl->getLocation(),
296
0
              "Can't find declaration for " + LaunchKernelName);
297
0
    return;
298
0
  }
299
  // Create temporary dim3 grid_dim, block_dim.
300
6
  ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1);
301
6
  QualType Dim3Ty = GridDimParam->getType();
302
6
  Address GridDim =
303
6
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim");
304
6
  Address BlockDim =
305
6
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim");
306
6
  Address ShmemSize =
307
6
      CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size");
308
6
  Address Stream =
309
6
      CGF.CreateTempAlloca(VoidPtrTy, CGM.getPointerAlign(), "stream");
310
6
  llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction(
311
6
      llvm::FunctionType::get(IntTy,
312
6
                              {/*gridDim=*/GridDim.getType(),
313
6
                               /*blockDim=*/BlockDim.getType(),
314
6
                               /*ShmemSize=*/ShmemSize.getType(),
315
6
                               /*Stream=*/Stream.getType()},
316
6
                              /*isVarArg=*/false),
317
6
      addUnderscoredPrefixToName("PopCallConfiguration"));
318
319
6
  CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn,
320
6
                              {GridDim.getPointer(), BlockDim.getPointer(),
321
6
                               ShmemSize.getPointer(), Stream.getPointer()});
322
323
  // Emit the call to cudaLaunch
324
6
  llvm::Value *Kernel = CGF.Builder.CreatePointerCast(CGF.CurFn, VoidPtrTy);
325
6
  CallArgList LaunchKernelArgs;
326
6
  LaunchKernelArgs.add(RValue::get(Kernel),
327
6
                       cudaLaunchKernelFD->getParamDecl(0)->getType());
328
6
  LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty);
329
6
  LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty);
330
6
  LaunchKernelArgs.add(RValue::get(KernelArgs.getPointer()),
331
6
                       cudaLaunchKernelFD->getParamDecl(3)->getType());
332
6
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)),
333
6
                       cudaLaunchKernelFD->getParamDecl(4)->getType());
334
6
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)),
335
6
                       cudaLaunchKernelFD->getParamDecl(5)->getType());
336
337
6
  QualType QT = cudaLaunchKernelFD->getType();
338
6
  QualType CQT = QT.getCanonicalType();
339
6
  llvm::Type *Ty = CGM.getTypes().ConvertType(CQT);
340
6
  llvm::FunctionType *FTy = dyn_cast<llvm::FunctionType>(Ty);
341
342
6
  const CGFunctionInfo &FI =
343
6
      CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
344
6
  llvm::FunctionCallee cudaLaunchKernelFn =
345
6
      CGM.CreateRuntimeFunction(FTy, LaunchKernelName);
346
6
  CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(),
347
6
               LaunchKernelArgs);
348
6
  CGF.EmitBranch(EndBlock);
349
350
6
  CGF.EmitBlock(EndBlock);
351
6
}
352
353
void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
354
45
                                               FunctionArgList &Args) {
355
  // Emit a call to cudaSetupArgument for each arg in Args.
356
45
  llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
357
45
  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
358
45
  CharUnits Offset = CharUnits::Zero();
359
52
  for (const VarDecl *A : Args) {
360
52
    auto TInfo = CGM.getContext().getTypeInfoInChars(A->getType());
361
52
    Offset = Offset.alignTo(TInfo.Align);
362
52
    llvm::Value *Args[] = {
363
52
        CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(),
364
52
                                      VoidPtrTy),
365
52
        llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()),
366
52
        llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
367
52
    };
368
52
    llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args);
369
52
    llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0);
370
52
    llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero);
371
52
    llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next");
372
52
    CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock);
373
52
    CGF.EmitBlock(NextBlock);
374
52
    Offset += TInfo.Width;
375
52
  }
376
377
  // Emit the call to cudaLaunch
378
45
  llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
379
45
  llvm::Value *Arg = CGF.Builder.CreatePointerCast(CGF.CurFn, CharPtrTy);
380
45
  CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
381
45
  CGF.EmitBranch(EndBlock);
382
383
45
  CGF.EmitBlock(EndBlock);
384
45
}
385
386
// Replace the original variable Var with the address loaded from variable
387
// ManagedVar populated by HIP runtime.
388
static void replaceManagedVar(llvm::GlobalVariable *Var,
389
6
                              llvm::GlobalVariable *ManagedVar) {
390
6
  SmallVector<SmallVector<llvm::User *, 8>, 8> WorkList;
391
12
  for (auto &&VarUse : Var->uses()) {
392
12
    WorkList.push_back({VarUse.getUser()});
393
12
  }
394
46
  while (!WorkList.empty()) {
395
40
    auto &&WorkItem = WorkList.pop_back_val();
396
40
    auto *U = WorkItem.back();
397
40
    if (isa<llvm::ConstantExpr>(U)) {
398
28
      for (auto &&UU : U->uses()) {
399
28
        WorkItem.push_back(UU.getUser());
400
28
        WorkList.push_back(WorkItem);
401
28
        WorkItem.pop_back();
402
28
      }
403
26
      continue;
404
26
    }
405
14
    if (auto *I = dyn_cast<llvm::Instruction>(U)) {
406
14
      llvm::Value *OldV = Var;
407
14
      llvm::Instruction *NewV =
408
14
          new llvm::LoadInst(Var->getType(), ManagedVar, "ld.managed", false,
409
14
                             llvm::Align(Var->getAlignment()), I);
410
14
      WorkItem.pop_back();
411
      // Replace constant expressions directly or indirectly using the managed
412
      // variable with instructions.
413
22
      for (auto &&Op : WorkItem) {
414
22
        auto *CE = cast<llvm::ConstantExpr>(Op);
415
22
        auto *NewInst = llvm::createReplacementInstr(CE, I);
416
22
        NewInst->replaceUsesOfWith(OldV, NewV);
417
22
        OldV = CE;
418
22
        NewV = NewInst;
419
22
      }
420
14
      I->replaceUsesOfWith(OldV, NewV);
421
0
    } else {
422
0
      llvm_unreachable("Invalid use of managed variable");
423
0
    }
424
14
  }
425
6
}
426
427
/// Creates a function that sets up state on the host side for CUDA objects that
428
/// have a presence on both the host and device sides. Specifically, registers
429
/// the host side of kernel functions and device global variables with the CUDA
430
/// runtime.
431
/// \code
432
/// void __cuda_register_globals(void** GpuBinaryHandle) {
433
///    __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...);
434
///    ...
435
///    __cudaRegisterFunction(GpuBinaryHandle,KernelM,...);
436
///    __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...);
437
///    ...
438
///    __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...);
439
/// }
440
/// \endcode
441
28
llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
442
  // No need to register anything
443
28
  if (EmittedKernels.empty() &&
2
      DeviceVars.empty()
)
444
0
    return nullptr;
445
446
28
  llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
447
28
      getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
448
28
      addUnderscoredPrefixToName("_register_globals"), &TheModule);
449
28
  llvm::BasicBlock *EntryBB =
450
28
      llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
451
28
  CGBuilderTy Builder(CGM, Context);
452
28
  Builder.SetInsertPoint(EntryBB);
453
454
  // void __cudaRegisterFunction(void **, const char *, char *, const char *,
455
  //                             int, uint3*, uint3*, dim3*, dim3*, int*)
456
28
  llvm::Type *RegisterFuncParams[] = {
457
28
      VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy,
458
28
      VoidPtrTy,    VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()};
459
28
  llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction(
460
28
      llvm::FunctionType::get(IntTy, RegisterFuncParams, false),
461
28
      addUnderscoredPrefixToName("RegisterFunction"));
462
463
  // Extract GpuBinaryHandle passed as the first argument passed to
464
  // __cuda_register_globals() and generate __cudaRegisterFunction() call for
465
  // each emitted kernel.
466
28
  llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
467
40
  for (auto &&I : EmittedKernels) {
468
40
    llvm::Constant *KernelName =
469
40
        makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D)));
470
40
    llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy);
471
40
    llvm::Value *Args[] = {
472
40
        &GpuBinaryHandlePtr,
473
40
        Builder.CreateBitCast(I.Kernel, VoidPtrTy),
474
40
        KernelName,
475
40
        KernelName,
476
40
        llvm::ConstantInt::get(IntTy, -1),
477
40
        NullPtr,
478
40
        NullPtr,
479
40
        NullPtr,
480
40
        NullPtr,
481
40
        llvm::ConstantPointerNull::get(IntTy->getPointerTo())};
482
40
    Builder.CreateCall(RegisterFunc, Args);
483
40
  }
484
485
28
  llvm::Type *VarSizeTy = IntTy;
486
  // For HIP or CUDA 9.0+, device variable size is type of `size_t`.
487
28
  if (CGM.getLangOpts().HIP ||
488
10
      ToCudaVersion(CGM.getTarget().getSDKVersion()) >= CudaVersion::CUDA_90)
489
21
    VarSizeTy = SizeTy;
490
491
  // void __cudaRegisterVar(void **, char *, char *, const char *,
492
  //                        int, int, int, int)
493
28
  llvm::Type *RegisterVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
494
28
                                     CharPtrTy,    IntTy,     VarSizeTy,
495
28
                                     IntTy,        IntTy};
496
28
  llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction(
497
28
      llvm::FunctionType::get(VoidTy, RegisterVarParams, false),
498
28
      addUnderscoredPrefixToName("RegisterVar"));
499
  // void __hipRegisterManagedVar(void **, char *, char *, const char *,
500
  //                              size_t, unsigned)
501
28
  llvm::Type *RegisterManagedVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
502
28
                                            CharPtrTy,    VarSizeTy, IntTy};
503
28
  llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction(
504
28
      llvm::FunctionType::get(VoidTy, RegisterManagedVarParams, false),
505
28
      addUnderscoredPrefixToName("RegisterManagedVar"));
506
  // void __cudaRegisterSurface(void **, const struct surfaceReference *,
507
  //                            const void **, const char *, int, int);
508
28
  llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction(
509
28
      llvm::FunctionType::get(
510
28
          VoidTy, {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy},
511
28
          false),
512
28
      addUnderscoredPrefixToName("RegisterSurface"));
513
  // void __cudaRegisterTexture(void **, const struct textureReference *,
514
  //                            const void **, const char *, int, int, int)
515
28
  llvm::FunctionCallee RegisterTex = CGM.CreateRuntimeFunction(
516
28
      llvm::FunctionType::get(
517
28
          VoidTy,
518
28
          {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy, IntTy},
519
28
          false),
520
28
      addUnderscoredPrefixToName("RegisterTexture"));
521
53
  for (auto &&Info : DeviceVars) {
522
53
    llvm::GlobalVariable *Var = Info.Var;
523
53
    llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D));
524
53
    switch (Info.Flags.getKind()) {
525
50
    case DeviceVarFlags::Variable: {
526
50
      uint64_t VarSize =
527
50
          CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
528
50
      if (Info.Flags.isManaged()) {
529
6
        auto ManagedVar = new llvm::GlobalVariable(
530
6
            CGM.getModule(), Var->getType(),
531
6
            /*isConstant=*/false, Var->getLinkage(),
532
6
            /*Init=*/llvm::ConstantPointerNull::get(Var->getType()),
533
6
            Twine(Var->getName() + ".managed"), /*InsertBefore=*/nullptr,
534
6
            llvm::GlobalVariable::NotThreadLocal);
535
6
        replaceManagedVar(Var, ManagedVar);
536
6
        llvm::Value *Args[] = {
537
6
            &GpuBinaryHandlePtr,
538
6
            Builder.CreateBitCast(ManagedVar, VoidPtrTy),
539
6
            Builder.CreateBitCast(Var, VoidPtrTy),
540
6
            VarName,
541
6
            llvm::ConstantInt::get(VarSizeTy, VarSize),
542
6
            llvm::ConstantInt::get(IntTy, Var->getAlignment())};
543
6
        Builder.CreateCall(RegisterManagedVar, Args);
544
44
      } else {
545
44
        llvm::Value *Args[] = {
546
44
            &GpuBinaryHandlePtr,
547
44
            Builder.CreateBitCast(Var, VoidPtrTy),
548
44
            VarName,
549
44
            VarName,
550
44
            llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
551
44
            llvm::ConstantInt::get(VarSizeTy, VarSize),
552
44
            llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()),
553
44
            llvm::ConstantInt::get(IntTy, 0)};
554
44
        Builder.CreateCall(RegisterVar, Args);
555
44
      }
556
50
      break;
557
0
    }
558
1
    case DeviceVarFlags::Surface:
559
1
      Builder.CreateCall(
560
1
          RegisterSurf,
561
1
          {&GpuBinaryHandlePtr, Builder.CreateBitCast(Var, VoidPtrTy), VarName,
562
1
           VarName, llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
563
1
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
564
1
      break;
565
2
    case DeviceVarFlags::Texture:
566
2
      Builder.CreateCall(
567
2
          RegisterTex,
568
2
          {&GpuBinaryHandlePtr, Builder.CreateBitCast(Var, VoidPtrTy), VarName,
569
2
           VarName, llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
570
2
           llvm::ConstantInt::get(IntTy, Info.Flags.isNormalized()),
571
2
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
572
2
      break;
573
53
    }
574
53
  }
575
576
28
  Builder.CreateRetVoid();
577
28
  return RegisterKernelsFunc;
578
28
}
579
580
/// Creates a global constructor function for the module:
///
/// For CUDA:
/// \code
/// void __cuda_module_ctor(void*) {
///     Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
///     __cuda_register_globals(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
/// void __hip_module_ctor(void*) {
///     if (__hip_gpubin_handle == 0) {
///         __hip_gpubin_handle = __hipRegisterFatBinary(GpuBinaryBlob);
///         __hip_register_globals(__hip_gpubin_handle);
///     }
/// }
/// \endcode
52
llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
600
52
  bool IsHIP = CGM.getLangOpts().HIP;
601
52
  bool IsCUDA = CGM.getLangOpts().CUDA;
602
  // No need to generate ctors/dtors if there is no GPU binary.
603
52
  StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
604
52
  if (CudaGpuBinaryFileName.empty() && 
!IsHIP32
)
605
18
    return nullptr;
606
34
  if ((IsHIP || 
(12
IsCUDA12
&&
!RelocatableDeviceCode12
)) &&
EmittedKernels.empty()31
&&
607
8
      DeviceVars.empty())
608
6
    return nullptr;
609
610
  // void __{cuda|hip}_register_globals(void* handle);
611
28
  llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
612
  // We always need a function to pass in as callback. Create a dummy
613
  // implementation if we don't need to register anything.
614
28
  if (RelocatableDeviceCode && 
!RegisterGlobalsFunc5
)
615
0
    RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());
616
617
  // void ** __{cuda|hip}RegisterFatBinary(void *);
618
28
  llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction(
619
28
      llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),
620
28
      addUnderscoredPrefixToName("RegisterFatBinary"));
621
  // struct { int magic, int version, void * gpu_binary, void * dont_care };
622
28
  llvm::StructType *FatbinWrapperTy =
623
28
      llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy);
624
625
  // Register GPU binary with the CUDA runtime, store returned handle in a
626
  // global variable and save a reference in GpuBinaryHandle to be cleaned up
627
  // in destructor on exit. Then associate all known kernels with the GPU binary
628
  // handle so CUDA runtime can figure out what to call on the GPU side.
629
28
  std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr;
630
28
  if (!CudaGpuBinaryFileName.empty()) {
631
17
    llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CudaGpuBinaryOrErr =
632
17
        llvm::MemoryBuffer::getFileOrSTDIN(CudaGpuBinaryFileName);
633
17
    if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
634
0
      CGM.getDiags().Report(diag::err_cannot_open_file)
635
0
          << CudaGpuBinaryFileName << EC.message();
636
0
      return nullptr;
637
0
    }
638
17
    CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
639
17
  }
640
641
28
  llvm::Function *ModuleCtorFunc = llvm::Function::Create(
642
28
      llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
643
28
      llvm::GlobalValue::InternalLinkage,
644
28
      addUnderscoredPrefixToName("_module_ctor"), &TheModule);
645
28
  llvm::BasicBlock *CtorEntryBB =
646
28
      llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc);
647
28
  CGBuilderTy CtorBuilder(CGM, Context);
648
649
28
  CtorBuilder.SetInsertPoint(CtorEntryBB);
650
651
28
  const char *FatbinConstantName;
652
28
  const char *FatbinSectionName;
653
28
  const char *ModuleIDSectionName;
654
28
  StringRef ModuleIDPrefix;
655
28
  llvm::Constant *FatBinStr;
656
28
  unsigned FatMagic;
657
28
  if (IsHIP) {
658
18
    FatbinConstantName = ".hip_fatbin";
659
18
    FatbinSectionName = ".hipFatBinSegment";
660
661
18
    ModuleIDSectionName = "__hip_module_id";
662
18
    ModuleIDPrefix = "__hip_";
663
664
18
    if (CudaGpuBinary) {
665
      // If fatbin is available from early finalization, create a string
666
      // literal containing the fat binary loaded from the given file.
667
7
      const unsigned HIPCodeObjectAlign = 4096;
668
7
      FatBinStr =
669
7
          makeConstantString(std::string(CudaGpuBinary->getBuffer()), "",
670
7
                             FatbinConstantName, HIPCodeObjectAlign);
671
11
    } else {
672
      // If fatbin is not available, create an external symbol
673
      // __hip_fatbin in section .hip_fatbin. The external symbol is supposed
674
      // to contain the fat binary but will be populated somewhere else,
675
      // e.g. by lld through link script.
676
11
      FatBinStr = new llvm::GlobalVariable(
677
11
        CGM.getModule(), CGM.Int8Ty,
678
11
        /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
679
11
        "__hip_fatbin", nullptr,
680
11
        llvm::GlobalVariable::NotThreadLocal);
681
11
      cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
682
11
    }
683
684
18
    FatMagic = HIPFatMagic;
685
10
  } else {
686
10
    if (RelocatableDeviceCode)
687
3
      FatbinConstantName = CGM.getTriple().isMacOSX()
688
0
                               ? "__NV_CUDA,__nv_relfatbin"
689
3
                               : "__nv_relfatbin";
690
7
    else
691
7
      FatbinConstantName =
692
7
          CGM.getTriple().isMacOSX() ? 
"__NV_CUDA,__nv_fatbin"0
: ".nv_fatbin";
693
    // NVIDIA's cuobjdump looks for fatbins in this section.
694
10
    FatbinSectionName =
695
10
        CGM.getTriple().isMacOSX() ? 
"__NV_CUDA,__fatbin"0
: ".nvFatBinSegment";
696
697
10
    ModuleIDSectionName = CGM.getTriple().isMacOSX()
698
0
                              ? "__NV_CUDA,__nv_module_id"
699
10
                              : "__nv_module_id";
700
10
    ModuleIDPrefix = "__nv_";
701
702
    // For CUDA, create a string literal containing the fat binary loaded from
703
    // the given file.
704
10
    FatBinStr = makeConstantString(std::string(CudaGpuBinary->getBuffer()), "",
705
10
                                   FatbinConstantName, 8);
706
10
    FatMagic = CudaFatMagic;
707
10
  }
708
709
  // Create initialized wrapper structure that points to the loaded GPU binary
710
28
  ConstantInitBuilder Builder(CGM);
711
28
  auto Values = Builder.beginStruct(FatbinWrapperTy);
712
  // Fatbin wrapper magic.
713
28
  Values.addInt(IntTy, FatMagic);
714
  // Fatbin version.
715
28
  Values.addInt(IntTy, 1);
716
  // Data.
717
28
  Values.add(FatBinStr);
718
  // Unused in fatbin v1.
719
28
  Values.add(llvm::ConstantPointerNull::get(VoidPtrTy));
720
28
  llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
721
28
      addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(),
722
28
      /*constant*/ true);
723
28
  FatbinWrapper->setSection(FatbinSectionName);
724
725
  // There is only one HIP fat binary per linked module, however there are
726
  // multiple constructor functions. Make sure the fat binary is registered
727
  // only once. The constructor functions are executed by the dynamic loader
728
  // before the program gains control. The dynamic loader cannot execute the
729
  // constructor functions concurrently since doing that would not guarantee
730
  // thread safety of the loaded program. Therefore we can assume sequential
731
  // execution of constructor functions here.
732
28
  if (IsHIP) {
733
7
    auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage :
734
11
        llvm::GlobalValue::LinkOnceAnyLinkage;
735
18
    llvm::BasicBlock *IfBlock =
736
18
        llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
737
18
    llvm::BasicBlock *ExitBlock =
738
18
        llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc);
739
    // The name, size, and initialization pattern of this variable is part
740
    // of HIP ABI.
741
18
    GpuBinaryHandle = new llvm::GlobalVariable(
742
18
        TheModule, VoidPtrPtrTy, /*isConstant=*/false,
743
18
        Linkage,
744
18
        /*Initializer=*/llvm::ConstantPointerNull::get(VoidPtrPtrTy),
745
18
        "__hip_gpubin_handle");
746
18
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
747
    // Prevent the weak symbol in different shared libraries being merged.
748
18
    if (Linkage != llvm::GlobalValue::InternalLinkage)
749
11
      GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
750
18
    Address GpuBinaryAddr(
751
18
        GpuBinaryHandle,
752
18
        CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
753
18
    {
754
18
      auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
755
18
      llvm::Constant *Zero =
756
18
          llvm::Constant::getNullValue(HandleValue->getType());
757
18
      llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero);
758
18
      CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
759
18
    }
760
18
    {
761
18
      CtorBuilder.SetInsertPoint(IfBlock);
762
      // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper);
763
18
      llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
764
18
          RegisterFatbinFunc,
765
18
          CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
766
18
      CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
767
18
      CtorBuilder.CreateBr(ExitBlock);
768
18
    }
769
18
    {
770
18
      CtorBuilder.SetInsertPoint(ExitBlock);
771
      // Call __hip_register_globals(GpuBinaryHandle);
772
18
      if (RegisterGlobalsFunc) {
773
18
        auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
774
18
        CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
775
18
      }
776
18
    }
777
10
  } else if (!RelocatableDeviceCode) {
778
    // Register binary with CUDA runtime. This is substantially different in
779
    // default mode vs. separate compilation!
780
    // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
781
7
    llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
782
7
        RegisterFatbinFunc,
783
7
        CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
784
7
    GpuBinaryHandle = new llvm::GlobalVariable(
785
7
        TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
786
7
        llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");
787
7
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
788
7
    CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
789
7
                                   CGM.getPointerAlign());
790
791
    // Call __cuda_register_globals(GpuBinaryHandle);
792
7
    if (RegisterGlobalsFunc)
793
7
      CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);
794
795
    // Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it.
796
7
    if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
797
0
                           CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
798
      // void __cudaRegisterFatBinaryEnd(void **);
799
0
      llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction(
800
0
          llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
801
0
          "__cudaRegisterFatBinaryEnd");
802
0
      CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
803
0
    }
804
3
  } else {
805
    // Generate a unique module ID.
806
3
    SmallString<64> ModuleID;
807
3
    llvm::raw_svector_ostream OS(ModuleID);
808
3
    OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID());
809
3
    llvm::Constant *ModuleIDConstant = makeConstantString(
810
3
        std::string(ModuleID.str()), "", ModuleIDSectionName, 32);
811
812
    // Create an alias for the FatbinWrapper that nvcc will look for.
813
3
    llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
814
3
                              Twine("__fatbinwrap") + ModuleID, FatbinWrapper);
815
816
    // void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
817
    // void *, void (*)(void **))
818
3
    SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary");
819
3
    RegisterLinkedBinaryName += ModuleID;
820
3
    llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
821
3
        getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);
822
823
3
    assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
824
3
    llvm::Value *Args[] = {RegisterGlobalsFunc,
825
3
                           CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy),
826
3
                           ModuleIDConstant,
827
3
                           makeDummyFunction(getCallbackFnTy())};
828
3
    CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
829
3
  }
830
831
  // Create destructor and register it with atexit() the way NVCC does it. Doing
832
  // it during regular destructor phase worked in CUDA before 9.2 but results in
833
  // double-free in 9.2.
834
28
  if (llvm::Function *CleanupFn = makeModuleDtorFunction()) {
835
    // extern "C" int atexit(void (*f)(void));
836
25
    llvm::FunctionType *AtExitTy =
837
25
        llvm::FunctionType::get(IntTy, CleanupFn->getType(), false);
838
25
    llvm::FunctionCallee AtExitFunc =
839
25
        CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(),
840
25
                                  /*Local=*/true);
841
25
    CtorBuilder.CreateCall(AtExitFunc, CleanupFn);
842
25
  }
843
844
28
  CtorBuilder.CreateRetVoid();
845
28
  return ModuleCtorFunc;
846
28
}
847
848
/// Creates a global destructor function that unregisters the GPU code blob
/// registered by constructor.
///
/// For CUDA:
/// \code
/// void __cuda_module_dtor(void*) {
///     __cudaUnregisterFatBinary(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
/// void __hip_module_dtor(void*) {
///     if (__hip_gpubin_handle) {
///         __hipUnregisterFatBinary(__hip_gpubin_handle);
///         __hip_gpubin_handle = 0;
///     }
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
  // No need for destructor if we don't have a handle to unregister.
  if (!GpuBinaryHandle)
    return nullptr;

  // void __cudaUnregisterFatBinary(void ** handle);
  llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
      addUnderscoredPrefixToName("UnregisterFatBinary"));

  // void __{cuda|hip}_module_dtor(void*) — internal linkage; the ctor
  // registers it via atexit().
  llvm::Function *ModuleDtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_dtor"), &TheModule);

  llvm::BasicBlock *DtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc);
  CGBuilderTy DtorBuilder(CGM, Context);
  DtorBuilder.SetInsertPoint(DtorEntryBB);

  Address GpuBinaryAddr(GpuBinaryHandle, CharUnits::fromQuantity(
                                             GpuBinaryHandle->getAlignment()));
  auto HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
  // There is only one HIP fat binary per linked module, however there are
  // multiple destructor functions. Make sure the fat binary is unregistered
  // only once.
  if (CGM.getLangOpts().HIP) {
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc);
    // Emit: if (handle != null) { unregister; handle = 0; }
    llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType());
    llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero);
    DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);

    DtorBuilder.SetInsertPoint(IfBlock);
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
    // Null out the handle so a second dtor invocation is a no-op.
    DtorBuilder.CreateStore(Zero, GpuBinaryAddr);
    DtorBuilder.CreateBr(ExitBlock);

    DtorBuilder.SetInsertPoint(ExitBlock);
  } else {
    // CUDA: unconditionally unregister the per-TU handle.
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
  }
  DtorBuilder.CreateRetVoid();
  return ModuleDtorFunc;
}
914
915
181
CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
916
181
  return new CGNVCUDARuntime(CGM);
917
181
}