Coverage Report

Created: 2022-01-25 06:29

/Users/buildslave/jenkins/workspace/coverage/llvm-project/clang/lib/CodeGen/CGCUDANV.cpp
//===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This provides a class for CUDA code generation targeting the NVIDIA CUDA
// runtime library.
//
//===----------------------------------------------------------------------===//

#include "CGCUDARuntime.h"
#include "CGCXXABI.h"
#include "CodeGenFunction.h"
#include "CodeGenModule.h"
#include "clang/AST/Decl.h"
#include "clang/Basic/Cuda.h"
#include "clang/CodeGen/CodeGenABITypes.h"
#include "clang/CodeGen/ConstantInitBuilder.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/ReplaceConstant.h"
#include "llvm/Support/Format.h"

using namespace clang;
using namespace CodeGen;

namespace {
constexpr unsigned CudaFatMagic = 0x466243b1;
constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"

class CGNVCUDARuntime : public CGCUDARuntime {

private:
  llvm::IntegerType *IntTy, *SizeTy;
  llvm::Type *VoidTy;
  llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy;

  /// Convenience reference to LLVM Context
  llvm::LLVMContext &Context;
  /// Convenience reference to the current module
  llvm::Module &TheModule;
  /// Keeps track of kernel launch stubs and handles emitted in this module
  struct KernelInfo {
    llvm::Function *Kernel; // stub function to help launch kernel
    const Decl *D;
  };
  llvm::SmallVector<KernelInfo, 16> EmittedKernels;
  // Map a device stub function to a symbol for identifying kernel in host
  // code. For CUDA, the symbol for identifying the kernel is the same as the
  // device stub function. For HIP, they are different.
  llvm::DenseMap<llvm::Function *, llvm::GlobalValue *> KernelHandles;
  // Map a kernel handle to the kernel stub.
  llvm::DenseMap<llvm::GlobalValue *, llvm::Function *> KernelStubs;
  struct VarInfo {
    llvm::GlobalVariable *Var;
    const VarDecl *D;
    DeviceVarFlags Flags;
  };
  llvm::SmallVector<VarInfo, 16> DeviceVars;
  /// Keeps track of variable containing handle of GPU binary. Populated by
  /// ModuleCtorFunction() and used to create corresponding cleanup calls in
  /// ModuleDtorFunction()
  llvm::GlobalVariable *GpuBinaryHandle = nullptr;
  /// Whether we generate relocatable device code.
  bool RelocatableDeviceCode;
  /// Mangle context for device.
  std::unique_ptr<MangleContext> DeviceMC;

  llvm::FunctionCallee getSetupArgumentFn() const;
  llvm::FunctionCallee getLaunchFn() const;

  llvm::FunctionType *getRegisterGlobalsFnTy() const;
  llvm::FunctionType *getCallbackFnTy() const;
  llvm::FunctionType *getRegisterLinkedBinaryFnTy() const;
  std::string addPrefixToName(StringRef FuncName) const;
  std::string addUnderscoredPrefixToName(StringRef FuncName) const;

  /// Creates a function to register all kernel stubs generated in this module.
  llvm::Function *makeRegisterGlobalsFn();

  /// Helper function that generates a constant string and returns a pointer to
  /// the start of the string.  The result of this function can be used anywhere
  /// where the C code specifies const char*.
  llvm::Constant *makeConstantString(const std::string &Str,
                                     const std::string &Name = "",
                                     const std::string &SectionName = "",
                                     unsigned Alignment = 0) {
    llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0),
                               llvm::ConstantInt::get(SizeTy, 0)};
    auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str());
    llvm::GlobalVariable *GV =
        cast<llvm::GlobalVariable>(ConstStr.getPointer());
    if (!SectionName.empty()) {
      GV->setSection(SectionName);
      // Mark the address as used which make sure that this section isn't
      // merged and we will really have it in the object file.
      GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
    }
    if (Alignment)
      GV->setAlignment(llvm::Align(Alignment));

    return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(),
                                                ConstStr.getPointer(), Zeros);
  }

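  // For illustration (a rough IR sketch, not the exact symbol names the
  // constant-string builder picks): makeConstantString("foo") yields the
  // equivalent of a private constant such as
  //   @str = private unnamed_addr constant [4 x i8] c"foo\00"
  // and returns the constant getelementptr to its first character, usable
  // wherever the runtime API expects a `const char *`.
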
  /// Helper function that generates an empty dummy function returning void.
  llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
    assert(FnTy->getReturnType()->isVoidTy() &&
           "Can only generate dummy functions returning void!");
    llvm::Function *DummyFunc = llvm::Function::Create(
        FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule);

    llvm::BasicBlock *DummyBlock =
        llvm::BasicBlock::Create(Context, "", DummyFunc);
    CGBuilderTy FuncBuilder(CGM, Context);
    FuncBuilder.SetInsertPoint(DummyBlock);
    FuncBuilder.CreateRetVoid();

    return DummyFunc;
  }

  void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
  void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
  std::string getDeviceSideName(const NamedDecl *ND) override;

  void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var,
                         bool Extern, bool Constant) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Variable, Extern, Constant,
                           VD->hasAttr<HIPManagedAttr>(),
                           /*Normalized*/ false, 0}});
  }
  void registerDeviceSurf(const VarDecl *VD, llvm::GlobalVariable &Var,
                          bool Extern, int Type) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Surface, Extern, /*Constant*/ false,
                           /*Managed*/ false,
                           /*Normalized*/ false, Type}});
  }
  void registerDeviceTex(const VarDecl *VD, llvm::GlobalVariable &Var,
                         bool Extern, int Type, bool Normalized) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Texture, Extern, /*Constant*/ false,
                           /*Managed*/ false, Normalized, Type}});
  }

  /// Creates module constructor function
  llvm::Function *makeModuleCtorFunction();
  /// Creates module destructor function
  llvm::Function *makeModuleDtorFunction();
  /// Transform managed variables for device compilation.
  void transformManagedVars();

public:
  CGNVCUDARuntime(CodeGenModule &CGM);

  llvm::GlobalValue *getKernelHandle(llvm::Function *F, GlobalDecl GD) override;
  llvm::Function *getKernelStub(llvm::GlobalValue *Handle) override {
    auto Loc = KernelStubs.find(Handle);
    assert(Loc != KernelStubs.end());
    return Loc->second;
  }
  void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
  void handleVarRegistration(const VarDecl *VD,
                             llvm::GlobalVariable &Var) override;
  void
  internalizeDeviceSideVar(const VarDecl *D,
                           llvm::GlobalValue::LinkageTypes &Linkage) override;

  llvm::Function *finalizeModule() override;
};

} // end anonymous namespace

std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
  if (CGM.getLangOpts().HIP)
    return ((Twine("hip") + Twine(FuncName)).str());
  return ((Twine("cuda") + Twine(FuncName)).str());
}
std::string
CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
  if (CGM.getLangOpts().HIP)
    return ((Twine("__hip") + Twine(FuncName)).str());
  return ((Twine("__cuda") + Twine(FuncName)).str());
}

static std::unique_ptr<MangleContext> InitDeviceMC(CodeGenModule &CGM) {
  // If the host and device have different C++ ABIs, mark it as the device
  // mangle context so that the mangling needs to retrieve the additional
  // device lambda mangling number instead of the regular host one.
  if (CGM.getContext().getAuxTargetInfo() &&
      CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() &&
      CGM.getContext().getAuxTargetInfo()->getCXXABI().isItaniumFamily()) {
    return std::unique_ptr<MangleContext>(
        CGM.getContext().createDeviceMangleContext(
            *CGM.getContext().getAuxTargetInfo()));
  }

  return std::unique_ptr<MangleContext>(CGM.getContext().createMangleContext(
      CGM.getContext().getAuxTargetInfo()));
}

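// For example, a HIP compilation hosted on a Microsoft-ABI target with an
// Itanium-family device ABI takes the first branch in InitDeviceMC(), so
// lambdas mangle with the device-side numbering; in all other configurations
// the regular host mangle context is used.
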
CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
    : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
      TheModule(CGM.getModule()),
      RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
      DeviceMC(InitDeviceMC(CGM)) {
  CodeGen::CodeGenTypes &Types = CGM.getTypes();
  ASTContext &Ctx = CGM.getContext();

  IntTy = CGM.IntTy;
  SizeTy = CGM.SizeTy;
  VoidTy = CGM.VoidTy;

  CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy));
  VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy));
  VoidPtrPtrTy = VoidPtrTy->getPointerTo();
}

llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const {
  // cudaError_t cudaSetupArgument(void *, size_t, size_t)
  llvm::Type *Params[] = {VoidPtrTy, SizeTy, SizeTy};
  return CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, Params, false),
      addPrefixToName("SetupArgument"));
}

llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const {
  if (CGM.getLangOpts().HIP) {
    // hipError_t hipLaunchByPtr(char *);
    return CGM.CreateRuntimeFunction(
        llvm::FunctionType::get(IntTy, CharPtrTy, false), "hipLaunchByPtr");
  }
  // cudaError_t cudaLaunch(char *);
  return CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch");
}

llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const {
  return llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false);
}

llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const {
  return llvm::FunctionType::get(VoidTy, VoidPtrTy, false);
}

llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const {
  auto *CallbackFnTy = getCallbackFnTy();
  auto *RegisterGlobalsFnTy = getRegisterGlobalsFnTy();
  llvm::Type *Params[] = {RegisterGlobalsFnTy->getPointerTo(), VoidPtrTy,
                          VoidPtrTy, CallbackFnTy->getPointerTo()};
  return llvm::FunctionType::get(VoidTy, Params, false);
}

std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) {
  GlobalDecl GD;
  // D could be either a kernel or a variable.
  if (auto *FD = dyn_cast<FunctionDecl>(ND))
    GD = GlobalDecl(FD, KernelReferenceKind::Kernel);
  else
    GD = GlobalDecl(ND);
  std::string DeviceSideName;
  MangleContext *MC;
  if (CGM.getLangOpts().CUDAIsDevice)
    MC = &CGM.getCXXABI().getMangleContext();
  else
    MC = DeviceMC.get();
  if (MC->shouldMangleDeclName(ND)) {
    SmallString<256> Buffer;
    llvm::raw_svector_ostream Out(Buffer);
    MC->mangleName(GD, Out);
    DeviceSideName = std::string(Out.str());
  } else
    DeviceSideName = std::string(ND->getIdentifier()->getName());

  // Make unique name for device side static file-scope variable for HIP.
  if (CGM.getContext().shouldExternalizeStaticVar(ND) &&
      CGM.getLangOpts().GPURelocatableDeviceCode &&
      !CGM.getLangOpts().CUID.empty()) {
    SmallString<256> Buffer;
    llvm::raw_svector_ostream Out(Buffer);
    Out << DeviceSideName;
    CGM.printPostfixForExternalizedStaticVar(Out);
    DeviceSideName = std::string(Out.str());
  }
  return DeviceSideName;
}

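// For example, for a hypothetical `__global__ void kernel(int)`,
// getDeviceSideName() returns the Itanium-mangled "_Z6kerneli" (assuming an
// Itanium-family device ABI), while an `extern "C"` kernel keeps its plain
// identifier.
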
void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
                                     FunctionArgList &Args) {
  EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl});
  if (auto *GV = dyn_cast<llvm::GlobalVariable>(KernelHandles[CGF.CurFn])) {
    GV->setLinkage(CGF.CurFn->getLinkage());
    GV->setInitializer(CGF.CurFn);
  }
  if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                         CudaFeature::CUDA_USES_NEW_LAUNCH) ||
      (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI))
    emitDeviceStubBodyNew(CGF, Args);
  else
    emitDeviceStubBodyLegacy(CGF, Args);
}

// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local
// array and kernels are launched using cudaLaunchKernel().
void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
                                            FunctionArgList &Args) {
  // Build the shadow stack entry at the very start of the function.

  // Calculate amount of space we will need for all arguments.  If we have no
  // args, allocate a single pointer so we still have a valid pointer to the
  // argument array that we can pass to runtime, even if it will be unused.
  Address KernelArgs = CGF.CreateTempAlloca(
      VoidPtrTy, CharUnits::fromQuantity(16), "kernel_args",
      llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
  // Store pointers to the arguments in a locally allocated launch_args.
  for (unsigned i = 0; i < Args.size(); ++i) {
    llvm::Value *VarPtr = CGF.GetAddrOfLocalVar(Args[i]).getPointer();
    llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, VoidPtrTy);
    CGF.Builder.CreateDefaultAlignedStore(
        VoidVarPtr,
        CGF.Builder.CreateConstGEP1_32(VoidPtrTy, KernelArgs.getPointer(), i));
  }

  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");

  // Lookup cudaLaunchKernel/hipLaunchKernel function.
  // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
  //                              void **args, size_t sharedMem,
  //                              cudaStream_t stream);
  // hipError_t hipLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
  //                            void **args, size_t sharedMem,
  //                            hipStream_t stream);
  TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
  DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
  auto LaunchKernelName = addPrefixToName("LaunchKernel");
  IdentifierInfo &cudaLaunchKernelII =
      CGM.getContext().Idents.get(LaunchKernelName);
  FunctionDecl *cudaLaunchKernelFD = nullptr;
  for (auto *Result : DC->lookup(&cudaLaunchKernelII)) {
    if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result))
      cudaLaunchKernelFD = FD;
  }

  if (cudaLaunchKernelFD == nullptr) {
    CGM.Error(CGF.CurFuncDecl->getLocation(),
              "Can't find declaration for " + LaunchKernelName);
    return;
  }
  // Create temporary dim3 grid_dim, block_dim.
  ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1);
  QualType Dim3Ty = GridDimParam->getType();
  Address GridDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim");
  Address BlockDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim");
  Address ShmemSize =
      CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size");
  Address Stream =
      CGF.CreateTempAlloca(VoidPtrTy, CGM.getPointerAlign(), "stream");
  llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy,
                              {/*gridDim=*/GridDim.getType(),
                               /*blockDim=*/BlockDim.getType(),
                               /*ShmemSize=*/ShmemSize.getType(),
                               /*Stream=*/Stream.getType()},
                              /*isVarArg=*/false),
      addUnderscoredPrefixToName("PopCallConfiguration"));

  CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn,
                              {GridDim.getPointer(), BlockDim.getPointer(),
                               ShmemSize.getPointer(), Stream.getPointer()});

  // Emit the call to cudaLaunch
  llvm::Value *Kernel =
      CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn], VoidPtrTy);
  CallArgList LaunchKernelArgs;
  LaunchKernelArgs.add(RValue::get(Kernel),
                       cudaLaunchKernelFD->getParamDecl(0)->getType());
  LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::get(KernelArgs.getPointer()),
                       cudaLaunchKernelFD->getParamDecl(3)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)),
                       cudaLaunchKernelFD->getParamDecl(4)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)),
                       cudaLaunchKernelFD->getParamDecl(5)->getType());

  QualType QT = cudaLaunchKernelFD->getType();
  QualType CQT = QT.getCanonicalType();
  llvm::Type *Ty = CGM.getTypes().ConvertType(CQT);
  llvm::FunctionType *FTy = cast<llvm::FunctionType>(Ty);

  const CGFunctionInfo &FI =
      CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
  llvm::FunctionCallee cudaLaunchKernelFn =
      CGM.CreateRuntimeFunction(FTy, LaunchKernelName);
  CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(),
               LaunchKernelArgs);
  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}

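// For reference, the stub emitted by emitDeviceStubBodyNew() behaves roughly
// like the following host code for a hypothetical kernel
// `__global__ void k(int x, float *p)` (a sketch only; the stub is emitted
// directly as IR and `kernel_handle` stands in for the kernel handle above):
//
//   void __device_stub__k(int x, float *p) {
//     void *kernel_args[2] = {&x, &p};
//     dim3 grid_dim, block_dim;
//     size_t shmem_size;
//     cudaStream_t stream;
//     __cudaPopCallConfiguration(&grid_dim, &block_dim, &shmem_size, &stream);
//     cudaLaunchKernel((const void *)kernel_handle, grid_dim, block_dim,
//                      kernel_args, shmem_size, stream);
//   }
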
void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
                                               FunctionArgList &Args) {
  // Emit a call to cudaSetupArgument for each arg in Args.
  llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
  CharUnits Offset = CharUnits::Zero();
  for (const VarDecl *A : Args) {
    auto TInfo = CGM.getContext().getTypeInfoInChars(A->getType());
    Offset = Offset.alignTo(TInfo.Align);
    llvm::Value *Args[] = {
        CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(),
                                      VoidPtrTy),
        llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()),
        llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
    };
    llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args);
    llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0);
    llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero);
    llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next");
    CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock);
    CGF.EmitBlock(NextBlock);
    Offset += TInfo.Width;
  }

  // Emit the call to cudaLaunch
  llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
  llvm::Value *Arg =
      CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn], CharPtrTy);
  CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}

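// For reference, the legacy stub from emitDeviceStubBodyLegacy() behaves
// roughly like the following host code for a hypothetical kernel
// `__global__ void k(int x)` (a sketch only; a failed setup call branches
// straight past the launch):
//
//   void __device_stub__k(int x) {
//     if (cudaSetupArgument(&x, sizeof(x), /*offset=*/0) != 0)
//       return;
//     cudaLaunch((char *)kernel_handle);
//   }
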
// Replace the original variable Var with the address loaded from variable
// ManagedVar populated by HIP runtime.
static void replaceManagedVar(llvm::GlobalVariable *Var,
                              llvm::GlobalVariable *ManagedVar) {
  SmallVector<SmallVector<llvm::User *, 8>, 8> WorkList;
  for (auto &&VarUse : Var->uses()) {
    WorkList.push_back({VarUse.getUser()});
  }
  while (!WorkList.empty()) {
    auto &&WorkItem = WorkList.pop_back_val();
    auto *U = WorkItem.back();
    if (isa<llvm::ConstantExpr>(U)) {
      for (auto &&UU : U->uses()) {
        WorkItem.push_back(UU.getUser());
        WorkList.push_back(WorkItem);
        WorkItem.pop_back();
      }
      continue;
    }
    if (auto *I = dyn_cast<llvm::Instruction>(U)) {
      llvm::Value *OldV = Var;
      llvm::Instruction *NewV =
          new llvm::LoadInst(Var->getType(), ManagedVar, "ld.managed", false,
                             llvm::Align(Var->getAlignment()), I);
      WorkItem.pop_back();
      // Replace constant expressions directly or indirectly using the managed
      // variable with instructions.
      for (auto &&Op : WorkItem) {
        auto *CE = cast<llvm::ConstantExpr>(Op);
        auto *NewInst = CE->getAsInstruction(I);
        NewInst->replaceUsesOfWith(OldV, NewV);
        OldV = CE;
        NewV = NewInst;
      }
      I->replaceUsesOfWith(OldV, NewV);
    } else {
      llvm_unreachable("Invalid use of managed variable");
    }
  }
}

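// For illustration of replaceManagedVar() (IR sketch with made-up names):
// given an original variable @v and a shadow @v.addr that the runtime fills
// with the managed allocation's address, a use such as
//
//   store i32 1, i32* @v
//
// becomes
//
//   %ld.managed = load i32*, i32** @v.addr
//   store i32 1, i32* %ld.managed
//
// Constant-expression users are first rewritten into instructions so the
// load can be inserted in front of them.
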
/// Creates a function that sets up state on the host side for CUDA objects that
/// have a presence on both the host and device sides. Specifically, registers
/// the host side of kernel functions and device global variables with the CUDA
/// runtime.
/// \code
/// void __cuda_register_globals(void** GpuBinaryHandle) {
///    __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...);
///    ...
///    __cudaRegisterFunction(GpuBinaryHandle,KernelM,...);
///    __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...);
///    ...
///    __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...);
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
  // No need to register anything
  if (EmittedKernels.empty() && DeviceVars.empty())
    return nullptr;

  llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
      getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_register_globals"), &TheModule);
  llvm::BasicBlock *EntryBB =
      llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
  CGBuilderTy Builder(CGM, Context);
  Builder.SetInsertPoint(EntryBB);

  // void __cudaRegisterFunction(void **, const char *, char *, const char *,
  //                             int, uint3*, uint3*, dim3*, dim3*, int*)
  llvm::Type *RegisterFuncParams[] = {
      VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy,
      VoidPtrTy,    VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()};
  llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, RegisterFuncParams, false),
      addUnderscoredPrefixToName("RegisterFunction"));

  // Extract GpuBinaryHandle passed as the first argument passed to
  // __cuda_register_globals() and generate __cudaRegisterFunction() call for
  // each emitted kernel.
  llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
  for (auto &&I : EmittedKernels) {
    llvm::Constant *KernelName =
        makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D)));
    llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy);
    llvm::Value *Args[] = {
        &GpuBinaryHandlePtr,
        Builder.CreateBitCast(KernelHandles[I.Kernel], VoidPtrTy),
        KernelName,
        KernelName,
        llvm::ConstantInt::get(IntTy, -1),
        NullPtr,
        NullPtr,
        NullPtr,
        NullPtr,
        llvm::ConstantPointerNull::get(IntTy->getPointerTo())};
    Builder.CreateCall(RegisterFunc, Args);
  }

  llvm::Type *VarSizeTy = IntTy;
  // For HIP or CUDA 9.0+, device variable size is type of `size_t`.
  if (CGM.getLangOpts().HIP ||
      ToCudaVersion(CGM.getTarget().getSDKVersion()) >= CudaVersion::CUDA_90)
    VarSizeTy = SizeTy;

  // void __cudaRegisterVar(void **, char *, char *, const char *,
  //                        int, int, int, int)
  llvm::Type *RegisterVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
                                     CharPtrTy,    IntTy,     VarSizeTy,
                                     IntTy,        IntTy};
  llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, RegisterVarParams, false),
      addUnderscoredPrefixToName("RegisterVar"));
  // void __hipRegisterManagedVar(void **, char *, char *, const char *,
  //                              size_t, unsigned)
  llvm::Type *RegisterManagedVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
                                            CharPtrTy,    VarSizeTy, IntTy};
  llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, RegisterManagedVarParams, false),
      addUnderscoredPrefixToName("RegisterManagedVar"));
  // void __cudaRegisterSurface(void **, const struct surfaceReference *,
  //                            const void **, const char *, int, int);
  llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(
          VoidTy, {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy},
          false),
      addUnderscoredPrefixToName("RegisterSurface"));
  // void __cudaRegisterTexture(void **, const struct textureReference *,
  //                            const void **, const char *, int, int, int)
  llvm::FunctionCallee RegisterTex = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(
          VoidTy,
          {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy, IntTy},
          false),
      addUnderscoredPrefixToName("RegisterTexture"));
  for (auto &&Info : DeviceVars) {
    llvm::GlobalVariable *Var = Info.Var;
    assert((!Var->isDeclaration() || Info.Flags.isManaged()) &&
           "External variables should not show up here, except HIP managed "
           "variables");
    llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D));
    switch (Info.Flags.getKind()) {
    case DeviceVarFlags::Variable: {
      uint64_t VarSize =
          CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
      if (Info.Flags.isManaged()) {
        auto *ManagedVar = new llvm::GlobalVariable(
            CGM.getModule(), Var->getType(),
            /*isConstant=*/false, Var->getLinkage(),
            /*Init=*/Var->isDeclaration()
                ? nullptr
                : llvm::ConstantPointerNull::get(Var->getType()),
            /*Name=*/"", /*InsertBefore=*/nullptr,
            llvm::GlobalVariable::NotThreadLocal);
        ManagedVar->setDSOLocal(Var->isDSOLocal());
        ManagedVar->setVisibility(Var->getVisibility());
        ManagedVar->setExternallyInitialized(true);
        ManagedVar->takeName(Var);
        Var->setName(Twine(ManagedVar->getName() + ".managed"));
        replaceManagedVar(Var, ManagedVar);
        llvm::Value *Args[] = {
            &GpuBinaryHandlePtr,
            Builder.CreateBitCast(ManagedVar, VoidPtrTy),
            Builder.CreateBitCast(Var, VoidPtrTy),
            VarName,
            llvm::ConstantInt::get(VarSizeTy, VarSize),
            llvm::ConstantInt::get(IntTy, Var->getAlignment())};
        if (!Var->isDeclaration())
          Builder.CreateCall(RegisterManagedVar, Args);
      } else {
        llvm::Value *Args[] = {
            &GpuBinaryHandlePtr,
            Builder.CreateBitCast(Var, VoidPtrTy),
            VarName,
            VarName,
            llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
            llvm::ConstantInt::get(VarSizeTy, VarSize),
            llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()),
            llvm::ConstantInt::get(IntTy, 0)};
        Builder.CreateCall(RegisterVar, Args);
      }
      break;
    }
    case DeviceVarFlags::Surface:
      Builder.CreateCall(
          RegisterSurf,
          {&GpuBinaryHandlePtr, Builder.CreateBitCast(Var, VoidPtrTy), VarName,
           VarName, llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
      break;
    case DeviceVarFlags::Texture:
      Builder.CreateCall(
          RegisterTex,
          {&GpuBinaryHandlePtr, Builder.CreateBitCast(Var, VoidPtrTy), VarName,
           VarName, llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isNormalized()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
      break;
    }
  }

  Builder.CreateRetVoid();
  return RegisterKernelsFunc;
}

/// Creates a global constructor function for the module:
///
/// For CUDA:
/// \code
/// void __cuda_module_ctor(void*) {
///     Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
///     __cuda_register_globals(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
/// void __hip_module_ctor(void*) {
///     if (__hip_gpubin_handle == 0) {
///         __hip_gpubin_handle = __hipRegisterFatBinary(GpuBinaryBlob);
///         __hip_register_globals(__hip_gpubin_handle);
///     }
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
  bool IsHIP = CGM.getLangOpts().HIP;
  bool IsCUDA = CGM.getLangOpts().CUDA;
  // No need to generate ctors/dtors if there is no GPU binary.
  StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
  if (CudaGpuBinaryFileName.empty() && !IsHIP)
    return nullptr;
  if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() &&
      DeviceVars.empty())
    return nullptr;

  // void __{cuda|hip}_register_globals(void* handle);
  llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
  // We always need a function to pass in as callback. Create a dummy
  // implementation if we don't need to register anything.
  if (RelocatableDeviceCode && !RegisterGlobalsFunc)
    RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());

  // void ** __{cuda|hip}RegisterFatBinary(void *);
  llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),
      addUnderscoredPrefixToName("RegisterFatBinary"));
  // struct { int magic, int version, void * gpu_binary, void * dont_care };
  llvm::StructType *FatbinWrapperTy =
      llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy);

  // Register GPU binary with the CUDA runtime, store returned handle in a
  // global variable and save a reference in GpuBinaryHandle to be cleaned up
  // in destructor on exit. Then associate all known kernels with the GPU binary
  // handle so CUDA runtime can figure out what to call on the GPU side.
  std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr;
  if (!CudaGpuBinaryFileName.empty()) {
    llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CudaGpuBinaryOrErr =
        llvm::MemoryBuffer::getFileOrSTDIN(CudaGpuBinaryFileName);
    if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
      CGM.getDiags().Report(diag::err_cannot_open_file)
          << CudaGpuBinaryFileName << EC.message();
      return nullptr;
    }
    CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
  }

  llvm::Function *ModuleCtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_ctor"), &TheModule);
  llvm::BasicBlock *CtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc);
  CGBuilderTy CtorBuilder(CGM, Context);

  CtorBuilder.SetInsertPoint(CtorEntryBB);

  const char *FatbinConstantName;
  const char *FatbinSectionName;
  const char *ModuleIDSectionName;
  StringRef ModuleIDPrefix;
  llvm::Constant *FatBinStr;
  unsigned FatMagic;
  if (IsHIP) {
    FatbinConstantName = ".hip_fatbin";
    FatbinSectionName = ".hipFatBinSegment";

    ModuleIDSectionName = "__hip_module_id";
    ModuleIDPrefix = "__hip_";

    if (CudaGpuBinary) {
      // If fatbin is available from early finalization, create a string
      // literal containing the fat binary loaded from the given file.
      const unsigned HIPCodeObjectAlign = 4096;
      FatBinStr =
          makeConstantString(std::string(CudaGpuBinary->getBuffer()), "",
                             FatbinConstantName, HIPCodeObjectAlign);
    } else {
      // If fatbin is not available, create an external symbol
      // __hip_fatbin in section .hip_fatbin. The external symbol is supposed
      // to contain the fat binary but will be populated somewhere else,
      // e.g. by lld through link script.
      FatBinStr = new llvm::GlobalVariable(
          CGM.getModule(), CGM.Int8Ty,
          /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
          "__hip_fatbin", nullptr,
          llvm::GlobalVariable::NotThreadLocal);
      cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
    }

    FatMagic = HIPFatMagic;
  } else {
    if (RelocatableDeviceCode)
      FatbinConstantName = CGM.getTriple().isMacOSX()
                               ? "__NV_CUDA,__nv_relfatbin"
                               : "__nv_relfatbin";
    else
      FatbinConstantName =
          CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
    // NVIDIA's cuobjdump looks for fatbins in this section.
    FatbinSectionName =
        CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";

    ModuleIDSectionName = CGM.getTriple().isMacOSX()
                              ? "__NV_CUDA,__nv_module_id"
                              : "__nv_module_id";
    ModuleIDPrefix = "__nv_";

    // For CUDA, create a string literal containing the fat binary loaded from
    // the given file.
    FatBinStr = makeConstantString(std::string(CudaGpuBinary->getBuffer()), "",
                                   FatbinConstantName, 8);
    FatMagic = CudaFatMagic;
  }

  // Create initialized wrapper structure that points to the loaded GPU binary
  ConstantInitBuilder Builder(CGM);
  auto Values = Builder.beginStruct(FatbinWrapperTy);
  // Fatbin wrapper magic.
  Values.addInt(IntTy, FatMagic);
  // Fatbin version.
  Values.addInt(IntTy, 1);
  // Data.
  Values.add(FatBinStr);
  // Unused in fatbin v1.
  Values.add(llvm::ConstantPointerNull::get(VoidPtrTy));
  llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
      addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(),
      /*constant*/ true);
  FatbinWrapper->setSection(FatbinSectionName);

  // There is only one HIP fat binary per linked module, however there are
  // multiple constructor functions. Make sure the fat binary is registered
  // only once. The constructor functions are executed by the dynamic loader
  // before the program gains control. The dynamic loader cannot execute the
  // constructor functions concurrently since doing that would not guarantee
  // thread safety of the loaded program. Therefore we can assume sequential
  // execution of constructor functions here.
  if (IsHIP) {
    auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage
                                 : llvm::GlobalValue::LinkOnceAnyLinkage;
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc);
    // The name, size, and initialization pattern of this variable is part
    // of HIP ABI.
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, VoidPtrPtrTy, /*isConstant=*/false,
        Linkage,
        /*Initializer=*/llvm::ConstantPointerNull::get(VoidPtrPtrTy),
        "__hip_gpubin_handle");
    if (Linkage == llvm::GlobalValue::LinkOnceAnyLinkage)
      GpuBinaryHandle->setComdat(
          CGM.getModule().getOrInsertComdat(GpuBinaryHandle->getName()));
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
    // Prevent the weak symbol in different shared libraries being merged.
    if (Linkage != llvm::GlobalValue::InternalLinkage)
      GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
    Address GpuBinaryAddr(
        GpuBinaryHandle,
        CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
    {
      auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
      llvm::Constant *Zero =
          llvm::Constant::getNullValue(HandleValue->getType());
      llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero);
      CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(IfBlock);
      // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper);
      llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
          RegisterFatbinFunc,
          CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
      CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
      CtorBuilder.CreateBr(ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(ExitBlock);
      // Call __hip_register_globals(GpuBinaryHandle);
      if (RegisterGlobalsFunc) {
        auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
        CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
      }
    }
  } else if (!RelocatableDeviceCode) {
    // Register binary with CUDA runtime. This is substantially different in
    // default mode vs. separate compilation!
    // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
    llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
        RegisterFatbinFunc,
        CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
        llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
    CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
                                   CGM.getPointerAlign());

    // Call __cuda_register_globals(GpuBinaryHandle);
    if (RegisterGlobalsFunc)
      CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);

    // Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it.
    if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                           CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
      // void __cudaRegisterFatBinaryEnd(void **);
      llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction(
          llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
          "__cudaRegisterFatBinaryEnd");
      CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
    }
  } else {
    // Generate a unique module ID.
    SmallString<64> ModuleID;
    llvm::raw_svector_ostream OS(ModuleID);
    OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID());
    llvm::Constant *ModuleIDConstant = makeConstantString(
        std::string(ModuleID.str()), "", ModuleIDSectionName, 32);

    // Create an alias for the FatbinWrapper that nvcc will look for.
    llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
                              Twine("__fatbinwrap") + ModuleID, FatbinWrapper);

    // void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
    // void *, void (*)(void **))
    SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary");
    RegisterLinkedBinaryName += ModuleID;
    llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
        getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);

    assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
    llvm::Value *Args[] = {RegisterGlobalsFunc,
                           CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy),
                           ModuleIDConstant,
                           makeDummyFunction(getCallbackFnTy())};
    CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
  }

  // Create destructor and register it with atexit() the way NVCC does it. Doing
  // it during regular destructor phase worked in CUDA before 9.2 but results in
  // double-free in 9.2.
  if (llvm::Function *CleanupFn = makeModuleDtorFunction()) {
    // extern "C" int atexit(void (*f)(void));
    llvm::FunctionType *AtExitTy =
        llvm::FunctionType::get(IntTy, CleanupFn->getType(), false);
    llvm::FunctionCallee AtExitFunc =
        CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(),
                                  /*Local=*/true);
    CtorBuilder.CreateCall(AtExitFunc, CleanupFn);
  }

  CtorBuilder.CreateRetVoid();
  return ModuleCtorFunc;
}

/// Creates a global destructor function that unregisters the GPU code blob
/// registered by constructor.
///
/// For CUDA:
/// \code
/// void __cuda_module_dtor(void*) {
///     __cudaUnregisterFatBinary(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
/// void __hip_module_dtor(void*) {
///     if (__hip_gpubin_handle) {
///         __hipUnregisterFatBinary(__hip_gpubin_handle);
///         __hip_gpubin_handle = 0;
///     }
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
  // No need for destructor if we don't have a handle to unregister.
  if (!GpuBinaryHandle)
    return nullptr;

  // void __cudaUnregisterFatBinary(void ** handle);
  llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
      addUnderscoredPrefixToName("UnregisterFatBinary"));

  llvm::Function *ModuleDtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_dtor"), &TheModule);

  llvm::BasicBlock *DtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc);
  CGBuilderTy DtorBuilder(CGM, Context);
  DtorBuilder.SetInsertPoint(DtorEntryBB);

  Address GpuBinaryAddr(GpuBinaryHandle, CharUnits::fromQuantity(
                                             GpuBinaryHandle->getAlignment()));
  auto *HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
  // There is only one HIP fat binary per linked module, however there are
  // multiple destructor functions. Make sure the fat binary is unregistered
  // only once.
  if (CGM.getLangOpts().HIP) {
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc);
    llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType());
    llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero);
    DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);

    DtorBuilder.SetInsertPoint(IfBlock);
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
    DtorBuilder.CreateStore(Zero, GpuBinaryAddr);
    DtorBuilder.CreateBr(ExitBlock);

    DtorBuilder.SetInsertPoint(ExitBlock);
  } else {
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
  }
  DtorBuilder.CreateRetVoid();
  return ModuleDtorFunc;
}

CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
  return new CGNVCUDARuntime(CGM);
}

void CGNVCUDARuntime::internalizeDeviceSideVar(
    const VarDecl *D, llvm::GlobalValue::LinkageTypes &Linkage) {
  // For -fno-gpu-rdc, host-side shadows of external declarations of device-side
  // global variables become internal definitions. These have to be internal in
  // order to prevent name conflicts with global host variables with the same
  // name in different TUs.
  //
  // For -fgpu-rdc, the shadow variables should not be internalized because
  // they may be accessed by a different TU.
  if (CGM.getLangOpts().GPURelocatableDeviceCode)
    return;

  // __shared__ variables are odd. Shadows do get created, but
  // they are not registered with the CUDA runtime, so they
  // can't really be used to access their device-side
  // counterparts. It's not clear yet whether it's nvcc's bug or
  // a feature, but we've got to do the same for compatibility.
  if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
      D->hasAttr<CUDASharedAttr>() ||
      D->getType()->isCUDADeviceBuiltinSurfaceType() ||
      D->getType()->isCUDADeviceBuiltinTextureType()) {
    Linkage = llvm::GlobalValue::InternalLinkage;
  }
}

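// Example of the internalization rule above: with -fno-gpu-rdc, the host-side
// shadow of `extern __device__ int x;` becomes an internal definition in each
// TU, so it cannot collide with a host global `x` defined in another TU.
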
void CGNVCUDARuntime::handleVarRegistration(const VarDecl *D,
                                            llvm::GlobalVariable &GV) {
  if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>()) {
    // Shadow variables and their properties must be registered with CUDA
    // runtime. Skip Extern global variables, which will be registered in
    // the TU where they are defined.
    //
    // Don't register a C++17 inline variable. The local symbol can be
    // discarded and referencing a discarded local symbol from outside the
    // comdat (__cuda_register_globals) is disallowed by the ELF spec.
    //
    // HIP managed variables need to be always recorded in device and host
    // compilations for transformation.
    //
    // HIP managed variables and variables in CUDADeviceVarODRUsedByHost are
    // added to llvm.compiler-used, therefore they are safe to be registered.
    if ((!D->hasExternalStorage() && !D->isInline()) ||
        CGM.getContext().CUDADeviceVarODRUsedByHost.contains(D) ||
        D->hasAttr<HIPManagedAttr>()) {
      registerDeviceVar(D, GV, !D->hasDefinition(),
                        D->hasAttr<CUDAConstantAttr>());
    }
  } else if (D->getType()->isCUDADeviceBuiltinSurfaceType() ||
             D->getType()->isCUDADeviceBuiltinTextureType()) {
    // Builtin surfaces and textures and their template arguments are
    // also registered with CUDA runtime.
    const auto *TD = cast<ClassTemplateSpecializationDecl>(
        D->getType()->castAs<RecordType>()->getDecl());
    const TemplateArgumentList &Args = TD->getTemplateArgs();
    if (TD->hasAttr<CUDADeviceBuiltinSurfaceTypeAttr>()) {
      assert(Args.size() == 2 &&
             "Unexpected number of template arguments of CUDA device "
             "builtin surface type.");
      auto SurfType = Args[1].getAsIntegral();
      if (!D->hasExternalStorage())
        registerDeviceSurf(D, GV, !D->hasDefinition(), SurfType.getSExtValue());
    } else {
      assert(Args.size() == 3 &&
             "Unexpected number of template arguments of CUDA device "
             "builtin texture type.");
      auto TexType = Args[1].getAsIntegral();
      auto Normalized = Args[2].getAsIntegral();
      if (!D->hasExternalStorage())
        registerDeviceTex(D, GV, !D->hasDefinition(), TexType.getSExtValue(),
                          Normalized.getZExtValue());
    }
  }
}

// Transform managed variables to pointers to managed variables in device code.
// Each use of the original managed variable is replaced by a load from the
// transformed managed variable. The transformed managed variable contains
// the address of managed memory which will be allocated by the runtime.
void CGNVCUDARuntime::transformManagedVars() {
  for (auto &&Info : DeviceVars) {
    llvm::GlobalVariable *Var = Info.Var;
    if (Info.Flags.getKind() == DeviceVarFlags::Variable &&
        Info.Flags.isManaged()) {
      auto *ManagedVar = new llvm::GlobalVariable(
          CGM.getModule(), Var->getType(),
          /*isConstant=*/false, Var->getLinkage(),
          /*Init=*/Var->isDeclaration()
              ? nullptr
              : llvm::ConstantPointerNull::get(Var->getType()),
          /*Name=*/"", /*InsertBefore=*/nullptr,
          llvm::GlobalVariable::NotThreadLocal,
          CGM.getContext().getTargetAddressSpace(LangAS::cuda_device));
      ManagedVar->setDSOLocal(Var->isDSOLocal());
      ManagedVar->setVisibility(Var->getVisibility());
      ManagedVar->setExternallyInitialized(true);
      replaceManagedVar(Var, ManagedVar);
      ManagedVar->takeName(Var);
      Var->setName(Twine(ManagedVar->getName()) + ".managed");
      // Keep managed variables even if they are not used in device code since
      // they need to be allocated by the runtime.
      if (!Var->isDeclaration()) {
        assert(!ManagedVar->isDeclaration());
        CGM.addCompilerUsedGlobal(Var);
        CGM.addCompilerUsedGlobal(ManagedVar);
      }
    }
  }
}

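// Illustrative before/after for transformManagedVars() in device compilation
// (IR sketch; address spaces and exact names omitted): a managed variable
//
//   @v = global i32 0
//
// is rewritten so that @v becomes an externally initialized pointer slot the
// runtime fills in, the original allocation is renamed, and every use loads
// the managed address first:
//
//   @v = externally_initialized global i32* null
//   @v.managed = global i32 0
//   ...
//   %ld.managed = load i32*, i32** @v
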
// Returns module constructor to be added.
llvm::Function *CGNVCUDARuntime::finalizeModule() {
  if (CGM.getLangOpts().CUDAIsDevice) {
    transformManagedVars();

    // Mark ODR-used device variables as compiler used to prevent them from
    // being eliminated by optimization. This is necessary for device variables
    // ODR-used by host functions. Sema correctly marks them as ODR-used no
    // matter whether they are ODR-used by device or host functions.
    //
    // We do not need to do this if the variable has the used attribute since
    // it has already been added.
    //
    // Static device variables have been externalized at this point, therefore
    // variables with LLVM private or internal linkage need not be added.
    for (auto &&Info : DeviceVars) {
      auto Kind = Info.Flags.getKind();
      if (!Info.Var->isDeclaration() &&
          !llvm::GlobalValue::isLocalLinkage(Info.Var->getLinkage()) &&
          (Kind == DeviceVarFlags::Variable ||
           Kind == DeviceVarFlags::Surface ||
           Kind == DeviceVarFlags::Texture) &&
          Info.D->isUsed() && !Info.D->hasAttr<UsedAttr>()) {
        CGM.addCompilerUsedGlobal(Info.Var);
      }
    }
    return nullptr;
  }
  return makeModuleCtorFunction();
}

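// For CUDA the stub function itself doubles as the kernel handle. For HIP, a
// separate constant global is created under the kernel's own mangled name
// (e.g. a hypothetical `__global__ void k()` gets a handle variable named
// "_Z1kv" whose initializer emitDeviceStub() later fills in with the stub's
// address), so host code can reference the kernel independently of the stub's
// mangling.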
llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F,
                                                    GlobalDecl GD) {
  auto Loc = KernelHandles.find(F);
  if (Loc != KernelHandles.end())
    return Loc->second;

  if (!CGM.getLangOpts().HIP) {
    KernelHandles[F] = F;
    KernelStubs[F] = F;
    return F;
  }

  auto *Var = new llvm::GlobalVariable(
      TheModule, F->getType(), /*isConstant=*/true, F->getLinkage(),
      /*Initializer=*/nullptr,
      CGM.getMangledName(
          GD.getWithKernelReferenceKind(KernelReferenceKind::Kernel)));
  Var->setAlignment(CGM.getPointerAlign().getAsAlign());
  Var->setDSOLocal(F->isDSOLocal());
  Var->setVisibility(F->getVisibility());
  CGM.maybeSetTrivialComdat(*GD.getDecl(), *Var);
  KernelHandles[F] = Var;
  KernelStubs[Var] = F;
  return Var;
}