Coverage Report

Created: 2021-09-21 08:58

/Users/buildslave/jenkins/workspace/coverage/llvm-project/clang/lib/CodeGen/CGCUDANV.cpp
Line
Count
Source (jump to first uncovered line)
1
//===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
// This provides a class for CUDA code generation targeting the NVIDIA CUDA
10
// runtime library.
11
//
12
//===----------------------------------------------------------------------===//
13
14
#include "CGCUDARuntime.h"
15
#include "CGCXXABI.h"
16
#include "CodeGenFunction.h"
17
#include "CodeGenModule.h"
18
#include "clang/AST/Decl.h"
19
#include "clang/Basic/Cuda.h"
20
#include "clang/CodeGen/CodeGenABITypes.h"
21
#include "clang/CodeGen/ConstantInitBuilder.h"
22
#include "llvm/IR/BasicBlock.h"
23
#include "llvm/IR/Constants.h"
24
#include "llvm/IR/DerivedTypes.h"
25
#include "llvm/IR/ReplaceConstant.h"
26
#include "llvm/Support/Format.h"
27
28
using namespace clang;
29
using namespace CodeGen;
30
31
namespace {
32
constexpr unsigned CudaFatMagic = 0x466243b1;
33
constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"
34
35
class CGNVCUDARuntime : public CGCUDARuntime {
36
37
private:
38
  llvm::IntegerType *IntTy, *SizeTy;
39
  llvm::Type *VoidTy;
40
  llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy;
41
42
  /// Convenience reference to LLVM Context
43
  llvm::LLVMContext &Context;
44
  /// Convenience reference to the current module
45
  llvm::Module &TheModule;
46
  /// Keeps track of kernel launch stubs and handles emitted in this module
47
  struct KernelInfo {
48
    llvm::Function *Kernel; // stub function to help launch kernel
49
    const Decl *D;
50
  };
51
  llvm::SmallVector<KernelInfo, 16> EmittedKernels;
52
  // Map a device stub function to a symbol for identifying kernel in host code.
53
  // For CUDA, the symbol for identifying the kernel is the same as the device
54
  // stub function. For HIP, they are different.
55
  llvm::DenseMap<llvm::Function *, llvm::GlobalValue *> KernelHandles;
56
  // Map a kernel handle to the kernel stub.
57
  llvm::DenseMap<llvm::GlobalValue *, llvm::Function *> KernelStubs;
58
  struct VarInfo {
59
    llvm::GlobalVariable *Var;
60
    const VarDecl *D;
61
    DeviceVarFlags Flags;
62
  };
63
  llvm::SmallVector<VarInfo, 16> DeviceVars;
64
  /// Keeps track of variable containing handle of GPU binary. Populated by
65
  /// ModuleCtorFunction() and used to create corresponding cleanup calls in
66
  /// ModuleDtorFunction()
67
  llvm::GlobalVariable *GpuBinaryHandle = nullptr;
68
  /// Whether we generate relocatable device code.
69
  bool RelocatableDeviceCode;
70
  /// Mangle context for device.
71
  std::unique_ptr<MangleContext> DeviceMC;
72
73
  llvm::FunctionCallee getSetupArgumentFn() const;
74
  llvm::FunctionCallee getLaunchFn() const;
75
76
  llvm::FunctionType *getRegisterGlobalsFnTy() const;
77
  llvm::FunctionType *getCallbackFnTy() const;
78
  llvm::FunctionType *getRegisterLinkedBinaryFnTy() const;
79
  std::string addPrefixToName(StringRef FuncName) const;
80
  std::string addUnderscoredPrefixToName(StringRef FuncName) const;
81
82
  /// Creates a function to register all kernel stubs generated in this module.
83
  llvm::Function *makeRegisterGlobalsFn();
84
85
  /// Helper function that generates a constant string and returns a pointer to
86
  /// the start of the string.  The result of this function can be used anywhere
87
  /// where the C code specifies const char*.
88
  llvm::Constant *makeConstantString(const std::string &Str,
89
                                     const std::string &Name = "",
90
                                     const std::string &SectionName = "",
91
186
                                     unsigned Alignment = 0) {
92
186
    llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0),
93
186
                               llvm::ConstantInt::get(SizeTy, 0)};
94
186
    auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str());
95
186
    llvm::GlobalVariable *GV =
96
186
        cast<llvm::GlobalVariable>(ConstStr.getPointer());
97
186
    if (!SectionName.empty()) {
98
21
      GV->setSection(SectionName);
99
      // Mark the address as used which make sure that this section isn't
100
      // merged and we will really have it in the object file.
101
21
      GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
102
21
    }
103
186
    if (Alignment)
104
21
      GV->setAlignment(llvm::Align(Alignment));
105
106
186
    return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(),
107
186
                                                ConstStr.getPointer(), Zeros);
108
186
  }
109
110
  /// Helper function that generates an empty dummy function returning void.
111
3
  llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
112
3
    assert(FnTy->getReturnType()->isVoidTy() &&
113
3
           "Can only generate dummy functions returning void!");
114
0
    llvm::Function *DummyFunc = llvm::Function::Create(
115
3
        FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule);
116
117
3
    llvm::BasicBlock *DummyBlock =
118
3
        llvm::BasicBlock::Create(Context, "", DummyFunc);
119
3
    CGBuilderTy FuncBuilder(CGM, Context);
120
3
    FuncBuilder.SetInsertPoint(DummyBlock);
121
3
    FuncBuilder.CreateRetVoid();
122
123
3
    return DummyFunc;
124
3
  }
125
126
  void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
127
  void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
128
  std::string getDeviceSideName(const NamedDecl *ND) override;
129
130
  void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var,
131
363
                         bool Extern, bool Constant) {
132
363
    DeviceVars.push_back({&Var,
133
363
                          VD,
134
363
                          {DeviceVarFlags::Variable, Extern, Constant,
135
363
                           VD->hasAttr<HIPManagedAttr>(),
136
363
                           /*Normalized*/ false, 0}});
137
363
  }
138
  void registerDeviceSurf(const VarDecl *VD, llvm::GlobalVariable &Var,
139
2
                          bool Extern, int Type) {
140
2
    DeviceVars.push_back({&Var,
141
2
                          VD,
142
2
                          {DeviceVarFlags::Surface, Extern, /*Constant*/ false,
143
2
                           /*Managed*/ false,
144
2
                           /*Normalized*/ false, Type}});
145
2
  }
146
  void registerDeviceTex(const VarDecl *VD, llvm::GlobalVariable &Var,
147
4
                         bool Extern, int Type, bool Normalized) {
148
4
    DeviceVars.push_back({&Var,
149
4
                          VD,
150
4
                          {DeviceVarFlags::Texture, Extern, /*Constant*/ false,
151
4
                           /*Managed*/ false, Normalized, Type}});
152
4
  }
153
154
  /// Creates module constructor function
155
  llvm::Function *makeModuleCtorFunction();
156
  /// Creates module destructor function
157
  llvm::Function *makeModuleDtorFunction();
158
  /// Transform managed variables for device compilation.
159
  void transformManagedVars();
160
161
public:
162
  CGNVCUDARuntime(CodeGenModule &CGM);
163
164
  llvm::GlobalValue *getKernelHandle(llvm::Function *F, GlobalDecl GD) override;
165
36
  llvm::Function *getKernelStub(llvm::GlobalValue *Handle) override {
166
36
    auto Loc = KernelStubs.find(Handle);
167
36
    assert(Loc != KernelStubs.end());
168
0
    return Loc->second;
169
36
  }
170
  void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
171
  void handleVarRegistration(const VarDecl *VD,
172
                             llvm::GlobalVariable &Var) override;
173
  void
174
  internalizeDeviceSideVar(const VarDecl *D,
175
                           llvm::GlobalValue::LinkageTypes &Linkage) override;
176
177
  llvm::Function *finalizeModule() override;
178
};
179
180
}
181
182
63
std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
183
63
  if (CGM.getLangOpts().HIP)
184
43
    return ((Twine("hip") + Twine(FuncName)).str());
185
20
  return ((Twine("cuda") + Twine(FuncName)).str());
186
63
}
187
std::string
188
463
CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
189
463
  if (CGM.getLangOpts().HIP)
190
342
    return ((Twine("__hip") + Twine(FuncName)).str());
191
121
  return ((Twine("__cuda") + Twine(FuncName)).str());
192
463
}
193
194
226
static std::unique_ptr<MangleContext> InitDeviceMC(CodeGenModule &CGM) {
195
  // If the host and device have different C++ ABIs, mark it as the device
196
  // mangle context so that the mangling needs to retrieve the additional
197
  // device lambda mangling number instead of the regular host one.
198
226
  if (CGM.getContext().getAuxTargetInfo() &&
199
226
      
CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft()13
&&
200
226
      
CGM.getContext().getAuxTargetInfo()->getCXXABI().isItaniumFamily()3
) {
201
3
    return std::unique_ptr<MangleContext>(
202
3
        CGM.getContext().createDeviceMangleContext(
203
3
            *CGM.getContext().getAuxTargetInfo()));
204
3
  }
205
206
223
  return std::unique_ptr<MangleContext>(CGM.getContext().createMangleContext(
207
223
      CGM.getContext().getAuxTargetInfo()));
208
226
}
209
210
CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
211
    : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
212
      TheModule(CGM.getModule()),
213
      RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
214
226
      DeviceMC(InitDeviceMC(CGM)) {
215
226
  CodeGen::CodeGenTypes &Types = CGM.getTypes();
216
226
  ASTContext &Ctx = CGM.getContext();
217
218
226
  IntTy = CGM.IntTy;
219
226
  SizeTy = CGM.SizeTy;
220
226
  VoidTy = CGM.VoidTy;
221
222
226
  CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy));
223
226
  VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy));
224
226
  VoidPtrPtrTy = VoidPtrTy->getPointerTo();
225
226
}
226
227
56
llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const {
228
  // cudaError_t cudaSetupArgument(void *, size_t, size_t)
229
56
  llvm::Type *Params[] = {VoidPtrTy, SizeTy, SizeTy};
230
56
  return CGM.CreateRuntimeFunction(
231
56
      llvm::FunctionType::get(IntTy, Params, false),
232
56
      addPrefixToName("SetupArgument"));
233
56
}
234
235
56
llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const {
236
56
  if (CGM.getLangOpts().HIP) {
237
    // hipError_t hipLaunchByPtr(char *);
238
42
    return CGM.CreateRuntimeFunction(
239
42
        llvm::FunctionType::get(IntTy, CharPtrTy, false), "hipLaunchByPtr");
240
42
  } else {
241
    // cudaError_t cudaLaunch(char *);
242
14
    return CGM.CreateRuntimeFunction(
243
14
        llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch");
244
14
  }
245
56
}
246
247
45
llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const {
248
45
  return llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false);
249
45
}
250
251
6
llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const {
252
6
  return llvm::FunctionType::get(VoidTy, VoidPtrTy, false);
253
6
}
254
255
3
llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const {
256
3
  auto CallbackFnTy = getCallbackFnTy();
257
3
  auto RegisterGlobalsFnTy = getRegisterGlobalsFnTy();
258
3
  llvm::Type *Params[] = {RegisterGlobalsFnTy->getPointerTo(), VoidPtrTy,
259
3
                          VoidPtrTy, CallbackFnTy->getPointerTo()};
260
3
  return llvm::FunctionType::get(VoidTy, Params, false);
261
3
}
262
263
271
std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) {
264
271
  GlobalDecl GD;
265
  // D could be either a kernel or a variable.
266
271
  if (auto *FD = dyn_cast<FunctionDecl>(ND))
267
156
    GD = GlobalDecl(FD, KernelReferenceKind::Kernel);
268
115
  else
269
115
    GD = GlobalDecl(ND);
270
271
  std::string DeviceSideName;
271
271
  MangleContext *MC;
272
271
  if (CGM.getLangOpts().CUDAIsDevice)
273
0
    MC = &CGM.getCXXABI().getMangleContext();
274
271
  else
275
271
    MC = DeviceMC.get();
276
271
  if (MC->shouldMangleDeclName(ND)) {
277
182
    SmallString<256> Buffer;
278
182
    llvm::raw_svector_ostream Out(Buffer);
279
182
    MC->mangleName(GD, Out);
280
182
    DeviceSideName = std::string(Out.str());
281
182
  } else
282
89
    DeviceSideName = std::string(ND->getIdentifier()->getName());
283
284
  // Make unique name for device side static file-scope variable for HIP.
285
271
  if (CGM.getContext().shouldExternalizeStaticVar(ND) &&
286
271
      
CGM.getLangOpts().GPURelocatableDeviceCode30
&&
287
271
      
!CGM.getLangOpts().CUID.empty()12
) {
288
8
    SmallString<256> Buffer;
289
8
    llvm::raw_svector_ostream Out(Buffer);
290
8
    Out << DeviceSideName;
291
8
    CGM.printPostfixForExternalizedStaticVar(Out);
292
8
    DeviceSideName = std::string(Out.str());
293
8
  }
294
271
  return DeviceSideName;
295
271
}
296
297
void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
298
63
                                     FunctionArgList &Args) {
299
63
  EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl});
300
63
  if (auto *GV = dyn_cast<llvm::GlobalVariable>(KernelHandles[CGF.CurFn])) {
301
43
    GV->setLinkage(CGF.CurFn->getLinkage());
302
43
    GV->setInitializer(CGF.CurFn);
303
43
  }
304
63
  if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
305
63
                         CudaFeature::CUDA_USES_NEW_LAUNCH) ||
306
63
      
(57
CGF.getLangOpts().HIP57
&&
CGF.getLangOpts().HIPUseNewLaunchAPI43
))
307
7
    emitDeviceStubBodyNew(CGF, Args);
308
56
  else
309
56
    emitDeviceStubBodyLegacy(CGF, Args);
310
63
}
311
312
// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local
313
// array and kernels are launched using cudaLaunchKernel().
314
void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
315
7
                                            FunctionArgList &Args) {
316
  // Build the shadow stack entry at the very start of the function.
317
318
  // Calculate amount of space we will need for all arguments.  If we have no
319
  // args, allocate a single pointer so we still have a valid pointer to the
320
  // argument array that we can pass to runtime, even if it will be unused.
321
7
  Address KernelArgs = CGF.CreateTempAlloca(
322
7
      VoidPtrTy, CharUnits::fromQuantity(16), "kernel_args",
323
7
      llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
324
  // Store pointers to the arguments in a locally allocated launch_args.
325
24
  for (unsigned i = 0; i < Args.size(); 
++i17
) {
326
17
    llvm::Value* VarPtr = CGF.GetAddrOfLocalVar(Args[i]).getPointer();
327
17
    llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, VoidPtrTy);
328
17
    CGF.Builder.CreateDefaultAlignedStore(
329
17
        VoidVarPtr,
330
17
        CGF.Builder.CreateConstGEP1_32(VoidPtrTy, KernelArgs.getPointer(), i));
331
17
  }
332
333
7
  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
334
335
  // Lookup cudaLaunchKernel/hipLaunchKernel function.
336
  // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
337
  //                              void **args, size_t sharedMem,
338
  //                              cudaStream_t stream);
339
  // hipError_t hipLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
340
  //                            void **args, size_t sharedMem,
341
  //                            hipStream_t stream);
342
7
  TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
343
7
  DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
344
7
  auto LaunchKernelName = addPrefixToName("LaunchKernel");
345
7
  IdentifierInfo &cudaLaunchKernelII =
346
7
      CGM.getContext().Idents.get(LaunchKernelName);
347
7
  FunctionDecl *cudaLaunchKernelFD = nullptr;
348
7
  for (auto *Result : DC->lookup(&cudaLaunchKernelII)) {
349
7
    if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result))
350
7
      cudaLaunchKernelFD = FD;
351
7
  }
352
353
7
  if (cudaLaunchKernelFD == nullptr) {
354
0
    CGM.Error(CGF.CurFuncDecl->getLocation(),
355
0
              "Can't find declaration for " + LaunchKernelName);
356
0
    return;
357
0
  }
358
  // Create temporary dim3 grid_dim, block_dim.
359
7
  ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1);
360
7
  QualType Dim3Ty = GridDimParam->getType();
361
7
  Address GridDim =
362
7
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim");
363
7
  Address BlockDim =
364
7
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim");
365
7
  Address ShmemSize =
366
7
      CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size");
367
7
  Address Stream =
368
7
      CGF.CreateTempAlloca(VoidPtrTy, CGM.getPointerAlign(), "stream");
369
7
  llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction(
370
7
      llvm::FunctionType::get(IntTy,
371
7
                              {/*gridDim=*/GridDim.getType(),
372
7
                               /*blockDim=*/BlockDim.getType(),
373
7
                               /*ShmemSize=*/ShmemSize.getType(),
374
7
                               /*Stream=*/Stream.getType()},
375
7
                              /*isVarArg=*/false),
376
7
      addUnderscoredPrefixToName("PopCallConfiguration"));
377
378
7
  CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn,
379
7
                              {GridDim.getPointer(), BlockDim.getPointer(),
380
7
                               ShmemSize.getPointer(), Stream.getPointer()});
381
382
  // Emit the call to cudaLaunch
383
7
  llvm::Value *Kernel =
384
7
      CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn], VoidPtrTy);
385
7
  CallArgList LaunchKernelArgs;
386
7
  LaunchKernelArgs.add(RValue::get(Kernel),
387
7
                       cudaLaunchKernelFD->getParamDecl(0)->getType());
388
7
  LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty);
389
7
  LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty);
390
7
  LaunchKernelArgs.add(RValue::get(KernelArgs.getPointer()),
391
7
                       cudaLaunchKernelFD->getParamDecl(3)->getType());
392
7
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)),
393
7
                       cudaLaunchKernelFD->getParamDecl(4)->getType());
394
7
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)),
395
7
                       cudaLaunchKernelFD->getParamDecl(5)->getType());
396
397
7
  QualType QT = cudaLaunchKernelFD->getType();
398
7
  QualType CQT = QT.getCanonicalType();
399
7
  llvm::Type *Ty = CGM.getTypes().ConvertType(CQT);
400
7
  llvm::FunctionType *FTy = dyn_cast<llvm::FunctionType>(Ty);
401
402
7
  const CGFunctionInfo &FI =
403
7
      CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
404
7
  llvm::FunctionCallee cudaLaunchKernelFn =
405
7
      CGM.CreateRuntimeFunction(FTy, LaunchKernelName);
406
7
  CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(),
407
7
               LaunchKernelArgs);
408
7
  CGF.EmitBranch(EndBlock);
409
410
7
  CGF.EmitBlock(EndBlock);
411
7
}
412
413
void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
414
56
                                               FunctionArgList &Args) {
415
  // Emit a call to cudaSetupArgument for each arg in Args.
416
56
  llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
417
56
  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
418
56
  CharUnits Offset = CharUnits::Zero();
419
75
  for (const VarDecl *A : Args) {
420
75
    auto TInfo = CGM.getContext().getTypeInfoInChars(A->getType());
421
75
    Offset = Offset.alignTo(TInfo.Align);
422
75
    llvm::Value *Args[] = {
423
75
        CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(),
424
75
                                      VoidPtrTy),
425
75
        llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()),
426
75
        llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
427
75
    };
428
75
    llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args);
429
75
    llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0);
430
75
    llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero);
431
75
    llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next");
432
75
    CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock);
433
75
    CGF.EmitBlock(NextBlock);
434
75
    Offset += TInfo.Width;
435
75
  }
436
437
  // Emit the call to cudaLaunch
438
56
  llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
439
56
  llvm::Value *Arg =
440
56
      CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn], CharPtrTy);
441
56
  CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
442
56
  CGF.EmitBranch(EndBlock);
443
444
56
  CGF.EmitBlock(EndBlock);
445
56
}
446
447
// Replace the original variable Var with the address loaded from variable
448
// ManagedVar populated by HIP runtime.
449
static void replaceManagedVar(llvm::GlobalVariable *Var,
450
34
                              llvm::GlobalVariable *ManagedVar) {
451
34
  SmallVector<SmallVector<llvm::User *, 8>, 8> WorkList;
452
40
  for (auto &&VarUse : Var->uses()) {
453
40
    WorkList.push_back({VarUse.getUser()});
454
40
  }
455
128
  while (!WorkList.empty()) {
456
94
    auto &&WorkItem = WorkList.pop_back_val();
457
94
    auto *U = WorkItem.back();
458
94
    if (isa<llvm::ConstantExpr>(U)) {
459
54
      for (auto &&UU : U->uses()) {
460
54
        WorkItem.push_back(UU.getUser());
461
54
        WorkList.push_back(WorkItem);
462
54
        WorkItem.pop_back();
463
54
      }
464
48
      continue;
465
48
    }
466
46
    if (auto *I = dyn_cast<llvm::Instruction>(U)) {
467
46
      llvm::Value *OldV = Var;
468
46
      llvm::Instruction *NewV =
469
46
          new llvm::LoadInst(Var->getType(), ManagedVar, "ld.managed", false,
470
46
                             llvm::Align(Var->getAlignment()), I);
471
46
      WorkItem.pop_back();
472
      // Replace constant expressions directly or indirectly using the managed
473
      // variable with instructions.
474
48
      for (auto &&Op : WorkItem) {
475
48
        auto *CE = cast<llvm::ConstantExpr>(Op);
476
48
        auto *NewInst = llvm::createReplacementInstr(CE, I);
477
48
        NewInst->replaceUsesOfWith(OldV, NewV);
478
48
        OldV = CE;
479
48
        NewV = NewInst;
480
48
      }
481
46
      I->replaceUsesOfWith(OldV, NewV);
482
46
    } else {
483
0
      llvm_unreachable("Invalid use of managed variable");
484
0
    }
485
46
  }
486
34
}
487
488
/// Creates a function that sets up state on the host side for CUDA objects that
489
/// have a presence on both the host and device sides. Specifically, registers
490
/// the host side of kernel functions and device global variables with the CUDA
491
/// runtime.
492
/// \code
493
/// void __cuda_register_globals(void** GpuBinaryHandle) {
494
///    __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...);
495
///    ...
496
///    __cudaRegisterFunction(GpuBinaryHandle,KernelM,...);
497
///    __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...);
498
///    ...
499
///    __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...);
500
/// }
501
/// \endcode
502
42
llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
503
  // No need to register anything
504
42
  if (EmittedKernels.empty() && 
DeviceVars.empty()6
)
505
0
    return nullptr;
506
507
42
  llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
508
42
      getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
509
42
      addUnderscoredPrefixToName("_register_globals"), &TheModule);
510
42
  llvm::BasicBlock *EntryBB =
511
42
      llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
512
42
  CGBuilderTy Builder(CGM, Context);
513
42
  Builder.SetInsertPoint(EntryBB);
514
515
  // void __cudaRegisterFunction(void **, const char *, char *, const char *,
516
  //                             int, uint3*, uint3*, dim3*, dim3*, int*)
517
42
  llvm::Type *RegisterFuncParams[] = {
518
42
      VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy,
519
42
      VoidPtrTy,    VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()};
520
42
  llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction(
521
42
      llvm::FunctionType::get(IntTy, RegisterFuncParams, false),
522
42
      addUnderscoredPrefixToName("RegisterFunction"));
523
524
  // Extract GpuBinaryHandle passed as the first argument passed to
525
  // __cuda_register_globals() and generate __cudaRegisterFunction() call for
526
  // each emitted kernel.
527
42
  llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
528
52
  for (auto &&I : EmittedKernels) {
529
52
    llvm::Constant *KernelName =
530
52
        makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D)));
531
52
    llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy);
532
52
    llvm::Value *Args[] = {
533
52
        &GpuBinaryHandlePtr,
534
52
        Builder.CreateBitCast(KernelHandles[I.Kernel], VoidPtrTy),
535
52
        KernelName,
536
52
        KernelName,
537
52
        llvm::ConstantInt::get(IntTy, -1),
538
52
        NullPtr,
539
52
        NullPtr,
540
52
        NullPtr,
541
52
        NullPtr,
542
52
        llvm::ConstantPointerNull::get(IntTy->getPointerTo())};
543
52
    Builder.CreateCall(RegisterFunc, Args);
544
52
  }
545
546
42
  llvm::Type *VarSizeTy = IntTy;
547
  // For HIP or CUDA 9.0+, device variable size is type of `size_t`.
548
42
  if (CGM.getLangOpts().HIP ||
549
42
      
ToCudaVersion(CGM.getTarget().getSDKVersion()) >= CudaVersion::CUDA_9011
)
550
35
    VarSizeTy = SizeTy;
551
552
  // void __cudaRegisterVar(void **, char *, char *, const char *,
553
  //                        int, int, int, int)
554
42
  llvm::Type *RegisterVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
555
42
                                     CharPtrTy,    IntTy,     VarSizeTy,
556
42
                                     IntTy,        IntTy};
557
42
  llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction(
558
42
      llvm::FunctionType::get(VoidTy, RegisterVarParams, false),
559
42
      addUnderscoredPrefixToName("RegisterVar"));
560
  // void __hipRegisterManagedVar(void **, char *, char *, const char *,
561
  //                              size_t, unsigned)
562
42
  llvm::Type *RegisterManagedVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
563
42
                                            CharPtrTy,    VarSizeTy, IntTy};
564
42
  llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction(
565
42
      llvm::FunctionType::get(VoidTy, RegisterManagedVarParams, false),
566
42
      addUnderscoredPrefixToName("RegisterManagedVar"));
567
  // void __cudaRegisterSurface(void **, const struct surfaceReference *,
568
  //                            const void **, const char *, int, int);
569
42
  llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction(
570
42
      llvm::FunctionType::get(
571
42
          VoidTy, {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy},
572
42
          false),
573
42
      addUnderscoredPrefixToName("RegisterSurface"));
574
  // void __cudaRegisterTexture(void **, const struct textureReference *,
575
  //                            const void **, const char *, int, int, int)
576
42
  llvm::FunctionCallee RegisterTex = CGM.CreateRuntimeFunction(
577
42
      llvm::FunctionType::get(
578
42
          VoidTy,
579
42
          {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy, IntTy},
580
42
          false),
581
42
      addUnderscoredPrefixToName("RegisterTexture"));
582
113
  for (auto &&Info : DeviceVars) {
583
113
    llvm::GlobalVariable *Var = Info.Var;
584
113
    assert((!Var->isDeclaration() || Info.Flags.isManaged()) &&
585
113
           "External variables should not show up here, except HIP managed "
586
113
           "variables");
587
0
    llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D));
588
113
    switch (Info.Flags.getKind()) {
589
110
    case DeviceVarFlags::Variable: {
590
110
      uint64_t VarSize =
591
110
          CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
592
110
      if (Info.Flags.isManaged()) {
593
16
        auto ManagedVar = new llvm::GlobalVariable(
594
16
            CGM.getModule(), Var->getType(),
595
16
            /*isConstant=*/false, Var->getLinkage(),
596
16
            /*Init=*/Var->isDeclaration()
597
16
                ? 
nullptr4
598
16
                : 
llvm::ConstantPointerNull::get(Var->getType())12
,
599
16
            /*Name=*/"", /*InsertBefore=*/nullptr,
600
16
            llvm::GlobalVariable::NotThreadLocal);
601
16
        ManagedVar->setDSOLocal(Var->isDSOLocal());
602
16
        ManagedVar->setVisibility(Var->getVisibility());
603
16
        ManagedVar->setExternallyInitialized(true);
604
16
        ManagedVar->takeName(Var);
605
16
        Var->setName(Twine(ManagedVar->getName() + ".managed"));
606
16
        replaceManagedVar(Var, ManagedVar);
607
16
        llvm::Value *Args[] = {
608
16
            &GpuBinaryHandlePtr,
609
16
            Builder.CreateBitCast(ManagedVar, VoidPtrTy),
610
16
            Builder.CreateBitCast(Var, VoidPtrTy),
611
16
            VarName,
612
16
            llvm::ConstantInt::get(VarSizeTy, VarSize),
613
16
            llvm::ConstantInt::get(IntTy, Var->getAlignment())};
614
16
        if (!Var->isDeclaration())
615
12
          Builder.CreateCall(RegisterManagedVar, Args);
616
94
      } else {
617
94
        llvm::Value *Args[] = {
618
94
            &GpuBinaryHandlePtr,
619
94
            Builder.CreateBitCast(Var, VoidPtrTy),
620
94
            VarName,
621
94
            VarName,
622
94
            llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
623
94
            llvm::ConstantInt::get(VarSizeTy, VarSize),
624
94
            llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()),
625
94
            llvm::ConstantInt::get(IntTy, 0)};
626
94
        Builder.CreateCall(RegisterVar, Args);
627
94
      }
628
110
      break;
629
0
    }
630
1
    case DeviceVarFlags::Surface:
631
1
      Builder.CreateCall(
632
1
          RegisterSurf,
633
1
          {&GpuBinaryHandlePtr, Builder.CreateBitCast(Var, VoidPtrTy), VarName,
634
1
           VarName, llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
635
1
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
636
1
      break;
637
2
    case DeviceVarFlags::Texture:
638
2
      Builder.CreateCall(
639
2
          RegisterTex,
640
2
          {&GpuBinaryHandlePtr, Builder.CreateBitCast(Var, VoidPtrTy), VarName,
641
2
           VarName, llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
642
2
           llvm::ConstantInt::get(IntTy, Info.Flags.isNormalized()),
643
2
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
644
2
      break;
645
113
    }
646
113
  }
647
648
42
  Builder.CreateRetVoid();
649
42
  return RegisterKernelsFunc;
650
42
}
651
652
/// Creates a global constructor function for the module:
653
///
654
/// For CUDA:
655
/// \code
656
/// void __cuda_module_ctor(void*) {
657
///     Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
658
///     __cuda_register_globals(Handle);
659
/// }
660
/// \endcode
661
///
662
/// For HIP:
663
/// \code
664
/// void __hip_module_ctor(void*) {
665
///     if (__hip_gpubin_handle == 0) {
666
///         __hip_gpubin_handle  = __hipRegisterFatBinary(GpuBinaryBlob);
667
///         __hip_register_globals(__hip_gpubin_handle);
668
///     }
669
/// }
670
/// \endcode
671
66
llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
  bool IsHIP = CGM.getLangOpts().HIP;
  bool IsCUDA = CGM.getLangOpts().CUDA;
  // No need to generate ctors/dtors if there is no GPU binary.
  StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
  if (CudaGpuBinaryFileName.empty() && !IsHIP)
    return nullptr;
  // Likewise skip the ctor when there is nothing to register: no kernel stubs
  // and no device variables (HIP, or CUDA without relocatable device code).
  if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() &&
      DeviceVars.empty())
    return nullptr;

  // void __{cuda|hip}_register_globals(void* handle);
  llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
  // We always need a function to pass in as callback. Create a dummy
  // implementation if we don't need to register anything.
  if (RelocatableDeviceCode && !RegisterGlobalsFunc)
    RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());

  // void ** __{cuda|hip}RegisterFatBinary(void *);
  llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),
      addUnderscoredPrefixToName("RegisterFatBinary"));
  // struct { int magic, int version, void * gpu_binary, void * dont_care };
  llvm::StructType *FatbinWrapperTy =
      llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy);

  // Register GPU binary with the CUDA runtime, store returned handle in a
  // global variable and save a reference in GpuBinaryHandle to be cleaned up
  // in destructor on exit. Then associate all known kernels with the GPU binary
  // handle so CUDA runtime can figure out what to call on the GPU side.
  std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr;
  if (!CudaGpuBinaryFileName.empty()) {
    // Load the device code blob produced by the device-side compilation.
    llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CudaGpuBinaryOrErr =
        llvm::MemoryBuffer::getFileOrSTDIN(CudaGpuBinaryFileName);
    if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
      CGM.getDiags().Report(diag::err_cannot_open_file)
          << CudaGpuBinaryFileName << EC.message();
      return nullptr;
    }
    CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
  }

  llvm::Function *ModuleCtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_ctor"), &TheModule);
  llvm::BasicBlock *CtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc);
  CGBuilderTy CtorBuilder(CGM, Context);

  CtorBuilder.SetInsertPoint(CtorEntryBB);

  // Section names and fatbin magic differ between CUDA and HIP; pick the
  // right set before emitting the wrapper struct below.
  const char *FatbinConstantName;
  const char *FatbinSectionName;
  const char *ModuleIDSectionName;
  StringRef ModuleIDPrefix;
  llvm::Constant *FatBinStr;
  unsigned FatMagic;
  if (IsHIP) {
    FatbinConstantName = ".hip_fatbin";
    FatbinSectionName = ".hipFatBinSegment";

    ModuleIDSectionName = "__hip_module_id";
    ModuleIDPrefix = "__hip_";

    if (CudaGpuBinary) {
      // If fatbin is available from early finalization, create a string
      // literal containing the fat binary loaded from the given file.
      const unsigned HIPCodeObjectAlign = 4096;
      FatBinStr =
          makeConstantString(std::string(CudaGpuBinary->getBuffer()), "",
                             FatbinConstantName, HIPCodeObjectAlign);
    } else {
      // If fatbin is not available, create an external symbol
      // __hip_fatbin in section .hip_fatbin. The external symbol is supposed
      // to contain the fat binary but will be populated somewhere else,
      // e.g. by lld through link script.
      FatBinStr = new llvm::GlobalVariable(
        CGM.getModule(), CGM.Int8Ty,
        /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
        "__hip_fatbin", nullptr,
        llvm::GlobalVariable::NotThreadLocal);
      cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
    }

    FatMagic = HIPFatMagic;
  } else {
    if (RelocatableDeviceCode)
      FatbinConstantName = CGM.getTriple().isMacOSX()
                               ? "__NV_CUDA,__nv_relfatbin"
                               : "__nv_relfatbin";
    else
      FatbinConstantName =
          CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
    // NVIDIA's cuobjdump looks for fatbins in this section.
    FatbinSectionName =
        CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";

    ModuleIDSectionName = CGM.getTriple().isMacOSX()
                              ? "__NV_CUDA,__nv_module_id"
                              : "__nv_module_id";
    ModuleIDPrefix = "__nv_";

    // For CUDA, create a string literal containing the fat binary loaded from
    // the given file.
    // NOTE: CudaGpuBinary is non-null here — the CUDA path returned early
    // above when CudaGpuBinaryFileName was empty.
    FatBinStr = makeConstantString(std::string(CudaGpuBinary->getBuffer()), "",
                                   FatbinConstantName, 8);
    FatMagic = CudaFatMagic;
  }

  // Create initialized wrapper structure that points to the loaded GPU binary
  ConstantInitBuilder Builder(CGM);
  auto Values = Builder.beginStruct(FatbinWrapperTy);
  // Fatbin wrapper magic.
  Values.addInt(IntTy, FatMagic);
  // Fatbin version.
  Values.addInt(IntTy, 1);
  // Data.
  Values.add(FatBinStr);
  // Unused in fatbin v1.
  Values.add(llvm::ConstantPointerNull::get(VoidPtrTy));
  llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
      addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(),
      /*constant*/ true);
  FatbinWrapper->setSection(FatbinSectionName);

  // There is only one HIP fat binary per linked module, however there are
  // multiple constructor functions. Make sure the fat binary is registered
  // only once. The constructor functions are executed by the dynamic loader
  // before the program gains control. The dynamic loader cannot execute the
  // constructor functions concurrently since doing that would not guarantee
  // thread safety of the loaded program. Therefore we can assume sequential
  // execution of constructor functions here.
  if (IsHIP) {
    // Internal linkage when the fatbin is embedded in this TU; link-once
    // otherwise so all TUs share a single handle populated by the linker.
    auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage :
        llvm::GlobalValue::LinkOnceAnyLinkage;
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc);
    // The name, size, and initialization pattern of this variable is part
    // of HIP ABI.
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, VoidPtrPtrTy, /*isConstant=*/false,
        Linkage,
        /*Initializer=*/llvm::ConstantPointerNull::get(VoidPtrPtrTy),
        "__hip_gpubin_handle");
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
    // Prevent the weak symbol in different shared libraries being merged.
    if (Linkage != llvm::GlobalValue::InternalLinkage)
      GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
    Address GpuBinaryAddr(
        GpuBinaryHandle,
        CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
    {
      // Emit: if (__hip_gpubin_handle == 0) goto if; else goto exit;
      auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
      llvm::Constant *Zero =
          llvm::Constant::getNullValue(HandleValue->getType());
      llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero);
      CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(IfBlock);
      // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper);
      llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
          RegisterFatbinFunc,
          CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
      CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
      CtorBuilder.CreateBr(ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(ExitBlock);
      // Call __hip_register_globals(GpuBinaryHandle);
      if (RegisterGlobalsFunc) {
        auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
        CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
      }
    }
  } else if (!RelocatableDeviceCode) {
    // Register binary with CUDA runtime. This is substantially different in
    // default mode vs. separate compilation!
    // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
    llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
        RegisterFatbinFunc,
        CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
        llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
    CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
                                   CGM.getPointerAlign());

    // Call __cuda_register_globals(GpuBinaryHandle);
    if (RegisterGlobalsFunc)
      CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);

    // Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it.
    if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                           CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
      // void __cudaRegisterFatBinaryEnd(void **);
      llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction(
          llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
          "__cudaRegisterFatBinaryEnd");
      CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
    }
  } else {
    // CUDA with relocatable device code: registration is deferred to a
    // runtime-generated __cudaRegisterLinkedBinary<ID> entry point.
    // Generate a unique module ID.
    SmallString<64> ModuleID;
    llvm::raw_svector_ostream OS(ModuleID);
    OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID());
    llvm::Constant *ModuleIDConstant = makeConstantString(
        std::string(ModuleID.str()), "", ModuleIDSectionName, 32);

    // Create an alias for the FatbinWrapper that nvcc will look for.
    llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
                              Twine("__fatbinwrap") + ModuleID, FatbinWrapper);

    // void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
    // void *, void (*)(void **))
    SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary");
    RegisterLinkedBinaryName += ModuleID;
    llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
        getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);

    assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
    llvm::Value *Args[] = {RegisterGlobalsFunc,
                           CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy),
                           ModuleIDConstant,
                           makeDummyFunction(getCallbackFnTy())};
    CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
  }

  // Create destructor and register it with atexit() the way NVCC does it. Doing
  // it during regular destructor phase worked in CUDA before 9.2 but results in
  // double-free in 9.2.
  if (llvm::Function *CleanupFn = makeModuleDtorFunction()) {
    // extern "C" int atexit(void (*f)(void));
    llvm::FunctionType *AtExitTy =
        llvm::FunctionType::get(IntTy, CleanupFn->getType(), false);
    llvm::FunctionCallee AtExitFunc =
        CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(),
                                  /*Local=*/true);
    CtorBuilder.CreateCall(AtExitFunc, CleanupFn);
  }

  CtorBuilder.CreateRetVoid();
  return ModuleCtorFunc;
}
919
920
/// Creates a global destructor function that unregisters the GPU code blob
921
/// registered by constructor.
922
///
923
/// For CUDA:
924
/// \code
925
/// void __cuda_module_dtor(void*) {
926
///     __cudaUnregisterFatBinary(Handle);
927
/// }
928
/// \endcode
929
///
930
/// For HIP:
931
/// \code
932
/// void __hip_module_dtor(void*) {
933
///     if (__hip_gpubin_handle) {
934
///         __hipUnregisterFatBinary(__hip_gpubin_handle);
935
///         __hip_gpubin_handle = 0;
936
///     }
937
/// }
938
/// \endcode
939
42
llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
  // No need for destructor if we don't have a handle to unregister.
  if (!GpuBinaryHandle)
    return nullptr;

  // void __cudaUnregisterFatBinary(void ** handle);
  llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
      addUnderscoredPrefixToName("UnregisterFatBinary"));

  llvm::Function *ModuleDtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_dtor"), &TheModule);

  llvm::BasicBlock *DtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc);
  CGBuilderTy DtorBuilder(CGM, Context);
  DtorBuilder.SetInsertPoint(DtorEntryBB);

  Address GpuBinaryAddr(GpuBinaryHandle, CharUnits::fromQuantity(
                                             GpuBinaryHandle->getAlignment()));
  auto HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
  // There is only one HIP fat binary per linked module, however there are
  // multiple destructor functions. Make sure the fat binary is unregistered
  // only once.
  if (CGM.getLangOpts().HIP) {
    // Emit: if (__hip_gpubin_handle != 0) { unregister; handle = 0; }
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc);
    llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType());
    llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero);
    DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);

    DtorBuilder.SetInsertPoint(IfBlock);
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
    // Clear the handle so later destructors see it as already unregistered.
    DtorBuilder.CreateStore(Zero, GpuBinaryAddr);
    DtorBuilder.CreateBr(ExitBlock);

    DtorBuilder.SetInsertPoint(ExitBlock);
  } else {
    // CUDA: the handle is internal to this TU, so unregister unconditionally.
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
  }
  DtorBuilder.CreateRetVoid();
  return ModuleDtorFunc;
}
986
987
226
/// Factory entry point: builds the NVIDIA CUDA / HIP flavor of the
/// CGCUDARuntime interface for the given module. Caller owns the result.
CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
  auto *Runtime = new CGNVCUDARuntime(CGM);
  return Runtime;
}
990
991
void CGNVCUDARuntime::internalizeDeviceSideVar(
992
208
    const VarDecl *D, llvm::GlobalValue::LinkageTypes &Linkage) {
993
  // For -fno-gpu-rdc, host-side shadows of external declarations of device-side
994
  // global variables become internal definitions. These have to be internal in
995
  // order to prevent name conflicts with global host variables with the same
996
  // name in a different TUs.
997
  //
998
  // For -fgpu-rdc, the shadow variables should not be internalized because
999
  // they may be accessed by different TU.
1000
208
  if (CGM.getLangOpts().GPURelocatableDeviceCode)
1001
48
    return;
1002
1003
  // __shared__ variables are odd. Shadows do get created, but
1004
  // they are not registered with the CUDA runtime, so they
1005
  // can't really be used to access their device-side
1006
  // counterparts. It's not clear yet whether it's nvcc's bug or
1007
  // a feature, but we've got to do the same for compatibility.
1008
160
  if (D->hasAttr<CUDADeviceAttr>() || 
D->hasAttr<CUDAConstantAttr>()93
||
1009
160
      
D->hasAttr<CUDASharedAttr>()52
||
1010
160
      
D->getType()->isCUDADeviceBuiltinSurfaceType()33
||
1011
160
      
D->getType()->isCUDADeviceBuiltinTextureType()32
) {
1012
130
    Linkage = llvm::GlobalValue::InternalLinkage;
1013
130
  }
1014
160
}
1015
1016
void CGNVCUDARuntime::handleVarRegistration(const VarDecl *D,
1017
479
                                            llvm::GlobalVariable &GV) {
1018
479
  if (D->hasAttr<CUDADeviceAttr>() || 
D->hasAttr<CUDAConstantAttr>()258
) {
1019
    // Shadow variables and their properties must be registered with CUDA
1020
    // runtime. Skip Extern global variables, which will be registered in
1021
    // the TU where they are defined.
1022
    //
1023
    // Don't register a C++17 inline variable. The local symbol can be
1024
    // discarded and referencing a discarded local symbol from outside the
1025
    // comdat (__cuda_register_globals) is disallowed by the ELF spec.
1026
    //
1027
    // HIP managed variables need to be always recorded in device and host
1028
    // compilations for transformation.
1029
    //
1030
    // HIP managed variables and variables in CUDADeviceVarODRUsedByHost are
1031
    // added to llvm.compiler-used, therefore they are safe to be registered.
1032
369
    if ((!D->hasExternalStorage() && 
!D->isInline()361
) ||
1033
369
        
CGM.getContext().CUDADeviceVarODRUsedByHost.contains(D)24
||
1034
369
        
D->hasAttr<HIPManagedAttr>()14
) {
1035
363
      registerDeviceVar(D, GV, !D->hasDefinition(),
1036
363
                        D->hasAttr<CUDAConstantAttr>());
1037
363
    }
1038
369
  } else 
if (110
D->getType()->isCUDADeviceBuiltinSurfaceType()110
||
1039
110
             
D->getType()->isCUDADeviceBuiltinTextureType()108
) {
1040
    // Builtin surfaces and textures and their template arguments are
1041
    // also registered with CUDA runtime.
1042
6
    const auto *TD = cast<ClassTemplateSpecializationDecl>(
1043
6
        D->getType()->castAs<RecordType>()->getDecl());
1044
6
    const TemplateArgumentList &Args = TD->getTemplateArgs();
1045
6
    if (TD->hasAttr<CUDADeviceBuiltinSurfaceTypeAttr>()) {
1046
2
      assert(Args.size() == 2 &&
1047
2
             "Unexpected number of template arguments of CUDA device "
1048
2
             "builtin surface type.");
1049
0
      auto SurfType = Args[1].getAsIntegral();
1050
2
      if (!D->hasExternalStorage())
1051
2
        registerDeviceSurf(D, GV, !D->hasDefinition(), SurfType.getSExtValue());
1052
4
    } else {
1053
4
      assert(Args.size() == 3 &&
1054
4
             "Unexpected number of template arguments of CUDA device "
1055
4
             "builtin texture type.");
1056
0
      auto TexType = Args[1].getAsIntegral();
1057
4
      auto Normalized = Args[2].getAsIntegral();
1058
4
      if (!D->hasExternalStorage())
1059
4
        registerDeviceTex(D, GV, !D->hasDefinition(), TexType.getSExtValue(),
1060
4
                          Normalized.getZExtValue());
1061
4
    }
1062
6
  }
1063
479
}
1064
1065
// Transform managed variables to pointers to managed variables in device code.
1066
// Each use of the original managed variable is replaced by a load from the
1067
// transformed managed variable. The transformed managed variable contains
1068
// the address of managed memory which will be allocated by the runtime.
1069
143
void CGNVCUDARuntime::transformManagedVars() {
1070
217
  for (auto &&Info : DeviceVars) {
1071
217
    llvm::GlobalVariable *Var = Info.Var;
1072
217
    if (Info.Flags.getKind() == DeviceVarFlags::Variable &&
1073
217
        
Info.Flags.isManaged()214
) {
1074
18
      auto ManagedVar = new llvm::GlobalVariable(
1075
18
          CGM.getModule(), Var->getType(),
1076
18
          /*isConstant=*/false, Var->getLinkage(),
1077
18
          /*Init=*/Var->isDeclaration()
1078
18
              ? 
nullptr4
1079
18
              : 
llvm::ConstantPointerNull::get(Var->getType())14
,
1080
18
          /*Name=*/"", /*InsertBefore=*/nullptr,
1081
18
          llvm::GlobalVariable::NotThreadLocal,
1082
18
          CGM.getContext().getTargetAddressSpace(LangAS::cuda_device));
1083
18
      ManagedVar->setDSOLocal(Var->isDSOLocal());
1084
18
      ManagedVar->setVisibility(Var->getVisibility());
1085
18
      ManagedVar->setExternallyInitialized(true);
1086
18
      replaceManagedVar(Var, ManagedVar);
1087
18
      ManagedVar->takeName(Var);
1088
18
      Var->setName(Twine(ManagedVar->getName()) + ".managed");
1089
      // Keep managed variables even if they are not used in device code since
1090
      // they need to be allocated by the runtime.
1091
18
      if (!Var->isDeclaration()) {
1092
14
        assert(!ManagedVar->isDeclaration());
1093
0
        CGM.addCompilerUsedGlobal(Var);
1094
14
        CGM.addCompilerUsedGlobal(ManagedVar);
1095
14
      }
1096
18
    }
1097
217
  }
1098
143
}
1099
1100
// Returns module constructor to be added.
1101
209
// Returns module constructor to be added.
llvm::Function *CGNVCUDARuntime::finalizeModule() {
  // Host compilation: emit the module constructor and we are done.
  if (!CGM.getLangOpts().CUDAIsDevice)
    return makeModuleCtorFunction();

  transformManagedVars();

  // Mark ODR-used device variables as compiler used to prevent it from being
  // eliminated by optimization. This is necessary for device variables
  // ODR-used by host functions. Sema correctly marks them as ODR-used no
  // matter whether they are ODR-used by device or host functions.
  //
  // We do not need to do this if the variable has used attribute since it
  // has already been added.
  //
  // Static device variables have been externalized at this point, therefore
  // variables with LLVM private or internal linkage need not be added.
  for (auto &&Info : DeviceVars) {
    auto Kind = Info.Flags.getKind();
    if (Info.Var->isDeclaration())
      continue;
    if (llvm::GlobalValue::isLocalLinkage(Info.Var->getLinkage()))
      continue;
    const bool IsRegisterableKind = Kind == DeviceVarFlags::Variable ||
                                    Kind == DeviceVarFlags::Surface ||
                                    Kind == DeviceVarFlags::Texture;
    if (!IsRegisterableKind)
      continue;
    if (!Info.D->isUsed() || Info.D->hasAttr<UsedAttr>())
      continue;
    CGM.addCompilerUsedGlobal(Info.Var);
  }
  return nullptr;
}
1130
1131
llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F,
                                                    GlobalDecl GD) {
  // Reuse a handle created on a previous call for the same stub.
  auto Existing = KernelHandles.find(F);
  if (Existing != KernelHandles.end())
    return Existing->second;

  // CUDA: the stub function itself acts as the kernel handle.
  if (!CGM.getLangOpts().HIP) {
    KernelHandles[F] = F;
    KernelStubs[F] = F;
    return F;
  }

  // HIP: create a constant global variable as the handle, named with the
  // kernel-reference mangling and mirroring the stub's linkage, locality,
  // and visibility.
  auto *HandleVar = new llvm::GlobalVariable(
      TheModule, F->getType(), /*isConstant=*/true, F->getLinkage(),
      /*Initializer=*/nullptr,
      CGM.getMangledName(
          GD.getWithKernelReferenceKind(KernelReferenceKind::Kernel)));
  HandleVar->setAlignment(CGM.getPointerAlign().getAsAlign());
  HandleVar->setDSOLocal(F->isDSOLocal());
  HandleVar->setVisibility(F->getVisibility());
  KernelHandles[F] = HandleVar;
  KernelStubs[HandleVar] = F;
  return HandleVar;
}