Coverage Report

Created: 2022-07-16 07:03

/Users/buildslave/jenkins/workspace/coverage/llvm-project/clang/lib/CodeGen/CGCUDANV.cpp

//===----- CGCUDANV.cpp - Interface to NVIDIA CUDA Runtime ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This provides a class for CUDA code generation targeting the NVIDIA CUDA
// runtime library.
//
//===----------------------------------------------------------------------===//

#include "CGCUDARuntime.h"
#include "CGCXXABI.h"
#include "CodeGenFunction.h"
#include "CodeGenModule.h"
#include "clang/AST/Decl.h"
#include "clang/Basic/Cuda.h"
#include "clang/CodeGen/CodeGenABITypes.h"
#include "clang/CodeGen/ConstantInitBuilder.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/ReplaceConstant.h"
#include "llvm/Support/Format.h"

using namespace clang;
using namespace CodeGen;

namespace {
constexpr unsigned CudaFatMagic = 0x466243b1;
constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"

class CGNVCUDARuntime : public CGCUDARuntime {

private:
  llvm::IntegerType *IntTy, *SizeTy;
  llvm::Type *VoidTy;
  llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy;

  /// Convenience reference to LLVM Context
  llvm::LLVMContext &Context;
  /// Convenience reference to the current module
  llvm::Module &TheModule;
  /// Keeps track of kernel launch stubs and handles emitted in this module
  struct KernelInfo {
    llvm::Function *Kernel; // stub function to help launch kernel
    const Decl *D;
  };
  llvm::SmallVector<KernelInfo, 16> EmittedKernels;
  // Map a device stub function to a symbol for identifying kernel in host code.
  // For CUDA, the symbol for identifying the kernel is the same as the device
  // stub function. For HIP, they are different.
  llvm::DenseMap<llvm::Function *, llvm::GlobalValue *> KernelHandles;
  // Map a kernel handle to the kernel stub.
  llvm::DenseMap<llvm::GlobalValue *, llvm::Function *> KernelStubs;
  struct VarInfo {
    llvm::GlobalVariable *Var;
    const VarDecl *D;
    DeviceVarFlags Flags;
  };
  llvm::SmallVector<VarInfo, 16> DeviceVars;
  /// Keeps track of variable containing handle of GPU binary. Populated by
  /// ModuleCtorFunction() and used to create corresponding cleanup calls in
  /// ModuleDtorFunction()
  llvm::GlobalVariable *GpuBinaryHandle = nullptr;
  /// Whether we generate relocatable device code.
  bool RelocatableDeviceCode;
  /// Mangle context for device.
  std::unique_ptr<MangleContext> DeviceMC;

  llvm::FunctionCallee getSetupArgumentFn() const;
  llvm::FunctionCallee getLaunchFn() const;

  llvm::FunctionType *getRegisterGlobalsFnTy() const;
  llvm::FunctionType *getCallbackFnTy() const;
  llvm::FunctionType *getRegisterLinkedBinaryFnTy() const;
  std::string addPrefixToName(StringRef FuncName) const;
  std::string addUnderscoredPrefixToName(StringRef FuncName) const;

  /// Creates a function to register all kernel stubs generated in this module.
  llvm::Function *makeRegisterGlobalsFn();

  /// Helper function that generates a constant string and returns a pointer to
  /// the start of the string.  The result of this function can be used anywhere
  /// where the C code specifies const char*.
  llvm::Constant *makeConstantString(const std::string &Str,
                                     const std::string &Name = "",
                                     const std::string &SectionName = "",
                                     unsigned Alignment = 0) {
    llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0),
                               llvm::ConstantInt::get(SizeTy, 0)};
    auto ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str());
    llvm::GlobalVariable *GV =
        cast<llvm::GlobalVariable>(ConstStr.getPointer());
    if (!SectionName.empty()) {
      GV->setSection(SectionName);
      // Mark the address as used, which makes sure that this section isn't
      // merged and we will really have it in the object file.
      GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None);
    }
    if (Alignment)
      GV->setAlignment(llvm::Align(Alignment));

    return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(),
                                                ConstStr.getPointer(), Zeros);
  }

  /// Helper function that generates an empty dummy function returning void.
  llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
    assert(FnTy->getReturnType()->isVoidTy() &&
           "Can only generate dummy functions returning void!");
    llvm::Function *DummyFunc = llvm::Function::Create(
        FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule);

    llvm::BasicBlock *DummyBlock =
        llvm::BasicBlock::Create(Context, "", DummyFunc);
    CGBuilderTy FuncBuilder(CGM, Context);
    FuncBuilder.SetInsertPoint(DummyBlock);
    FuncBuilder.CreateRetVoid();

    return DummyFunc;
  }

  void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
  void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
  std::string getDeviceSideName(const NamedDecl *ND) override;

  void registerDeviceVar(const VarDecl *VD, llvm::GlobalVariable &Var,
                         bool Extern, bool Constant) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Variable, Extern, Constant,
                           VD->hasAttr<HIPManagedAttr>(),
                           /*Normalized*/ false, 0}});
  }
  void registerDeviceSurf(const VarDecl *VD, llvm::GlobalVariable &Var,
                          bool Extern, int Type) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Surface, Extern, /*Constant*/ false,
                           /*Managed*/ false,
                           /*Normalized*/ false, Type}});
  }
  void registerDeviceTex(const VarDecl *VD, llvm::GlobalVariable &Var,
                         bool Extern, int Type, bool Normalized) {
    DeviceVars.push_back({&Var,
                          VD,
                          {DeviceVarFlags::Texture, Extern, /*Constant*/ false,
                           /*Managed*/ false, Normalized, Type}});
  }

  /// Creates module constructor function
  llvm::Function *makeModuleCtorFunction();
  /// Creates module destructor function
  llvm::Function *makeModuleDtorFunction();
  /// Transform managed variables for device compilation.
  void transformManagedVars();
  /// Create offloading entries to register globals in RDC mode.
  void createOffloadingEntries();

public:
  CGNVCUDARuntime(CodeGenModule &CGM);

  llvm::GlobalValue *getKernelHandle(llvm::Function *F, GlobalDecl GD) override;
  llvm::Function *getKernelStub(llvm::GlobalValue *Handle) override {
    auto Loc = KernelStubs.find(Handle);
    assert(Loc != KernelStubs.end());
    return Loc->second;
  }
  void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
  void handleVarRegistration(const VarDecl *VD,
                             llvm::GlobalVariable &Var) override;
  void
  internalizeDeviceSideVar(const VarDecl *D,
                           llvm::GlobalValue::LinkageTypes &Linkage) override;

  llvm::Function *finalizeModule() override;
};

} // end anonymous namespace

std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
  if (CGM.getLangOpts().HIP)
    return ((Twine("hip") + Twine(FuncName)).str());
  return ((Twine("cuda") + Twine(FuncName)).str());
}
std::string
CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
  if (CGM.getLangOpts().HIP)
    return ((Twine("__hip") + Twine(FuncName)).str());
  return ((Twine("__cuda") + Twine(FuncName)).str());
}
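
// Example (illustrative, not part of this file): with these helpers,
// addPrefixToName("LaunchKernel") yields "cudaLaunchKernel" when compiling
// CUDA and "hipLaunchKernel" when compiling HIP, and
// addUnderscoredPrefixToName("RegisterFatBinary") yields
// "__cudaRegisterFatBinary" or "__hipRegisterFatBinary", so the rest of the
// file can name runtime entry points without branching on the language at
// every call site.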

static std::unique_ptr<MangleContext> InitDeviceMC(CodeGenModule &CGM) {
  // If the host and device have different C++ ABIs, mark it as the device
  // mangle context so that the mangling needs to retrieve the additional
  // device lambda mangling number instead of the regular host one.
  if (CGM.getContext().getAuxTargetInfo() &&
      CGM.getContext().getTargetInfo().getCXXABI().isMicrosoft() &&
      CGM.getContext().getAuxTargetInfo()->getCXXABI().isItaniumFamily()) {
    return std::unique_ptr<MangleContext>(
        CGM.getContext().createDeviceMangleContext(
            *CGM.getContext().getAuxTargetInfo()));
  }

  return std::unique_ptr<MangleContext>(CGM.getContext().createMangleContext(
      CGM.getContext().getAuxTargetInfo()));
}

CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
    : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
      TheModule(CGM.getModule()),
      RelocatableDeviceCode(CGM.getLangOpts().GPURelocatableDeviceCode),
      DeviceMC(InitDeviceMC(CGM)) {
  CodeGen::CodeGenTypes &Types = CGM.getTypes();
  ASTContext &Ctx = CGM.getContext();

  IntTy = CGM.IntTy;
  SizeTy = CGM.SizeTy;
  VoidTy = CGM.VoidTy;

  CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy));
  VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy));
  VoidPtrPtrTy = VoidPtrTy->getPointerTo();
}

llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const {
  // cudaError_t cudaSetupArgument(void *, size_t, size_t)
  llvm::Type *Params[] = {VoidPtrTy, SizeTy, SizeTy};
  return CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, Params, false),
      addPrefixToName("SetupArgument"));
}

llvm::FunctionCallee CGNVCUDARuntime::getLaunchFn() const {
  if (CGM.getLangOpts().HIP) {
    // hipError_t hipLaunchByPtr(char *);
    return CGM.CreateRuntimeFunction(
        llvm::FunctionType::get(IntTy, CharPtrTy, false), "hipLaunchByPtr");
  }
  // cudaError_t cudaLaunch(char *);
  return CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch");
}

llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const {
  return llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false);
}

llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const {
  return llvm::FunctionType::get(VoidTy, VoidPtrTy, false);
}

llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const {
  auto *CallbackFnTy = getCallbackFnTy();
  auto *RegisterGlobalsFnTy = getRegisterGlobalsFnTy();
  llvm::Type *Params[] = {RegisterGlobalsFnTy->getPointerTo(), VoidPtrTy,
                          VoidPtrTy, CallbackFnTy->getPointerTo()};
  return llvm::FunctionType::get(VoidTy, Params, false);
}

std::string CGNVCUDARuntime::getDeviceSideName(const NamedDecl *ND) {
  GlobalDecl GD;
  // D could be either a kernel or a variable.
  if (auto *FD = dyn_cast<FunctionDecl>(ND))
    GD = GlobalDecl(FD, KernelReferenceKind::Kernel);
  else
    GD = GlobalDecl(ND);
  std::string DeviceSideName;
  MangleContext *MC;
  if (CGM.getLangOpts().CUDAIsDevice)
    MC = &CGM.getCXXABI().getMangleContext();
  else
    MC = DeviceMC.get();
  if (MC->shouldMangleDeclName(ND)) {
    SmallString<256> Buffer;
    llvm::raw_svector_ostream Out(Buffer);
    MC->mangleName(GD, Out);
    DeviceSideName = std::string(Out.str());
  } else
    DeviceSideName = std::string(ND->getIdentifier()->getName());

  // Make unique name for device side static file-scope variable for HIP.
  if (CGM.getContext().shouldExternalize(ND) &&
      CGM.getLangOpts().GPURelocatableDeviceCode) {
    SmallString<256> Buffer;
    llvm::raw_svector_ostream Out(Buffer);
    Out << DeviceSideName;
    CGM.printPostfixForExternalizedDecl(Out, ND);
    DeviceSideName = std::string(Out.str());
  }
  return DeviceSideName;
}

void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
                                     FunctionArgList &Args) {
  EmittedKernels.push_back({CGF.CurFn, CGF.CurFuncDecl});
  if (auto *GV = dyn_cast<llvm::GlobalVariable>(KernelHandles[CGF.CurFn])) {
    GV->setLinkage(CGF.CurFn->getLinkage());
    GV->setInitializer(CGF.CurFn);
  }
  if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                         CudaFeature::CUDA_USES_NEW_LAUNCH) ||
      (CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI))
    emitDeviceStubBodyNew(CGF, Args);
  else
    emitDeviceStubBodyLegacy(CGF, Args);
}

// CUDA 9.0+ uses a new way to launch kernels. Parameters are packed in a local
// array and kernels are launched using cudaLaunchKernel().
void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
                                            FunctionArgList &Args) {
  // Build the shadow stack entry at the very start of the function.

  // Calculate amount of space we will need for all arguments.  If we have no
  // args, allocate a single pointer so we still have a valid pointer to the
  // argument array that we can pass to runtime, even if it will be unused.
  Address KernelArgs = CGF.CreateTempAlloca(
      VoidPtrTy, CharUnits::fromQuantity(16), "kernel_args",
      llvm::ConstantInt::get(SizeTy, std::max<size_t>(1, Args.size())));
  // Store pointers to the arguments in a locally allocated launch_args.
  for (unsigned i = 0; i < Args.size(); ++i) {
    llvm::Value *VarPtr = CGF.GetAddrOfLocalVar(Args[i]).getPointer();
    llvm::Value *VoidVarPtr = CGF.Builder.CreatePointerCast(VarPtr, VoidPtrTy);
    CGF.Builder.CreateDefaultAlignedStore(
        VoidVarPtr,
        CGF.Builder.CreateConstGEP1_32(VoidPtrTy, KernelArgs.getPointer(), i));
  }

  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");

  // Lookup cudaLaunchKernel/hipLaunchKernel function.
  // HIP kernel launching API name depends on -fgpu-default-stream option. For
  // the default value 'legacy', it is hipLaunchKernel. For 'per-thread',
  // it is hipLaunchKernel_spt.
  // cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
  //                              void **args, size_t sharedMem,
  //                              cudaStream_t stream);
  // hipError_t hipLaunchKernel[_spt](const void *func, dim3 gridDim,
  //                                  dim3 blockDim, void **args,
  //                                  size_t sharedMem, hipStream_t stream);
  TranslationUnitDecl *TUDecl = CGM.getContext().getTranslationUnitDecl();
  DeclContext *DC = TranslationUnitDecl::castToDeclContext(TUDecl);
  std::string KernelLaunchAPI = "LaunchKernel";
  if (CGF.getLangOpts().HIP && CGF.getLangOpts().GPUDefaultStream ==
                                   LangOptions::GPUDefaultStreamKind::PerThread)
    KernelLaunchAPI = KernelLaunchAPI + "_spt";
  auto LaunchKernelName = addPrefixToName(KernelLaunchAPI);
  IdentifierInfo &cudaLaunchKernelII =
      CGM.getContext().Idents.get(LaunchKernelName);
  FunctionDecl *cudaLaunchKernelFD = nullptr;
  for (auto *Result : DC->lookup(&cudaLaunchKernelII)) {
    if (FunctionDecl *FD = dyn_cast<FunctionDecl>(Result))
      cudaLaunchKernelFD = FD;
  }

  if (cudaLaunchKernelFD == nullptr) {
    CGM.Error(CGF.CurFuncDecl->getLocation(),
              "Can't find declaration for " + LaunchKernelName);
    return;
  }
  // Create temporary dim3 grid_dim, block_dim.
  ParmVarDecl *GridDimParam = cudaLaunchKernelFD->getParamDecl(1);
  QualType Dim3Ty = GridDimParam->getType();
  Address GridDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "grid_dim");
  Address BlockDim =
      CGF.CreateMemTemp(Dim3Ty, CharUnits::fromQuantity(8), "block_dim");
  Address ShmemSize =
      CGF.CreateTempAlloca(SizeTy, CGM.getSizeAlign(), "shmem_size");
  Address Stream =
      CGF.CreateTempAlloca(VoidPtrTy, CGM.getPointerAlign(), "stream");
  llvm::FunctionCallee cudaPopConfigFn = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy,
                              {/*gridDim=*/GridDim.getType(),
                               /*blockDim=*/BlockDim.getType(),
                               /*ShmemSize=*/ShmemSize.getType(),
                               /*Stream=*/Stream.getType()},
                              /*isVarArg=*/false),
      addUnderscoredPrefixToName("PopCallConfiguration"));

  CGF.EmitRuntimeCallOrInvoke(cudaPopConfigFn,
                              {GridDim.getPointer(), BlockDim.getPointer(),
                               ShmemSize.getPointer(), Stream.getPointer()});

  // Emit the call to cudaLaunch
  llvm::Value *Kernel =
      CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn], VoidPtrTy);
  CallArgList LaunchKernelArgs;
  LaunchKernelArgs.add(RValue::get(Kernel),
                       cudaLaunchKernelFD->getParamDecl(0)->getType());
  LaunchKernelArgs.add(RValue::getAggregate(GridDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::getAggregate(BlockDim), Dim3Ty);
  LaunchKernelArgs.add(RValue::get(KernelArgs.getPointer()),
                       cudaLaunchKernelFD->getParamDecl(3)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(ShmemSize)),
                       cudaLaunchKernelFD->getParamDecl(4)->getType());
  LaunchKernelArgs.add(RValue::get(CGF.Builder.CreateLoad(Stream)),
                       cudaLaunchKernelFD->getParamDecl(5)->getType());

  QualType QT = cudaLaunchKernelFD->getType();
  QualType CQT = QT.getCanonicalType();
  llvm::Type *Ty = CGM.getTypes().ConvertType(CQT);
  llvm::FunctionType *FTy = cast<llvm::FunctionType>(Ty);

  const CGFunctionInfo &FI =
      CGM.getTypes().arrangeFunctionDeclaration(cudaLaunchKernelFD);
  llvm::FunctionCallee cudaLaunchKernelFn =
      CGM.CreateRuntimeFunction(FTy, LaunchKernelName);
  CGF.EmitCall(FI, CGCallee::forDirect(cudaLaunchKernelFn), ReturnValueSlot(),
               LaunchKernelArgs);
  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}
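
// Example (illustrative, not part of this file; the kernel and its arguments
// are hypothetical): for a translation unit such as
//
//   __global__ void axpy(float a, float *x, float *y);
//   void host_launch(float a, float *x, float *y) {
//     axpy<<<dim3(128), dim3(256)>>>(a, x, y);
//   }
//
// the stub body emitted above behaves roughly like this hand-written C++:
//
//   void axpy_stub(float a, float *x, float *y) {
//     void *kernel_args[3] = {&a, &x, &y};
//     dim3 grid_dim, block_dim; size_t shmem_size; cudaStream_t stream;
//     __cudaPopCallConfiguration(&grid_dim, &block_dim, &shmem_size, &stream);
//     cudaLaunchKernel((const void *)kernel_handle, grid_dim, block_dim,
//                      kernel_args, shmem_size, stream);
//   }
//
// where kernel_handle is the stub itself for CUDA and the separate handle
// variable for HIP (see getKernelHandle() below).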

void CGNVCUDARuntime::emitDeviceStubBodyLegacy(CodeGenFunction &CGF,
                                               FunctionArgList &Args) {
  // Emit a call to cudaSetupArgument for each arg in Args.
  llvm::FunctionCallee cudaSetupArgFn = getSetupArgumentFn();
  llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
  CharUnits Offset = CharUnits::Zero();
  for (const VarDecl *A : Args) {
    auto TInfo = CGM.getContext().getTypeInfoInChars(A->getType());
    Offset = Offset.alignTo(TInfo.Align);
    llvm::Value *Args[] = {
        CGF.Builder.CreatePointerCast(CGF.GetAddrOfLocalVar(A).getPointer(),
                                      VoidPtrTy),
        llvm::ConstantInt::get(SizeTy, TInfo.Width.getQuantity()),
        llvm::ConstantInt::get(SizeTy, Offset.getQuantity()),
    };
    llvm::CallBase *CB = CGF.EmitRuntimeCallOrInvoke(cudaSetupArgFn, Args);
    llvm::Constant *Zero = llvm::ConstantInt::get(IntTy, 0);
    llvm::Value *CBZero = CGF.Builder.CreateICmpEQ(CB, Zero);
    llvm::BasicBlock *NextBlock = CGF.createBasicBlock("setup.next");
    CGF.Builder.CreateCondBr(CBZero, NextBlock, EndBlock);
    CGF.EmitBlock(NextBlock);
    Offset += TInfo.Width;
  }

  // Emit the call to cudaLaunch
  llvm::FunctionCallee cudaLaunchFn = getLaunchFn();
  llvm::Value *Arg =
      CGF.Builder.CreatePointerCast(KernelHandles[CGF.CurFn], CharPtrTy);
  CGF.EmitRuntimeCallOrInvoke(cudaLaunchFn, Arg);
  CGF.EmitBranch(EndBlock);

  CGF.EmitBlock(EndBlock);
}
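
// Example (illustrative, not part of this file): with the legacy API, the
// same hypothetical axpy stub becomes a chain of cudaSetupArgument calls,
// one per argument at its naturally aligned offset in the parameter buffer,
// each guarded on success, followed by cudaLaunch on the kernel handle:
//
//   if (cudaSetupArgument(&a, sizeof(a), 0) == 0)
//     if (cudaSetupArgument(&x, sizeof(x), 8) == 0)
//       if (cudaSetupArgument(&y, sizeof(y), 16) == 0)
//         cudaLaunch((char *)kernel_handle);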

// Replace the original variable Var with the address loaded from variable
// ManagedVar populated by HIP runtime.
static void replaceManagedVar(llvm::GlobalVariable *Var,
                              llvm::GlobalVariable *ManagedVar) {
  SmallVector<SmallVector<llvm::User *, 8>, 8> WorkList;
  for (auto &&VarUse : Var->uses()) {
    WorkList.push_back({VarUse.getUser()});
  }
  while (!WorkList.empty()) {
    auto &&WorkItem = WorkList.pop_back_val();
    auto *U = WorkItem.back();
    if (isa<llvm::ConstantExpr>(U)) {
      for (auto &&UU : U->uses()) {
        WorkItem.push_back(UU.getUser());
        WorkList.push_back(WorkItem);
        WorkItem.pop_back();
      }
      continue;
    }
    if (auto *I = dyn_cast<llvm::Instruction>(U)) {
      llvm::Value *OldV = Var;
      llvm::Instruction *NewV =
          new llvm::LoadInst(Var->getType(), ManagedVar, "ld.managed", false,
                             llvm::Align(Var->getAlignment()), I);
      WorkItem.pop_back();
      // Replace constant expressions directly or indirectly using the managed
      // variable with instructions.
      for (auto &&Op : WorkItem) {
        auto *CE = cast<llvm::ConstantExpr>(Op);
        auto *NewInst = CE->getAsInstruction(I);
        NewInst->replaceUsesOfWith(OldV, NewV);
        OldV = CE;
        NewV = NewInst;
      }
      I->replaceUsesOfWith(OldV, NewV);
    } else {
      llvm_unreachable("Invalid use of managed variable");
    }
  }
}
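
// Example (illustrative, not part of this file): given a hypothetical managed
// variable whose storage global is @x.managed and whose runtime-populated
// pointer global is @x, a use such as
//
//   %v = load i32, i32* @x.managed
//
// is rewritten to go through the pointer instead:
//
//   %ld.managed = load i32*, i32** @x
//   %v = load i32, i32* %ld.managed
//
// Constant expressions that use the variable (e.g. a GEP into it) are first
// rematerialized as instructions so the load can be threaded through them.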

/// Creates a function that sets up state on the host side for CUDA objects that
/// have a presence on both the host and device sides. Specifically, registers
/// the host side of kernel functions and device global variables with the CUDA
/// runtime.
/// \code
/// void __cuda_register_globals(void** GpuBinaryHandle) {
///    __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...);
///    ...
///    __cudaRegisterFunction(GpuBinaryHandle,KernelM,...);
///    __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...);
///    ...
///    __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...);
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() {
  // No need to register anything
  if (EmittedKernels.empty() && DeviceVars.empty())
    return nullptr;

  llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
      getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_register_globals"), &TheModule);
  llvm::BasicBlock *EntryBB =
      llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
  CGBuilderTy Builder(CGM, Context);
  Builder.SetInsertPoint(EntryBB);

  // void __cudaRegisterFunction(void **, const char *, char *, const char *,
  //                             int, uint3*, uint3*, dim3*, dim3*, int*)
  llvm::Type *RegisterFuncParams[] = {
      VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy,
      VoidPtrTy,    VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()};
  llvm::FunctionCallee RegisterFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(IntTy, RegisterFuncParams, false),
      addUnderscoredPrefixToName("RegisterFunction"));

  // Extract GpuBinaryHandle passed as the first argument to
  // __cuda_register_globals() and generate a __cudaRegisterFunction() call for
  // each emitted kernel.
  llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
  for (auto &&I : EmittedKernels) {
    llvm::Constant *KernelName =
        makeConstantString(getDeviceSideName(cast<NamedDecl>(I.D)));
    llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy);
    llvm::Value *Args[] = {
        &GpuBinaryHandlePtr,
        Builder.CreateBitCast(KernelHandles[I.Kernel], VoidPtrTy),
        KernelName,
        KernelName,
        llvm::ConstantInt::get(IntTy, -1),
        NullPtr,
        NullPtr,
        NullPtr,
        NullPtr,
        llvm::ConstantPointerNull::get(IntTy->getPointerTo())};
    Builder.CreateCall(RegisterFunc, Args);
  }

  llvm::Type *VarSizeTy = IntTy;
  // For HIP or CUDA 9.0+, device variable size has type `size_t`.
  if (CGM.getLangOpts().HIP ||
      ToCudaVersion(CGM.getTarget().getSDKVersion()) >= CudaVersion::CUDA_90)
    VarSizeTy = SizeTy;

  // void __cudaRegisterVar(void **, char *, char *, const char *,
  //                        int, int, int, int)
  llvm::Type *RegisterVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
                                     CharPtrTy,    IntTy,     VarSizeTy,
                                     IntTy,        IntTy};
  llvm::FunctionCallee RegisterVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, RegisterVarParams, false),
      addUnderscoredPrefixToName("RegisterVar"));
  // void __hipRegisterManagedVar(void **, char *, char *, const char *,
  //                              size_t, unsigned)
  llvm::Type *RegisterManagedVarParams[] = {VoidPtrPtrTy, CharPtrTy, CharPtrTy,
                                            CharPtrTy,    VarSizeTy, IntTy};
  llvm::FunctionCallee RegisterManagedVar = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, RegisterManagedVarParams, false),
      addUnderscoredPrefixToName("RegisterManagedVar"));
  // void __cudaRegisterSurface(void **, const struct surfaceReference *,
  //                            const void **, const char *, int, int);
  llvm::FunctionCallee RegisterSurf = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(
          VoidTy, {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy},
          false),
      addUnderscoredPrefixToName("RegisterSurface"));
  // void __cudaRegisterTexture(void **, const struct textureReference *,
  //                            const void **, const char *, int, int, int)
  llvm::FunctionCallee RegisterTex = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(
          VoidTy,
          {VoidPtrPtrTy, VoidPtrTy, CharPtrTy, CharPtrTy, IntTy, IntTy, IntTy},
          false),
      addUnderscoredPrefixToName("RegisterTexture"));
  for (auto &&Info : DeviceVars) {
    llvm::GlobalVariable *Var = Info.Var;
    assert((!Var->isDeclaration() || Info.Flags.isManaged()) &&
           "External variables should not show up here, except HIP managed "
           "variables");
    llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D));
    switch (Info.Flags.getKind()) {
    case DeviceVarFlags::Variable: {
      uint64_t VarSize =
          CGM.getDataLayout().getTypeAllocSize(Var->getValueType());
      if (Info.Flags.isManaged()) {
        auto *ManagedVar = new llvm::GlobalVariable(
            CGM.getModule(), Var->getType(),
            /*isConstant=*/false, Var->getLinkage(),
            /*Init=*/Var->isDeclaration()
                ? nullptr
                : llvm::ConstantPointerNull::get(Var->getType()),
            /*Name=*/"", /*InsertBefore=*/nullptr,
            llvm::GlobalVariable::NotThreadLocal);
        ManagedVar->setDSOLocal(Var->isDSOLocal());
        ManagedVar->setVisibility(Var->getVisibility());
        ManagedVar->setExternallyInitialized(true);
        ManagedVar->takeName(Var);
        Var->setName(Twine(ManagedVar->getName() + ".managed"));
        replaceManagedVar(Var, ManagedVar);
        llvm::Value *Args[] = {
            &GpuBinaryHandlePtr,
            Builder.CreateBitCast(ManagedVar, VoidPtrTy),
            Builder.CreateBitCast(Var, VoidPtrTy),
            VarName,
            llvm::ConstantInt::get(VarSizeTy, VarSize),
            llvm::ConstantInt::get(IntTy, Var->getAlignment())};
        if (!Var->isDeclaration())
          Builder.CreateCall(RegisterManagedVar, Args);
      } else {
        llvm::Value *Args[] = {
            &GpuBinaryHandlePtr,
            Builder.CreateBitCast(Var, VoidPtrTy),
            VarName,
            VarName,
            llvm::ConstantInt::get(IntTy, Info.Flags.isExtern()),
            llvm::ConstantInt::get(VarSizeTy, VarSize),
            llvm::ConstantInt::get(IntTy, Info.Flags.isConstant()),
            llvm::ConstantInt::get(IntTy, 0)};
        Builder.CreateCall(RegisterVar, Args);
      }
      break;
    }
    case DeviceVarFlags::Surface:
      Builder.CreateCall(
          RegisterSurf,
          {&GpuBinaryHandlePtr, Builder.CreateBitCast(Var, VoidPtrTy), VarName,
           VarName, llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
      break;
    case DeviceVarFlags::Texture:
      Builder.CreateCall(
          RegisterTex,
          {&GpuBinaryHandlePtr, Builder.CreateBitCast(Var, VoidPtrTy), VarName,
           VarName, llvm::ConstantInt::get(IntTy, Info.Flags.getSurfTexType()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isNormalized()),
           llvm::ConstantInt::get(IntTy, Info.Flags.isExtern())});
      break;
    }
  }

  Builder.CreateRetVoid();
  return RegisterKernelsFunc;
}
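
// Example (illustrative, not part of this file; names are hypothetical): for
// one kernel "axpy" and one non-extern __device__ int "counter", the body
// built above is roughly equivalent to:
//
//   void __cuda_register_globals(void **handle) {
//     __cudaRegisterFunction(handle, (char *)axpy_handle, "axpy", "axpy",
//                            -1, nullptr, nullptr, nullptr, nullptr, nullptr);
//     __cudaRegisterVar(handle, (char *)&counter_shadow, "counter", "counter",
//                       /*extern=*/0, sizeof(int), /*constant=*/0, 0);
//   }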

/// Creates a global constructor function for the module:
///
/// For CUDA:
/// \code
/// void __cuda_module_ctor() {
///     Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
///     __cuda_register_globals(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
/// void __hip_module_ctor() {
///     if (__hip_gpubin_handle == 0) {
///         __hip_gpubin_handle = __hipRegisterFatBinary(GpuBinaryBlob);
///         __hip_register_globals(__hip_gpubin_handle);
///     }
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
  bool IsHIP = CGM.getLangOpts().HIP;
  bool IsCUDA = CGM.getLangOpts().CUDA;
  // No need to generate ctors/dtors if there is no GPU binary.
  StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
  if (CudaGpuBinaryFileName.empty() && !IsHIP)
    return nullptr;
  if ((IsHIP || (IsCUDA && !RelocatableDeviceCode)) && EmittedKernels.empty() &&
      DeviceVars.empty())
    return nullptr;

  // void __{cuda|hip}_register_globals(void* handle);
  llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
  // We always need a function to pass in as callback. Create a dummy
  // implementation if we don't need to register anything.
  if (RelocatableDeviceCode && !RegisterGlobalsFunc)
    RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy());

  // void ** __{cuda|hip}RegisterFatBinary(void *);
  llvm::FunctionCallee RegisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),
      addUnderscoredPrefixToName("RegisterFatBinary"));
  // struct { int magic, int version, void * gpu_binary, void * dont_care };
  llvm::StructType *FatbinWrapperTy =
      llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy);

  // Register GPU binary with the CUDA runtime, store returned handle in a
  // global variable and save a reference in GpuBinaryHandle to be cleaned up
  // in destructor on exit. Then associate all known kernels with the GPU binary
  // handle so CUDA runtime can figure out what to call on the GPU side.
  std::unique_ptr<llvm::MemoryBuffer> CudaGpuBinary = nullptr;
  if (!CudaGpuBinaryFileName.empty()) {
    llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> CudaGpuBinaryOrErr =
        llvm::MemoryBuffer::getFileOrSTDIN(CudaGpuBinaryFileName);
    if (std::error_code EC = CudaGpuBinaryOrErr.getError()) {
      CGM.getDiags().Report(diag::err_cannot_open_file)
          << CudaGpuBinaryFileName << EC.message();
      return nullptr;
    }
    CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get());
  }

  llvm::Function *ModuleCtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_ctor"), &TheModule);
  llvm::BasicBlock *CtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc);
  CGBuilderTy CtorBuilder(CGM, Context);

  CtorBuilder.SetInsertPoint(CtorEntryBB);

  const char *FatbinConstantName;
  const char *FatbinSectionName;
  const char *ModuleIDSectionName;
  StringRef ModuleIDPrefix;
  llvm::Constant *FatBinStr;
  unsigned FatMagic;
  if (IsHIP) {
    FatbinConstantName = ".hip_fatbin";
    FatbinSectionName = ".hipFatBinSegment";

    ModuleIDSectionName = "__hip_module_id";
    ModuleIDPrefix = "__hip_";

    if (CudaGpuBinary) {
      // If fatbin is available from early finalization, create a string
      // literal containing the fat binary loaded from the given file.
      const unsigned HIPCodeObjectAlign = 4096;
      FatBinStr =
          makeConstantString(std::string(CudaGpuBinary->getBuffer()), "",
                             FatbinConstantName, HIPCodeObjectAlign);
    } else {
      // If fatbin is not available, create an external symbol
      // __hip_fatbin in section .hip_fatbin. The external symbol is supposed
      // to contain the fat binary but will be populated somewhere else,
      // e.g. by lld through link script.
      FatBinStr = new llvm::GlobalVariable(
          CGM.getModule(), CGM.Int8Ty,
          /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr,
          "__hip_fatbin", nullptr,
          llvm::GlobalVariable::NotThreadLocal);
      cast<llvm::GlobalVariable>(FatBinStr)->setSection(FatbinConstantName);
    }

    FatMagic = HIPFatMagic;
  } else {
    if (RelocatableDeviceCode)
      FatbinConstantName = CGM.getTriple().isMacOSX()
                               ? "__NV_CUDA,__nv_relfatbin"
                               : "__nv_relfatbin";
    else
      FatbinConstantName =
          CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
    // NVIDIA's cuobjdump looks for fatbins in this section.
    FatbinSectionName =
        CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";

    ModuleIDSectionName = CGM.getTriple().isMacOSX()
                              ? "__NV_CUDA,__nv_module_id"
                              : "__nv_module_id";
    ModuleIDPrefix = "__nv_";

    // For CUDA, create a string literal containing the fat binary loaded from
    // the given file.
    FatBinStr = makeConstantString(std::string(CudaGpuBinary->getBuffer()), "",
                                   FatbinConstantName, 8);
    FatMagic = CudaFatMagic;
  }

  // Create initialized wrapper structure that points to the loaded GPU binary
  ConstantInitBuilder Builder(CGM);
  auto Values = Builder.beginStruct(FatbinWrapperTy);
  // Fatbin wrapper magic.
  Values.addInt(IntTy, FatMagic);
  // Fatbin version.
  Values.addInt(IntTy, 1);
  // Data.
  Values.add(FatBinStr);
  // Unused in fatbin v1.
  Values.add(llvm::ConstantPointerNull::get(VoidPtrTy));
  llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
      addUnderscoredPrefixToName("_fatbin_wrapper"), CGM.getPointerAlign(),
      /*constant*/ true);
  FatbinWrapper->setSection(FatbinSectionName);

  // There is only one HIP fat binary per linked module, however there are
  // multiple constructor functions. Make sure the fat binary is registered
  // only once. The constructor functions are executed by the dynamic loader
  // before the program gains control. The dynamic loader cannot execute the
  // constructor functions concurrently since doing that would not guarantee
  // thread safety of the loaded program. Therefore we can assume sequential
  // execution of constructor functions here.
  if (IsHIP) {
    auto Linkage = CudaGpuBinary ? llvm::GlobalValue::InternalLinkage
                                 : llvm::GlobalValue::LinkOnceAnyLinkage;
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc);
    // The name, size, and initialization pattern of this variable is part
    // of HIP ABI.
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, VoidPtrPtrTy, /*isConstant=*/false,
        Linkage,
        /*Initializer=*/llvm::ConstantPointerNull::get(VoidPtrPtrTy),
        "__hip_gpubin_handle");
    if (Linkage == llvm::GlobalValue::LinkOnceAnyLinkage)
      GpuBinaryHandle->setComdat(
          CGM.getModule().getOrInsertComdat(GpuBinaryHandle->getName()));
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
    // Prevent the weak symbol in different shared libraries being merged.
    if (Linkage != llvm::GlobalValue::InternalLinkage)
      GpuBinaryHandle->setVisibility(llvm::GlobalValue::HiddenVisibility);
    Address GpuBinaryAddr(
        GpuBinaryHandle, VoidPtrPtrTy,
        CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
    {
      auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
      llvm::Constant *Zero =
          llvm::Constant::getNullValue(HandleValue->getType());
      llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero);
      CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(IfBlock);
      // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper);
      llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
          RegisterFatbinFunc,
          CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
      CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
      CtorBuilder.CreateBr(ExitBlock);
    }
    {
      CtorBuilder.SetInsertPoint(ExitBlock);
      // Call __hip_register_globals(GpuBinaryHandle);
      if (RegisterGlobalsFunc) {
        auto *HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
        CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
      }
    }
  } else if (!RelocatableDeviceCode) {
    // Register binary with CUDA runtime. This is substantially different in
    // default mode vs. separate compilation!
    // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
    llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
        RegisterFatbinFunc,
        CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
    GpuBinaryHandle = new llvm::GlobalVariable(
        TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
        llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");
    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getAsAlign());
    CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
                                   CGM.getPointerAlign());

    // Call __cuda_register_globals(GpuBinaryHandle);
    if (RegisterGlobalsFunc)
      CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);

    // Call __cudaRegisterFatBinaryEnd(Handle) if this CUDA version needs it.
    if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
                           CudaFeature::CUDA_USES_FATBIN_REGISTER_END)) {
      // void __cudaRegisterFatBinaryEnd(void **);
      llvm::FunctionCallee RegisterFatbinEndFunc = CGM.CreateRuntimeFunction(
          llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
          "__cudaRegisterFatBinaryEnd");
      CtorBuilder.CreateCall(RegisterFatbinEndFunc, RegisterFatbinCall);
    }
  } else {
    // Generate a unique module ID.
    SmallString<64> ModuleID;
    llvm::raw_svector_ostream OS(ModuleID);
    OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID());
    llvm::Constant *ModuleIDConstant = makeConstantString(
        std::string(ModuleID.str()), "", ModuleIDSectionName, 32);

    // Create an alias for the FatbinWrapper that nvcc will look for.
    llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
                              Twine("__fatbinwrap") + ModuleID, FatbinWrapper);

    // void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
    // void *, void (*)(void **))
    SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary");
    RegisterLinkedBinaryName += ModuleID;
    llvm::FunctionCallee RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
        getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);

    assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
    llvm::Value *Args[] = {RegisterGlobalsFunc,
                           CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy),
                           ModuleIDConstant,
                           makeDummyFunction(getCallbackFnTy())};
    CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
  }

  // Create destructor and register it with atexit() the way NVCC does it.
  // Doing it during the regular destructor phase worked in CUDA before 9.2
  // but results in a double-free in 9.2.
  if (llvm::Function *CleanupFn = makeModuleDtorFunction()) {
    // extern "C" int atexit(void (*f)(void));
    llvm::FunctionType *AtExitTy =
        llvm::FunctionType::get(IntTy, CleanupFn->getType(), false);
    llvm::FunctionCallee AtExitFunc =
        CGM.CreateRuntimeFunction(AtExitTy, "atexit", llvm::AttributeList(),
                                  /*Local=*/true);
    CtorBuilder.CreateCall(AtExitFunc, CleanupFn);
  }

  CtorBuilder.CreateRetVoid();
  return ModuleCtorFunc;
}
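
// Example (illustrative, not part of this file): for HIP with an embedded fat
// binary, the globals and constructor built above correspond roughly to this
// hand-written C++ (blob and names are placeholders):
//
//   struct FatbinWrapper { unsigned magic, version; const void *blob, *unused; };
//   __attribute__((section(".hipFatBinSegment"))) static const FatbinWrapper
//       __hip_fatbin_wrapper = {0x48495046 /*"HIPF"*/, 1, fatbin_blob, nullptr};
//
//   static void **__hip_gpubin_handle;  // linkonce+hidden in the real output
//
//   static void __hip_module_ctor() {
//     if (!__hip_gpubin_handle)
//       __hip_gpubin_handle = __hipRegisterFatBinary(&__hip_fatbin_wrapper);
//     __hip_register_globals(__hip_gpubin_handle);
//     atexit(__hip_module_dtor);
//   }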

/// Creates a global destructor function that unregisters the GPU code blob
/// registered by constructor.
///
/// For CUDA:
/// \code
/// void __cuda_module_dtor() {
///     __cudaUnregisterFatBinary(Handle);
/// }
/// \endcode
///
/// For HIP:
/// \code
/// void __hip_module_dtor() {
///     if (__hip_gpubin_handle) {
///         __hipUnregisterFatBinary(__hip_gpubin_handle);
///         __hip_gpubin_handle = 0;
///     }
/// }
/// \endcode
llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
  // No need for destructor if we don't have a handle to unregister.
  if (!GpuBinaryHandle)
    return nullptr;

  // void __cudaUnregisterFatBinary(void ** handle);
  llvm::FunctionCallee UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
      llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
      addUnderscoredPrefixToName("UnregisterFatBinary"));

  llvm::Function *ModuleDtorFunc = llvm::Function::Create(
      llvm::FunctionType::get(VoidTy, false),
      llvm::GlobalValue::InternalLinkage,
      addUnderscoredPrefixToName("_module_dtor"), &TheModule);

  llvm::BasicBlock *DtorEntryBB =
      llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc);
  CGBuilderTy DtorBuilder(CGM, Context);
  DtorBuilder.SetInsertPoint(DtorEntryBB);

  Address GpuBinaryAddr(
      GpuBinaryHandle, GpuBinaryHandle->getValueType(),
      CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
  auto *HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
  // There is only one HIP fat binary per linked module, however there are
  // multiple destructor functions. Make sure the fat binary is unregistered
  // only once.
  if (CGM.getLangOpts().HIP) {
    llvm::BasicBlock *IfBlock =
        llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc);
    llvm::BasicBlock *ExitBlock =
        llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc);
    llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType());
    llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero);
    DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);

    DtorBuilder.SetInsertPoint(IfBlock);
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
    DtorBuilder.CreateStore(Zero, GpuBinaryAddr);
    DtorBuilder.CreateBr(ExitBlock);

    DtorBuilder.SetInsertPoint(ExitBlock);
  } else {
    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
  }
  DtorBuilder.CreateRetVoid();
  return ModuleDtorFunc;
}

CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
  return new CGNVCUDARuntime(CGM);
}

void CGNVCUDARuntime::internalizeDeviceSideVar(
    const VarDecl *D, llvm::GlobalValue::LinkageTypes &Linkage) {
  // For -fno-gpu-rdc, host-side shadows of external declarations of device-side
  // global variables become internal definitions. These have to be internal in
  // order to prevent name conflicts with global host variables with the same
  // name in different TUs.
  //
  // For -fgpu-rdc, the shadow variables should not be internalized because
  // they may be accessed by different TUs.
  if (CGM.getLangOpts().GPURelocatableDeviceCode)
    return;

  // __shared__ variables are odd. Shadows do get created, but
  // they are not registered with the CUDA runtime, so they
  // can't really be used to access their device-side
  // counterparts. It's not clear yet whether it's nvcc's bug or
  // a feature, but we've got to do the same for compatibility.
  if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>() ||
      D->hasAttr<CUDASharedAttr>() ||
      D->getType()->isCUDADeviceBuiltinSurfaceType() ||
      D->getType()->isCUDADeviceBuiltinTextureType()) {
    Linkage = llvm::GlobalValue::InternalLinkage;
  }
}

void CGNVCUDARuntime::handleVarRegistration(const VarDecl *D,
                                            llvm::GlobalVariable &GV) {
  if (D->hasAttr<CUDADeviceAttr>() || D->hasAttr<CUDAConstantAttr>()) {
    // Shadow variables and their properties must be registered with CUDA
    // runtime. Skip extern global variables, which will be registered in
    // the TU where they are defined.
    //
    // Don't register a C++17 inline variable. The local symbol can be
    // discarded and referencing a discarded local symbol from outside the
    // comdat (__cuda_register_globals) is disallowed by the ELF spec.
    //
    // HIP managed variables always need to be recorded in device and host
    // compilations for transformation.
    //
    // HIP managed variables and variables in CUDADeviceVarODRUsedByHost are
    // added to llvm.compiler-used, therefore they are safe to be registered.
    if ((!D->hasExternalStorage() && !D->isInline()) ||
        CGM.getContext().CUDADeviceVarODRUsedByHost.contains(D) ||
        D->hasAttr<HIPManagedAttr>()) {
      registerDeviceVar(D, GV, !D->hasDefinition(),
                        D->hasAttr<CUDAConstantAttr>());
    }
  } else if (D->getType()->isCUDADeviceBuiltinSurfaceType() ||
             D->getType()->isCUDADeviceBuiltinTextureType()) {
    // Builtin surfaces and textures and their template arguments are
    // also registered with CUDA runtime.
    const auto *TD = cast<ClassTemplateSpecializationDecl>(
        D->getType()->castAs<RecordType>()->getDecl());
    const TemplateArgumentList &Args = TD->getTemplateArgs();
    if (TD->hasAttr<CUDADeviceBuiltinSurfaceTypeAttr>()) {
      assert(Args.size() == 2 &&
             "Unexpected number of template arguments of CUDA device "
             "builtin surface type.");
      auto SurfType = Args[1].getAsIntegral();
      if (!D->hasExternalStorage())
        registerDeviceSurf(D, GV, !D->hasDefinition(), SurfType.getSExtValue());
    } else {
      assert(Args.size() == 3 &&
             "Unexpected number of template arguments of CUDA device "
             "builtin texture type.");
      auto TexType = Args[1].getAsIntegral();
      auto Normalized = Args[2].getAsIntegral();
      if (!D->hasExternalStorage())
        registerDeviceTex(D, GV, !D->hasDefinition(), TexType.getSExtValue(),
                          Normalized.getZExtValue());
    }
  }
}

// Transform managed variables to pointers to managed variables in device code.
// Each use of the original managed variable is replaced by a load from the
// transformed managed variable. The transformed managed variable contains
// the address of managed memory which will be allocated by the runtime.
void CGNVCUDARuntime::transformManagedVars() {
  for (auto &&Info : DeviceVars) {
    llvm::GlobalVariable *Var = Info.Var;
    if (Info.Flags.getKind() == DeviceVarFlags::Variable &&
        Info.Flags.isManaged()) {
      auto *ManagedVar = new llvm::GlobalVariable(
          CGM.getModule(), Var->getType(),
          /*isConstant=*/false, Var->getLinkage(),
          /*Init=*/Var->isDeclaration()
              ? nullptr
              : llvm::ConstantPointerNull::get(Var->getType()),
          /*Name=*/"", /*InsertBefore=*/nullptr,
          llvm::GlobalVariable::NotThreadLocal,
          CGM.getContext().getTargetAddressSpace(LangAS::cuda_device));
      ManagedVar->setDSOLocal(Var->isDSOLocal());
      ManagedVar->setVisibility(Var->getVisibility());
      ManagedVar->setExternallyInitialized(true);
      replaceManagedVar(Var, ManagedVar);
      ManagedVar->takeName(Var);
      Var->setName(Twine(ManagedVar->getName()) + ".managed");
      // Keep managed variables even if they are not used in device code since
      // they need to be allocated by the runtime.
      if (!Var->isDeclaration()) {
        assert(!ManagedVar->isDeclaration());
        CGM.addCompilerUsedGlobal(Var);
        CGM.addCompilerUsedGlobal(ManagedVar);
      }
    }
  }
}
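
// Example (illustrative, not part of this file): for a hypothetical
// `__managed__ int m;`, device compilation now carries two globals,
// approximately
//
//   @m = externally_initialized addrspace(1) global i32* null  ; set by runtime
//   @m.managed = addrspace(1) global i32 0                     ; original storage
//
// and each device-side use of m loads the pointer from @m (the rewrite done
// by replaceManagedVar() above) before touching the data.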

// Creates offloading entries for all the kernels and globals that must be
// registered. The linker will provide a pointer to this section so we can
// register the symbols with the linked device image.
void CGNVCUDARuntime::createOffloadingEntries() {
  llvm::OpenMPIRBuilder OMPBuilder(CGM.getModule());
  OMPBuilder.initialize();

  StringRef Section = CGM.getLangOpts().HIP ? "hip_offloading_entries"
                                            : "cuda_offloading_entries";
  for (KernelInfo &I : EmittedKernels)
    OMPBuilder.emitOffloadingEntry(KernelHandles[I.Kernel],
                                   getDeviceSideName(cast<NamedDecl>(I.D)), 0,
                                   DeviceVarFlags::OffloadGlobalEntry, Section);

  for (VarInfo &I : DeviceVars) {
    uint64_t VarSize =
        CGM.getDataLayout().getTypeAllocSize(I.Var->getValueType());
    if (I.Flags.getKind() == DeviceVarFlags::Variable) {
      OMPBuilder.emitOffloadingEntry(
          I.Var, getDeviceSideName(I.D), VarSize,
          I.Flags.isManaged() ? DeviceVarFlags::OffloadGlobalManagedEntry
                              : DeviceVarFlags::OffloadGlobalEntry,
          Section);
    } else if (I.Flags.getKind() == DeviceVarFlags::Surface) {
      OMPBuilder.emitOffloadingEntry(I.Var, getDeviceSideName(I.D), VarSize,
                                     DeviceVarFlags::OffloadGlobalSurfaceEntry,
                                     Section);
    } else if (I.Flags.getKind() == DeviceVarFlags::Texture) {
      OMPBuilder.emitOffloadingEntry(I.Var, getDeviceSideName(I.D), VarSize,
                                     DeviceVarFlags::OffloadGlobalTextureEntry,
                                     Section);
    }
  }
}
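
// Example (illustrative, not part of this file; the exact layout is an
// assumption based on the offloading runtime of this vintage): each call to
// emitOffloadingEntry() appends a descriptor shaped roughly like
//
//   struct __tgt_offload_entry {
//     void *addr;     // kernel handle or variable address
//     char *name;     // device-side (mangled) name
//     size_t size;    // 0 for kernels, alloc size for variables
//     int32_t flags;  // one of DeviceVarFlags::OffloadGlobal*Entry
//     int32_t reserved;
//   };
//
// to the named section, so the new-driver runtime can walk the section and
// register every symbol against the linked device image.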

// Returns module constructor to be added.
llvm::Function *CGNVCUDARuntime::finalizeModule() {
  if (CGM.getLangOpts().CUDAIsDevice) {
    transformManagedVars();

    // Mark ODR-used device variables as compiler used to prevent them from
    // being eliminated by optimization. This is necessary for device variables
    // ODR-used by host functions. Sema correctly marks them as ODR-used no
    // matter whether they are ODR-used by device or host functions.
    //
    // We do not need to do this if the variable has the used attribute since
    // it has already been added.
    //
    // Static device variables have been externalized at this point, therefore
    // variables with LLVM private or internal linkage need not be added.
    for (auto &&Info : DeviceVars) {
      auto Kind = Info.Flags.getKind();
      if (!Info.Var->isDeclaration() &&
          !llvm::GlobalValue::isLocalLinkage(Info.Var->getLinkage()) &&
          (Kind == DeviceVarFlags::Variable ||
           Kind == DeviceVarFlags::Surface ||
           Kind == DeviceVarFlags::Texture) &&
          Info.D->isUsed() && !Info.D->hasAttr<UsedAttr>()) {
        CGM.addCompilerUsedGlobal(Info.Var);
      }
    }
    return nullptr;
  }
  if (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode)
    createOffloadingEntries();
  else
    return makeModuleCtorFunction();

  return nullptr;
}

llvm::GlobalValue *CGNVCUDARuntime::getKernelHandle(llvm::Function *F,
                                                    GlobalDecl GD) {
  auto Loc = KernelHandles.find(F);
  if (Loc != KernelHandles.end())
    return Loc->second;

  if (!CGM.getLangOpts().HIP) {
    KernelHandles[F] = F;
    KernelStubs[F] = F;
    return F;
  }

  auto *Var = new llvm::GlobalVariable(
      TheModule, F->getType(), /*isConstant=*/true, F->getLinkage(),
      /*Initializer=*/nullptr,
      CGM.getMangledName(
          GD.getWithKernelReferenceKind(KernelReferenceKind::Kernel)));
  Var->setAlignment(CGM.getPointerAlign().getAsAlign());
  Var->setDSOLocal(F->isDSOLocal());
  Var->setVisibility(F->getVisibility());
  CGM.maybeSetTrivialComdat(*GD.getDecl(), *Var);
  KernelHandles[F] = Var;
  KernelStubs[Var] = F;
  return Var;
}
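
// Example (illustrative, not part of this file; names are hypothetical): for
// a kernel axpy, CUDA reuses the host stub as the kernel handle, while HIP
// emits a distinct constant global initialized with the stub:
//
//   ; CUDA: the stub function is also the handle
//   define void @_Z4axpyfPfS_(...)
//
//   ; HIP: the handle is a global pointing at the device stub
//   @_Z4axpyfPfS_ = constant void (...)* @_Z19__device_stub__axpyfPfS_
//   define void @_Z19__device_stub__axpyfPfS_(...)
//
// KernelHandles maps stub -> handle and KernelStubs maps handle -> stub, so
// getKernelStub() can recover the stub from a handle.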