Coverage Report

Created: 2019-07-24 05:18

/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
Line
Count
Source
1
//===-- AMDGPULowerKernelArguments.cpp ------------------------------------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
/// \file This pass replaces accesses to kernel arguments with loads from
10
/// offsets from the kernarg base pointer.
11
//
12
//===----------------------------------------------------------------------===//
13
14
#include "AMDGPU.h"
15
#include "AMDGPUSubtarget.h"
16
#include "AMDGPUTargetMachine.h"
17
#include "llvm/ADT/StringRef.h"
18
#include "llvm/Analysis/Loads.h"
19
#include "llvm/CodeGen/Passes.h"
20
#include "llvm/CodeGen/TargetPassConfig.h"
21
#include "llvm/IR/Attributes.h"
22
#include "llvm/IR/BasicBlock.h"
23
#include "llvm/IR/Constants.h"
24
#include "llvm/IR/DerivedTypes.h"
25
#include "llvm/IR/Function.h"
26
#include "llvm/IR/IRBuilder.h"
27
#include "llvm/IR/InstrTypes.h"
28
#include "llvm/IR/Instruction.h"
29
#include "llvm/IR/Instructions.h"
30
#include "llvm/IR/LLVMContext.h"
31
#include "llvm/IR/MDBuilder.h"
32
#include "llvm/IR/Metadata.h"
33
#include "llvm/IR/Operator.h"
34
#include "llvm/IR/Type.h"
35
#include "llvm/IR/Value.h"
36
#include "llvm/Pass.h"
37
#include "llvm/Support/Casting.h"
38
39
#define DEBUG_TYPE "amdgpu-lower-kernel-arguments"
40
41
using namespace llvm;
42
43
namespace {
44
45
class AMDGPULowerKernelArguments : public FunctionPass{
46
public:
47
  static char ID;
48
49
2.44k
  AMDGPULowerKernelArguments() : FunctionPass(ID) {}
50
51
  bool runOnFunction(Function &F) override;
52
53
2.42k
  void getAnalysisUsage(AnalysisUsage &AU) const override {
54
2.42k
    AU.addRequired<TargetPassConfig>();
55
2.42k
    AU.setPreservesAll();
56
2.42k
 }
57
};
58
59
} // end anonymous namespace
60
61
25.2k
bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
62
25.2k
  CallingConv::ID CC = F.getCallingConv();
63
25.2k
  if (CC != CallingConv::AMDGPU_KERNEL || 
F.arg_empty()19.9k
)
64
7.14k
    return false;
65
18.0k
66
18.0k
  auto &TPC = getAnalysis<TargetPassConfig>();
67
18.0k
68
18.0k
  const TargetMachine &TM = TPC.getTM<TargetMachine>();
69
18.0k
  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
70
18.0k
  LLVMContext &Ctx = F.getParent()->getContext();
71
18.0k
  const DataLayout &DL = F.getParent()->getDataLayout();
72
18.0k
  BasicBlock &EntryBlock = *F.begin();
73
18.0k
  IRBuilder<> Builder(&*EntryBlock.begin());
74
18.0k
75
18.0k
  const unsigned KernArgBaseAlign = 16; // FIXME: Increase if necessary
76
18.0k
  const uint64_t BaseOffset = ST.getExplicitKernelArgOffset(F);
77
18.0k
78
18.0k
  unsigned MaxAlign;
79
18.0k
  // FIXME: Alignment is broken with explicit arg offset.
80
18.0k
  const uint64_t TotalKernArgSize = ST.getKernArgSegmentSize(F, MaxAlign);
81
18.0k
  if (TotalKernArgSize == 0)
82
2
    return false;
83
18.0k
84
18.0k
  CallInst *KernArgSegment =
85
18.0k
      Builder.CreateIntrinsic(Intrinsic::amdgcn_kernarg_segment_ptr, {}, {},
86
18.0k
                              nullptr, F.getName() + ".kernarg.segment");
87
18.0k
88
18.0k
  KernArgSegment->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
89
18.0k
  KernArgSegment->addAttribute(AttributeList::ReturnIndex,
90
18.0k
    Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize));
91
18.0k
92
18.0k
  unsigned AS = KernArgSegment->getType()->getPointerAddressSpace();
93
18.0k
  uint64_t ExplicitArgOffset = 0;
94
18.0k
95
42.0k
  for (Argument &Arg : F.args()) {
96
42.0k
    Type *ArgTy = Arg.getType();
97
42.0k
    unsigned Align = DL.getABITypeAlignment(ArgTy);
98
42.0k
    unsigned Size = DL.getTypeSizeInBits(ArgTy);
99
42.0k
    unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
100
42.0k
101
42.0k
    uint64_t EltOffset = alignTo(ExplicitArgOffset, Align) + BaseOffset;
102
42.0k
    ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize;
103
42.0k
104
42.0k
    if (Arg.use_empty())
105
2.71k
      continue;
106
39.2k
107
39.2k
    if (PointerType *PT = dyn_cast<PointerType>(ArgTy)) {
108
28.1k
      // FIXME: Hack. We rely on AssertZext to be able to fold DS addressing
109
28.1k
      // modes on SI to know the high bits are 0 so pointer adds don't wrap. We
110
28.1k
      // can't represent this with range metadata because it's only allowed for
111
28.1k
      // integer types.
112
28.1k
      if ((PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
113
28.1k
           
PT->getAddressSpace() == AMDGPUAS::REGION_ADDRESS25.3k
) &&
114
28.1k
          
!ST.hasUsableDSOffset()2.84k
)
115
672
        continue;
116
27.4k
117
27.4k
      // FIXME: We can replace this with equivalent alias.scope/noalias
118
27.4k
      // metadata, but this appears to be a lot of work.
119
27.4k
      if (Arg.hasNoAliasAttr())
120
1.34k
        continue;
121
37.2k
    }
122
37.2k
123
37.2k
    VectorType *VT = dyn_cast<VectorType>(ArgTy);
124
37.2k
    bool IsV3 = VT && 
VT->getNumElements() == 31.59k
;
125
37.2k
    bool DoShiftOpt = Size < 32 && 
!ArgTy->isAggregateType()722
;
126
37.2k
127
37.2k
    VectorType *V4Ty = nullptr;
128
37.2k
129
37.2k
    int64_t AlignDownOffset = alignDown(EltOffset, 4);
130
37.2k
    int64_t OffsetDiff = EltOffset - AlignDownOffset;
131
37.2k
    unsigned AdjustedAlign = MinAlign(DoShiftOpt ? 
AlignDownOffset713
:
EltOffset36.5k
,
132
37.2k
                                      KernArgBaseAlign);
133
37.2k
134
37.2k
    Value *ArgPtr;
135
37.2k
    Type *AdjustedArgTy;
136
37.2k
    if (DoShiftOpt) { // FIXME: Handle aggregate types
137
713
      // Since we don't have sub-dword scalar loads, avoid doing an extload by
138
713
      // loading earlier than the argument address, and extracting the relevant
139
713
      // bits.
140
713
      //
141
713
      // Additionally widen any sub-dword load to i32 even if suitably aligned,
142
713
      // so that CSE between different argument loads works easily.
143
713
      ArgPtr = Builder.CreateConstInBoundsGEP1_64(
144
713
          Builder.getInt8Ty(), KernArgSegment, AlignDownOffset,
145
713
          Arg.getName() + ".kernarg.offset.align.down");
146
713
      AdjustedArgTy = Builder.getInt32Ty();
147
36.5k
    } else {
148
36.5k
      ArgPtr = Builder.CreateConstInBoundsGEP1_64(
149
36.5k
          Builder.getInt8Ty(), KernArgSegment, EltOffset,
150
36.5k
          Arg.getName() + ".kernarg.offset");
151
36.5k
      AdjustedArgTy = ArgTy;
152
36.5k
    }
153
37.2k
154
37.2k
    if (IsV3 && 
Size >= 32127
) {
155
112
      V4Ty = VectorType::get(VT->getVectorElementType(), 4);
156
112
      // Use the hack that clang uses to avoid SelectionDAG ruining v3 loads
157
112
      AdjustedArgTy = V4Ty;
158
112
    }
159
37.2k
160
37.2k
    ArgPtr = Builder.CreateBitCast(ArgPtr, AdjustedArgTy->getPointerTo(AS),
161
37.2k
                                   ArgPtr->getName() + ".cast");
162
37.2k
    LoadInst *Load =
163
37.2k
        Builder.CreateAlignedLoad(AdjustedArgTy, ArgPtr, AdjustedAlign);
164
37.2k
    Load->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(Ctx, {}));
165
37.2k
166
37.2k
    MDBuilder MDB(Ctx);
167
37.2k
168
37.2k
    if (isa<PointerType>(ArgTy)) {
169
26.1k
      if (Arg.hasNonNullAttr())
170
4
        Load->setMetadata(LLVMContext::MD_nonnull, MDNode::get(Ctx, {}));
171
26.1k
172
26.1k
      uint64_t DerefBytes = Arg.getDereferenceableBytes();
173
26.1k
      if (DerefBytes != 0) {
174
4
        Load->setMetadata(
175
4
          LLVMContext::MD_dereferenceable,
176
4
          MDNode::get(Ctx,
177
4
                      MDB.createConstant(
178
4
                        ConstantInt::get(Builder.getInt64Ty(), DerefBytes))));
179
4
      }
180
26.1k
181
26.1k
      uint64_t DerefOrNullBytes = Arg.getDereferenceableOrNullBytes();
182
26.1k
      if (DerefOrNullBytes != 0) {
183
2
        Load->setMetadata(
184
2
          LLVMContext::MD_dereferenceable_or_null,
185
2
          MDNode::get(Ctx,
186
2
                      MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(),
187
2
                                                          DerefOrNullBytes))));
188
2
      }
189
26.1k
190
26.1k
      unsigned ParamAlign = Arg.getParamAlignment();
191
26.1k
      if (ParamAlign != 0) {
192
3
        Load->setMetadata(
193
3
          LLVMContext::MD_align,
194
3
          MDNode::get(Ctx,
195
3
                      MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(),
196
3
                                                          ParamAlign))));
197
3
      }
198
26.1k
    }
199
37.2k
200
37.2k
    // TODO: Convert noalias arg to !noalias
201
37.2k
202
37.2k
    if (DoShiftOpt) {
203
713
      Value *ExtractBits = OffsetDiff == 0 ?
204
593
        Load : 
Builder.CreateLShr(Load, OffsetDiff * 8)120
;
205
713
206
713
      IntegerType *ArgIntTy = Builder.getIntNTy(Size);
207
713
      Value *Trunc = Builder.CreateTrunc(ExtractBits, ArgIntTy);
208
713
      Value *NewVal = Builder.CreateBitCast(Trunc, ArgTy,
209
713
                                            Arg.getName() + ".load");
210
713
      Arg.replaceAllUsesWith(NewVal);
211
36.5k
    } else if (IsV3) {
212
112
      Value *Shuf = Builder.CreateShuffleVector(Load, UndefValue::get(V4Ty),
213
112
                                                {0, 1, 2},
214
112
                                                Arg.getName() + ".load");
215
112
      Arg.replaceAllUsesWith(Shuf);
216
36.4k
    } else {
217
36.4k
      Load->setName(Arg.getName() + ".load");
218
36.4k
      Arg.replaceAllUsesWith(Load);
219
36.4k
    }
220
37.2k
  }
221
18.0k
222
18.0k
  KernArgSegment->addAttribute(
223
18.0k
    AttributeList::ReturnIndex,
224
18.0k
    Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign)));
225
18.0k
226
18.0k
  return true;
227
18.0k
}
228
229
101k
INITIALIZE_PASS_BEGIN(AMDGPULowerKernelArguments, DEBUG_TYPE,
230
101k
                      "AMDGPU Lower Kernel Arguments", false, false)
231
101k
INITIALIZE_PASS_END(AMDGPULowerKernelArguments, DEBUG_TYPE, "AMDGPU Lower Kernel Arguments",
232
                    false, false)
233
234
char AMDGPULowerKernelArguments::ID = 0;
235
236
2.44k
FunctionPass *llvm::createAMDGPULowerKernelArgumentsPass() {
237
2.44k
  return new AMDGPULowerKernelArguments();
238
2.44k
}