Coverage Report

Created: 2019-07-24 05:18

/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Target/AMDGPU/AMDGPUInline.cpp
Line
Count
Source (jump to first uncovered line)
1
//===- AMDGPUInline.cpp - Code to perform simple function inlining --------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
/// \file
10
/// This is AMDGPU specific replacement of the standard inliner.
11
/// The main purpose is to account for the fact that calls are not only expensive
12
/// on the AMDGPU, but much more expensive if a private memory pointer is
13
/// passed to a function as an argument. In this situation, we are unable to
14
/// eliminate private memory in the caller unless inlined and end up with slow
15
/// and expensive scratch access. Thus, we boost the inline threshold for such
16
/// functions here.
17
///
18
//===----------------------------------------------------------------------===//
19
20
21
#include "AMDGPU.h"
22
#include "llvm/Transforms/IPO.h"
23
#include "llvm/Analysis/AssumptionCache.h"
24
#include "llvm/Analysis/CallGraph.h"
25
#include "llvm/Analysis/InlineCost.h"
26
#include "llvm/Analysis/ValueTracking.h"
27
#include "llvm/Analysis/TargetTransformInfo.h"
28
#include "llvm/IR/CallSite.h"
29
#include "llvm/IR/DataLayout.h"
30
#include "llvm/IR/Instructions.h"
31
#include "llvm/IR/Module.h"
32
#include "llvm/IR/Type.h"
33
#include "llvm/Support/CommandLine.h"
34
#include "llvm/Support/Debug.h"
35
#include "llvm/Transforms/IPO/Inliner.h"
36
37
using namespace llvm;
38
39
57
#define DEBUG_TYPE "inline"
40
41
static cl::opt<int>
42
ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(1500),
43
              cl::desc("Cost of alloca argument"));
44
45
// If the amount of scratch memory to eliminate exceeds our ability to allocate
46
// it into registers we gain nothing by aggressively inlining functions for that
47
// heuristic.
48
static cl::opt<unsigned>
49
ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256),
50
                cl::desc("Maximum alloca size to use for inline cost"));
51
52
// Inliner constraint to achieve reasonable compilation time
53
static cl::opt<size_t>
54
MaxBB("amdgpu-inline-max-bb", cl::Hidden, cl::init(300),
55
      cl::desc("Maximum BB number allowed in a function after inlining"
56
               " (compile time constraint)"));
57
58
namespace {
59
60
class AMDGPUInliner : public LegacyInlinerBase {
61
62
public:
63
162
  AMDGPUInliner() : LegacyInlinerBase(ID) {
64
162
    initializeAMDGPUInlinerPass(*PassRegistry::getPassRegistry());
65
162
    Params = getInlineParams();
66
162
  }
67
68
  static char ID; // Pass identification, replacement for typeid
69
70
  unsigned getInlineThreshold(CallSite CS) const;
71
72
  InlineCost getInlineCost(CallSite CS) override;
73
74
  bool runOnSCC(CallGraphSCC &SCC) override;
75
76
  void getAnalysisUsage(AnalysisUsage &AU) const override;
77
78
private:
79
  TargetTransformInfoWrapperPass *TTIWP;
80
81
  InlineParams Params;
82
};
83
84
} // end anonymous namespace
85
86
char AMDGPUInliner::ID = 0;
87
101k
INITIALIZE_PASS_BEGIN(AMDGPUInliner, "amdgpu-inline",
88
101k
                "AMDGPU Function Integration/Inlining", false, false)
89
101k
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
90
101k
INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
91
101k
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
92
101k
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
93
101k
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
94
101k
INITIALIZE_PASS_END(AMDGPUInliner, "amdgpu-inline",
95
                "AMDGPU Function Integration/Inlining", false, false)
96
97
157
Pass *llvm::createAMDGPUFunctionInliningPass() { return new AMDGPUInliner(); }
98
99
1.95k
bool AMDGPUInliner::runOnSCC(CallGraphSCC &SCC) {
100
1.95k
  TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>();
101
1.95k
  return LegacyInlinerBase::runOnSCC(SCC);
102
1.95k
}
103
104
162
void AMDGPUInliner::getAnalysisUsage(AnalysisUsage &AU) const {
105
162
  AU.addRequired<TargetTransformInfoWrapperPass>();
106
162
  LegacyInlinerBase::getAnalysisUsage(AU);
107
162
}
108
109
57
unsigned AMDGPUInliner::getInlineThreshold(CallSite CS) const {
110
57
  int Thres = Params.DefaultThreshold;
111
57
112
57
  Function *Caller = CS.getCaller();
113
57
  // Listen to the inlinehint attribute when it would increase the threshold
114
57
  // and the caller does not need to minimize its size.
115
57
  Function *Callee = CS.getCalledFunction();
116
57
  bool InlineHint = Callee && !Callee->isDeclaration() &&
117
57
    Callee->hasFnAttribute(Attribute::InlineHint);
118
57
  if (InlineHint && 
Params.HintThreshold3
&&
Params.HintThreshold > Thres3
119
57
      && 
!Caller->hasFnAttribute(Attribute::MinSize)3
)
120
3
    Thres = Params.HintThreshold.getValue() *
121
3
            TTIWP->getTTI(*Callee).getInliningThresholdMultiplier();
122
57
123
57
  const DataLayout &DL = Caller->getParent()->getDataLayout();
124
57
  if (!Callee)
125
0
    return (unsigned)Thres;
126
57
127
57
  // If we have a pointer to private array passed into a function
128
57
  // it will not be optimized out, leaving scratch usage.
129
57
  // Increase the inline threshold to allow inliniting in this case.
130
57
  uint64_t AllocaSize = 0;
131
57
  SmallPtrSet<const AllocaInst *, 8> AIVisited;
132
57
  for (Value *PtrArg : CS.args()) {
133
52
    PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
134
52
    if (!Ty || 
(29
Ty->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS29
&&
135
29
                
Ty->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS14
))
136
24
      continue;
137
28
138
28
    PtrArg = GetUnderlyingObject(PtrArg, DL);
139
28
    if (const AllocaInst *AI = dyn_cast<AllocaInst>(PtrArg)) {
140
26
      if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second)
141
2
        continue;
142
24
      AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
143
24
      // If the amount of stack memory is excessive we will not be able
144
24
      // to get rid of the scratch anyway, bail out.
145
24
      if (AllocaSize > ArgAllocaCutoff) {
146
2
        AllocaSize = 0;
147
2
        break;
148
2
      }
149
24
    }
150
28
  }
151
57
  if (AllocaSize)
152
18
    Thres += ArgAllocaCost;
153
57
154
57
  return (unsigned)Thres;
155
57
}
156
157
// Check if call is just a wrapper around another call.
158
// In this case we only have call and ret instructions.
159
59
static bool isWrapperOnlyCall(CallSite CS) {
160
59
  Function *Callee = CS.getCalledFunction();
161
59
  if (!Callee || Callee->size() != 1)
162
9
    return false;
163
50
  const BasicBlock &BB = Callee->getEntryBlock();
164
50
  if (const Instruction *I = BB.getFirstNonPHI()) {
165
50
    if (!isa<CallInst>(I)) {
166
47
      return false;
167
47
    }
168
3
    if (isa<ReturnInst>(*std::next(I->getIterator()))) {
169
2
      LLVM_DEBUG(dbgs() << "    Wrapper only call detected: "
170
2
                        << Callee->getName() << '\n');
171
2
      return true;
172
2
    }
173
1
  }
174
1
  return false;
175
1
}
176
177
230
InlineCost AMDGPUInliner::getInlineCost(CallSite CS) {
178
230
  Function *Callee = CS.getCalledFunction();
179
230
  Function *Caller = CS.getCaller();
180
230
181
230
  if (!Callee || Callee->isDeclaration())
182
0
    return llvm::InlineCost::getNever("undefined callee");
183
230
184
230
  if (CS.isNoInline())
185
168
    return llvm::InlineCost::getNever("noinline");
186
62
187
62
  TargetTransformInfo &TTI = TTIWP->getTTI(*Callee);
188
62
  if (!TTI.areInlineCompatible(Caller, Callee))
189
0
    return llvm::InlineCost::getNever("incompatible");
190
62
191
62
  if (CS.hasFnAttr(Attribute::AlwaysInline)) {
192
3
    auto IsViable = isInlineViable(*Callee);
193
3
    if (IsViable)
194
3
      return llvm::InlineCost::getAlways("alwaysinline viable");
195
0
    return llvm::InlineCost::getNever(IsViable.message);
196
0
  }
197
59
198
59
  if (isWrapperOnlyCall(CS))
199
2
    return llvm::InlineCost::getAlways("wrapper-only call");
200
57
201
57
  InlineParams LocalParams = Params;
202
57
  LocalParams.DefaultThreshold = (int)getInlineThreshold(CS);
203
57
  bool RemarksEnabled = false;
204
57
  const auto &BBs = Caller->getBasicBlockList();
205
57
  if (!BBs.empty()) {
206
57
    auto DI = OptimizationRemark(DEBUG_TYPE, "", DebugLoc(), &BBs.front());
207
57
    if (DI.isEnabled())
208
0
      RemarksEnabled = true;
209
57
  }
210
57
211
57
  OptimizationRemarkEmitter ORE(Caller);
212
57
  std::function<AssumptionCache &(Function &)> GetAssumptionCache =
213
57
      [this](Function &F) -> AssumptionCache & {
214
54
    return ACT->getAssumptionCache(F);
215
54
  };
216
57
217
57
  auto IC = llvm::getInlineCost(cast<CallBase>(*CS.getInstruction()), Callee,
218
57
                             LocalParams, TTI, GetAssumptionCache, None, PSI,
219
57
                             RemarksEnabled ? 
&ORE0
: nullptr);
220
57
221
57
  if (IC && 
!IC.isAlways()52
&&
!Callee->hasFnAttribute(Attribute::InlineHint)52
) {
222
49
    // Single BB does not increase total BB amount, thus subtract 1
223
49
    size_t Size = Caller->size() + Callee->size() - 1;
224
49
    if (MaxBB && Size > MaxBB)
225
1
      return llvm::InlineCost::getNever("max number of bb exceeded");
226
56
  }
227
56
  return IC;
228
56
}