Coverage Report

Created: 2019-07-24 05:18

/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp
Line
Count
Source
1
//===- AArch64FalkorHWPFFix.cpp - Avoid HW prefetcher pitfalls on Falkor --===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
/// \file For Falkor, we want to avoid HW prefetcher instruction tag collisions
9
/// that may inhibit the HW prefetching.  This is done in two steps.  Before
10
/// ISel, we mark strided loads (i.e. those that will likely benefit from
11
/// prefetching) with metadata.  Then, after opcodes have been finalized, we
12
/// insert MOVs and re-write loads to prevent unintentional tag collisions.
13
//===----------------------------------------------------------------------===//
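A hedged sketch of the second step, echoing the rewrite comment that appears later in this file: when two strided loads hash to the same prefetcher tag, one of them is rebased through a scratch register so its tag changes.

//   Xd = LOAD Xb, off     ; collides with another strided load's tag
// becomes:
//   Xc = MOV Xb           ; copy the base into a free scratch register
//   Xd = LOAD Xc, off     ; the new base register yields a different tag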
14
15
#include "AArch64.h"
16
#include "AArch64InstrInfo.h"
17
#include "AArch64Subtarget.h"
18
#include "AArch64TargetMachine.h"
19
#include "llvm/ADT/DenseMap.h"
20
#include "llvm/ADT/DepthFirstIterator.h"
21
#include "llvm/ADT/None.h"
22
#include "llvm/ADT/Optional.h"
23
#include "llvm/ADT/SmallVector.h"
24
#include "llvm/ADT/Statistic.h"
25
#include "llvm/Analysis/LoopInfo.h"
26
#include "llvm/Analysis/ScalarEvolution.h"
27
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
28
#include "llvm/CodeGen/LiveRegUnits.h"
29
#include "llvm/CodeGen/MachineBasicBlock.h"
30
#include "llvm/CodeGen/MachineFunction.h"
31
#include "llvm/CodeGen/MachineFunctionPass.h"
32
#include "llvm/CodeGen/MachineInstr.h"
33
#include "llvm/CodeGen/MachineInstrBuilder.h"
34
#include "llvm/CodeGen/MachineLoopInfo.h"
35
#include "llvm/CodeGen/MachineOperand.h"
36
#include "llvm/CodeGen/MachineRegisterInfo.h"
37
#include "llvm/CodeGen/TargetPassConfig.h"
38
#include "llvm/CodeGen/TargetRegisterInfo.h"
39
#include "llvm/IR/DebugLoc.h"
40
#include "llvm/IR/Dominators.h"
41
#include "llvm/IR/Function.h"
42
#include "llvm/IR/Instruction.h"
43
#include "llvm/IR/Instructions.h"
44
#include "llvm/IR/Metadata.h"
45
#include "llvm/Pass.h"
46
#include "llvm/Support/Casting.h"
47
#include "llvm/Support/Debug.h"
48
#include "llvm/Support/DebugCounter.h"
49
#include "llvm/Support/raw_ostream.h"
50
#include <cassert>
51
#include <iterator>
52
#include <utility>
53
54
using namespace llvm;
55
56
#define DEBUG_TYPE "falkor-hwpf-fix"
57
58
STATISTIC(NumStridedLoadsMarked, "Number of strided loads marked");
59
STATISTIC(NumCollisionsAvoided,
60
          "Number of HW prefetch tag collisions avoided");
61
STATISTIC(NumCollisionsNotAvoided,
62
          "Number of HW prefetch tag collisions not avoided due to lack of registers");
63
DEBUG_COUNTER(FixCounter, "falkor-hwpf",
64
              "Controls which tag collisions are avoided");
65
66
namespace {
67
68
class FalkorMarkStridedAccesses {
69
public:
70
  FalkorMarkStridedAccesses(LoopInfo &LI, ScalarEvolution &SE)
71
69
      : LI(LI), SE(SE) {}
72
73
  bool run();
74
75
private:
76
  bool runOnLoop(Loop &L);
77
78
  LoopInfo &LI;
79
  ScalarEvolution &SE;
80
};
81
82
class FalkorMarkStridedAccessesLegacy : public FunctionPass {
83
public:
84
  static char ID; // Pass ID, replacement for typeid
85
86
8.62k
  FalkorMarkStridedAccessesLegacy() : FunctionPass(ID) {
87
8.62k
    initializeFalkorMarkStridedAccessesLegacyPass(
88
8.62k
        *PassRegistry::getPassRegistry());
89
8.62k
  }
90
91
8.59k
  void getAnalysisUsage(AnalysisUsage &AU) const override {
92
8.59k
    AU.addRequired<TargetPassConfig>();
93
8.59k
    AU.addPreserved<DominatorTreeWrapperPass>();
94
8.59k
    AU.addRequired<LoopInfoWrapperPass>();
95
8.59k
    AU.addPreserved<LoopInfoWrapperPass>();
96
8.59k
    AU.addRequired<ScalarEvolutionWrapperPass>();
97
8.59k
    AU.addPreserved<ScalarEvolutionWrapperPass>();
98
8.59k
  }
99
100
  bool runOnFunction(Function &F) override;
101
};
102
103
} // end anonymous namespace
104
105
char FalkorMarkStridedAccessesLegacy::ID = 0;
106
107
101k
INITIALIZE_PASS_BEGIN(FalkorMarkStridedAccessesLegacy, DEBUG_TYPE,
108
101k
                      "Falkor HW Prefetch Fix", false, false)
109
101k
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
110
101k
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
111
101k
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
112
101k
INITIALIZE_PASS_END(FalkorMarkStridedAccessesLegacy, DEBUG_TYPE,
113
                    "Falkor HW Prefetch Fix", false, false)
114
115
8.62k
FunctionPass *llvm::createFalkorMarkStridedAccessesPass() {
116
8.62k
  return new FalkorMarkStridedAccessesLegacy();
117
8.62k
}
118
119
257k
bool FalkorMarkStridedAccessesLegacy::runOnFunction(Function &F) {
120
257k
  TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
121
257k
  const AArch64Subtarget *ST =
122
257k
      TPC.getTM<AArch64TargetMachine>().getSubtargetImpl(F);
123
257k
  if (ST->getProcFamily() != AArch64Subtarget::Falkor)
124
257k
    return false;
125
69
126
69
  if (skipFunction(F))
127
0
    return false;
128
69
129
69
  LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
130
69
  ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
131
69
132
69
  FalkorMarkStridedAccesses LDP(LI, SE);
133
69
  return LDP.run();
134
69
}
135
136
69
bool FalkorMarkStridedAccesses::run() {
137
69
  bool MadeChange = false;
138
69
139
69
  for (Loop *L : LI)
140
11
    for (auto LIt = df_begin(L), LE = df_end(L); LIt != LE; ++LIt)
141
6
      MadeChange |= runOnLoop(**LIt);
142
69
143
69
  return MadeChange;
144
69
}
145
146
6
bool FalkorMarkStridedAccesses::runOnLoop(Loop &L) {
147
6
  // Only mark strided loads in the inner-most loop
148
6
  if (!L.empty())
149
1
    return false;
150
5
151
5
  bool MadeChange = false;
152
5
153
5
  for (BasicBlock *BB : L.blocks()) {
154
65
    for (Instruction &I : *BB) {
155
65
      LoadInst *LoadI = dyn_cast<LoadInst>(&I);
156
65
      if (!LoadI)
157
52
        continue;
158
13
159
13
      Value *PtrValue = LoadI->getPointerOperand();
160
13
      if (L.isLoopInvariant(PtrValue))
161
0
        continue;
162
13
163
13
      const SCEV *LSCEV = SE.getSCEV(PtrValue);
164
13
      const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
165
13
      if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
166
1
        continue;
167
12
168
12
      LoadI->setMetadata(FALKOR_STRIDED_ACCESS_MD,
169
12
                         MDNode::get(LoadI->getContext(), {}));
170
12
      ++NumStridedLoadsMarked;
171
12
      LLVM_DEBUG(dbgs() << "Load: " << I << " marked as strided\n");
172
12
      MadeChange = true;
173
12
    }
174
5
  }
175
5
176
5
  return MadeChange;
177
5
}
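For intuition, a hedged example of the kind of load the affine add-rec check above accepts (the array name and element type are made up, not taken from this report):

// for (int i = 0; i < n; ++i)
//   Sum += A[4 * i];      // with 4-byte ints, the pointer's SCEV is the
//                         // affine add-rec {A, +, 16}, so this load gets
//                         // marked with FALKOR_STRIDED_ACCESS_MD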
178
179
namespace {
180
181
class FalkorHWPFFix : public MachineFunctionPass {
182
public:
183
  static char ID;
184
185
8.62k
  FalkorHWPFFix() : MachineFunctionPass(ID) {
186
8.62k
    initializeFalkorHWPFFixPass(*PassRegistry::getPassRegistry());
187
8.62k
  }
188
189
  bool runOnMachineFunction(MachineFunction &Fn) override;
190
191
8.58k
  void getAnalysisUsage(AnalysisUsage &AU) const override {
192
8.58k
    AU.setPreservesCFG();
193
8.58k
    AU.addRequired<MachineLoopInfo>();
194
8.58k
    MachineFunctionPass::getAnalysisUsage(AU);
195
8.58k
  }
196
197
8.58k
  MachineFunctionProperties getRequiredProperties() const override {
198
8.58k
    return MachineFunctionProperties().set(
199
8.58k
        MachineFunctionProperties::Property::NoVRegs);
200
8.58k
  }
201
202
private:
203
  void runOnLoop(MachineLoop &L, MachineFunction &Fn);
204
205
  const AArch64InstrInfo *TII;
206
  const TargetRegisterInfo *TRI;
207
  DenseMap<unsigned, SmallVector<MachineInstr *, 4>> TagMap;
208
  bool Modified;
209
};
210
211
/// Bits from load opcodes used to compute HW prefetcher instruction tags.
212
struct LoadInfo {
213
58
  LoadInfo() = default;
214
215
  Register DestReg;
216
  Register BaseReg;
217
  int BaseRegIdx = -1;
218
  const MachineOperand *OffsetOpnd = nullptr;
219
  bool IsPrePost = false;
220
};
221
222
} // end anonymous namespace
223
224
char FalkorHWPFFix::ID = 0;
225
226
101k
INITIALIZE_PASS_BEGIN(FalkorHWPFFix, "falkor-hwpf-fix-late",
227
101k
                      "Falkor HW Prefetch Fix Late Phase", false, false)
228
101k
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
229
101k
INITIALIZE_PASS_END(FalkorHWPFFix, "falkor-hwpf-fix-late",
230
                    "Falkor HW Prefetch Fix Late Phase", false, false)
231
232
76
static unsigned makeTag(unsigned Dest, unsigned Base, unsigned Offset) {
233
76
  return (Dest & 0xf) | ((Base & 0xf) << 4) | ((Offset & 0x3f) << 8);
234
76
}
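As a standalone sanity check of the bit layout above (a sketch; the encodings 3 and 5 and the offsets are made-up values, and makeTagExample simply mirrors makeTag):

#include <cassert>

// Mirrors makeTag(): Dest occupies bits [3:0], Base bits [7:4], and the
// low six bits of Offset occupy bits [13:8].
static unsigned makeTagExample(unsigned Dest, unsigned Base, unsigned Offset) {
  return (Dest & 0xf) | ((Base & 0xf) << 4) | ((Offset & 0x3f) << 8);
}

int main() {
  assert(makeTagExample(3, 5, 9) == 0x953);              // 0x3 | 0x50 | 0x900
  // Only the masked bits matter, so these two loads would collide:
  assert(makeTagExample(3, 5, 9) == makeTagExample(0x13, 0x25, 0x49));
  return 0;
}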
235
236
132
static Optional<LoadInfo> getLoadInfo(const MachineInstr &MI) {
237
132
  int DestRegIdx;
238
132
  int BaseRegIdx;
239
132
  int OffsetIdx;
240
132
  bool IsPrePost;
241
132
242
132
  switch (MI.getOpcode()) {
243
132
  default:
244
73
    return None;
245
132
246
132
  case AArch64::LD1i64:
247
2
  case AArch64::LD2i64:
248
2
    DestRegIdx = 0;
249
2
    BaseRegIdx = 3;
250
2
    OffsetIdx = -1;
251
2
    IsPrePost = false;
252
2
    break;
253
2
254
2
  case AArch64::LD1i8:
255
2
  case AArch64::LD1i16:
256
2
  case AArch64::LD1i32:
257
2
  case AArch64::LD2i8:
258
2
  case AArch64::LD2i16:
259
2
  case AArch64::LD2i32:
260
2
  case AArch64::LD3i8:
261
2
  case AArch64::LD3i16:
262
2
  case AArch64::LD3i32:
263
2
  case AArch64::LD3i64:
264
2
  case AArch64::LD4i8:
265
2
  case AArch64::LD4i16:
266
2
  case AArch64::LD4i32:
267
2
  case AArch64::LD4i64:
268
2
    DestRegIdx = -1;
269
2
    BaseRegIdx = 3;
270
2
    OffsetIdx = -1;
271
2
    IsPrePost = false;
272
2
    break;
273
2
274
2
  case AArch64::LD1Onev1d:
275
2
  case AArch64::LD1Onev2s:
276
2
  case AArch64::LD1Onev4h:
277
2
  case AArch64::LD1Onev8b:
278
2
  case AArch64::LD1Onev2d:
279
2
  case AArch64::LD1Onev4s:
280
2
  case AArch64::LD1Onev8h:
281
2
  case AArch64::LD1Onev16b:
282
2
  case AArch64::LD1Rv1d:
283
2
  case AArch64::LD1Rv2s:
284
2
  case AArch64::LD1Rv4h:
285
2
  case AArch64::LD1Rv8b:
286
2
  case AArch64::LD1Rv2d:
287
2
  case AArch64::LD1Rv4s:
288
2
  case AArch64::LD1Rv8h:
289
2
  case AArch64::LD1Rv16b:
290
2
    DestRegIdx = 0;
291
2
    BaseRegIdx = 1;
292
2
    OffsetIdx = -1;
293
2
    IsPrePost = false;
294
2
    break;
295
2
296
2
  case AArch64::LD1Twov1d:
297
2
  case AArch64::LD1Twov2s:
298
2
  case AArch64::LD1Twov4h:
299
2
  case AArch64::LD1Twov8b:
300
2
  case AArch64::LD1Twov2d:
301
2
  case AArch64::LD1Twov4s:
302
2
  case AArch64::LD1Twov8h:
303
2
  case AArch64::LD1Twov16b:
304
2
  case AArch64::LD1Threev1d:
305
2
  case AArch64::LD1Threev2s:
306
2
  case AArch64::LD1Threev4h:
307
2
  case AArch64::LD1Threev8b:
308
2
  case AArch64::LD1Threev2d:
309
2
  case AArch64::LD1Threev4s:
310
2
  case AArch64::LD1Threev8h:
311
2
  case AArch64::LD1Threev16b:
312
2
  case AArch64::LD1Fourv1d:
313
2
  case AArch64::LD1Fourv2s:
314
2
  case AArch64::LD1Fourv4h:
315
2
  case AArch64::LD1Fourv8b:
316
2
  case AArch64::LD1Fourv2d:
317
2
  case AArch64::LD1Fourv4s:
318
2
  case AArch64::LD1Fourv8h:
319
2
  case AArch64::LD1Fourv16b:
320
2
  case AArch64::LD2Twov2s:
321
2
  case AArch64::LD2Twov4s:
322
2
  case AArch64::LD2Twov8b:
323
2
  case AArch64::LD2Twov2d:
324
2
  case AArch64::LD2Twov4h:
325
2
  case AArch64::LD2Twov8h:
326
2
  case AArch64::LD2Twov16b:
327
2
  case AArch64::LD2Rv1d:
328
2
  case AArch64::LD2Rv2s:
329
2
  case AArch64::LD2Rv4s:
330
2
  case AArch64::LD2Rv8b:
331
2
  case AArch64::LD2Rv2d:
332
2
  case AArch64::LD2Rv4h:
333
2
  case AArch64::LD2Rv8h:
334
2
  case AArch64::LD2Rv16b:
335
2
  case AArch64::LD3Threev2s:
336
2
  case AArch64::LD3Threev4h:
337
2
  case AArch64::LD3Threev8b:
338
2
  case AArch64::LD3Threev2d:
339
2
  case AArch64::LD3Threev4s:
340
2
  case AArch64::LD3Threev8h:
341
2
  case AArch64::LD3Threev16b:
342
2
  case AArch64::LD3Rv1d:
343
2
  case AArch64::LD3Rv2s:
344
2
  case AArch64::LD3Rv4h:
345
2
  case AArch64::LD3Rv8b:
346
2
  case AArch64::LD3Rv2d:
347
2
  case AArch64::LD3Rv4s:
348
2
  case AArch64::LD3Rv8h:
349
2
  case AArch64::LD3Rv16b:
350
2
  case AArch64::LD4Fourv2s:
351
2
  case AArch64::LD4Fourv4h:
352
2
  case AArch64::LD4Fourv8b:
353
2
  case AArch64::LD4Fourv2d:
354
2
  case AArch64::LD4Fourv4s:
355
2
  case AArch64::LD4Fourv8h:
356
2
  case AArch64::LD4Fourv16b:
357
2
  case AArch64::LD4Rv1d:
358
2
  case AArch64::LD4Rv2s:
359
2
  case AArch64::LD4Rv4h:
360
2
  case AArch64::LD4Rv8b:
361
2
  case AArch64::LD4Rv2d:
362
2
  case AArch64::LD4Rv4s:
363
2
  case AArch64::LD4Rv8h:
364
2
  case AArch64::LD4Rv16b:
365
2
    DestRegIdx = -1;
366
2
    BaseRegIdx = 1;
367
2
    OffsetIdx = -1;
368
2
    IsPrePost = false;
369
2
    break;
370
2
371
2
  case AArch64::LD1i64_POST:
372
2
  case AArch64::LD2i64_POST:
373
2
    DestRegIdx = 1;
374
2
    BaseRegIdx = 4;
375
2
    OffsetIdx = 5;
376
2
    IsPrePost = true;
377
2
    break;
378
2
379
2
  case AArch64::LD1i8_POST:
380
2
  case AArch64::LD1i16_POST:
381
2
  case AArch64::LD1i32_POST:
382
2
  case AArch64::LD2i8_POST:
383
2
  case AArch64::LD2i16_POST:
384
2
  case AArch64::LD2i32_POST:
385
2
  case AArch64::LD3i8_POST:
386
2
  case AArch64::LD3i16_POST:
387
2
  case AArch64::LD3i32_POST:
388
2
  case AArch64::LD3i64_POST:
389
2
  case AArch64::LD4i8_POST:
390
2
  case AArch64::LD4i16_POST:
391
2
  case AArch64::LD4i32_POST:
392
2
  case AArch64::LD4i64_POST:
393
2
    DestRegIdx = -1;
394
2
    BaseRegIdx = 4;
395
2
    OffsetIdx = 5;
396
2
    IsPrePost = true;
397
2
    break;
398
2
399
2
  case AArch64::LD1Onev1d_POST:
400
2
  case AArch64::LD1Onev2s_POST:
401
2
  case AArch64::LD1Onev4h_POST:
402
2
  case AArch64::LD1Onev8b_POST:
403
2
  case AArch64::LD1Onev2d_POST:
404
2
  case AArch64::LD1Onev4s_POST:
405
2
  case AArch64::LD1Onev8h_POST:
406
2
  case AArch64::LD1Onev16b_POST:
407
2
  case AArch64::LD1Rv1d_POST:
408
2
  case AArch64::LD1Rv2s_POST:
409
2
  case AArch64::LD1Rv4h_POST:
410
2
  case AArch64::LD1Rv8b_POST:
411
2
  case AArch64::LD1Rv2d_POST:
412
2
  case AArch64::LD1Rv4s_POST:
413
2
  case AArch64::LD1Rv8h_POST:
414
2
  case AArch64::LD1Rv16b_POST:
415
2
    DestRegIdx = 1;
416
2
    BaseRegIdx = 2;
417
2
    OffsetIdx = 3;
418
2
    IsPrePost = true;
419
2
    break;
420
2
421
2
  case AArch64::LD1Twov1d_POST:
422
2
  case AArch64::LD1Twov2s_POST:
423
2
  case AArch64::LD1Twov4h_POST:
424
2
  case AArch64::LD1Twov8b_POST:
425
2
  case AArch64::LD1Twov2d_POST:
426
2
  case AArch64::LD1Twov4s_POST:
427
2
  case AArch64::LD1Twov8h_POST:
428
2
  case AArch64::LD1Twov16b_POST:
429
2
  case AArch64::LD1Threev1d_POST:
430
2
  case AArch64::LD1Threev2s_POST:
431
2
  case AArch64::LD1Threev4h_POST:
432
2
  case AArch64::LD1Threev8b_POST:
433
2
  case AArch64::LD1Threev2d_POST:
434
2
  case AArch64::LD1Threev4s_POST:
435
2
  case AArch64::LD1Threev8h_POST:
436
2
  case AArch64::LD1Threev16b_POST:
437
2
  case AArch64::LD1Fourv1d_POST:
438
2
  case AArch64::LD1Fourv2s_POST:
439
2
  case AArch64::LD1Fourv4h_POST:
440
2
  case AArch64::LD1Fourv8b_POST:
441
2
  case AArch64::LD1Fourv2d_POST:
442
2
  case AArch64::LD1Fourv4s_POST:
443
2
  case AArch64::LD1Fourv8h_POST:
444
2
  case AArch64::LD1Fourv16b_POST:
445
2
  case AArch64::LD2Twov2s_POST:
446
2
  case AArch64::LD2Twov4s_POST:
447
2
  case AArch64::LD2Twov8b_POST:
448
2
  case AArch64::LD2Twov2d_POST:
449
2
  case AArch64::LD2Twov4h_POST:
450
2
  case AArch64::LD2Twov8h_POST:
451
2
  case AArch64::LD2Twov16b_POST:
452
2
  case AArch64::LD2Rv1d_POST:
453
2
  case AArch64::LD2Rv2s_POST:
454
2
  case AArch64::LD2Rv4s_POST:
455
2
  case AArch64::LD2Rv8b_POST:
456
2
  case AArch64::LD2Rv2d_POST:
457
2
  case AArch64::LD2Rv4h_POST:
458
2
  case AArch64::LD2Rv8h_POST:
459
2
  case AArch64::LD2Rv16b_POST:
460
2
  case AArch64::LD3Threev2s_POST:
461
2
  case AArch64::LD3Threev4h_POST:
462
2
  case AArch64::LD3Threev8b_POST:
463
2
  case AArch64::LD3Threev2d_POST:
464
2
  case AArch64::LD3Threev4s_POST:
465
2
  case AArch64::LD3Threev8h_POST:
466
2
  case AArch64::LD3Threev16b_POST:
467
2
  case AArch64::LD3Rv1d_POST:
468
2
  case AArch64::LD3Rv2s_POST:
469
2
  case AArch64::LD3Rv4h_POST:
470
2
  case AArch64::LD3Rv8b_POST:
471
2
  case AArch64::LD3Rv2d_POST:
472
2
  case AArch64::LD3Rv4s_POST:
473
2
  case AArch64::LD3Rv8h_POST:
474
2
  case AArch64::LD3Rv16b_POST:
475
2
  case AArch64::LD4Fourv2s_POST:
476
2
  case AArch64::LD4Fourv4h_POST:
477
2
  case AArch64::LD4Fourv8b_POST:
478
2
  case AArch64::LD4Fourv2d_POST:
479
2
  case AArch64::LD4Fourv4s_POST:
480
2
  case AArch64::LD4Fourv8h_POST:
481
2
  case AArch64::LD4Fourv16b_POST:
482
2
  case AArch64::LD4Rv1d_POST:
483
2
  case AArch64::LD4Rv2s_POST:
484
2
  case AArch64::LD4Rv4h_POST:
485
2
  case AArch64::LD4Rv8b_POST:
486
2
  case AArch64::LD4Rv2d_POST:
487
2
  case AArch64::LD4Rv4s_POST:
488
2
  case AArch64::LD4Rv8h_POST:
489
2
  case AArch64::LD4Rv16b_POST:
490
2
    DestRegIdx = -1;
491
2
    BaseRegIdx = 2;
492
2
    OffsetIdx = 3;
493
2
    IsPrePost = true;
494
2
    break;
495
2
496
25
  case AArch64::LDRBBroW:
497
25
  case AArch64::LDRBBroX:
498
25
  case AArch64::LDRBBui:
499
25
  case AArch64::LDRBroW:
500
25
  case AArch64::LDRBroX:
501
25
  case AArch64::LDRBui:
502
25
  case AArch64::LDRDl:
503
25
  case AArch64::LDRDroW:
504
25
  case AArch64::LDRDroX:
505
25
  case AArch64::LDRDui:
506
25
  case AArch64::LDRHHroW:
507
25
  case AArch64::LDRHHroX:
508
25
  case AArch64::LDRHHui:
509
25
  case AArch64::LDRHroW:
510
25
  case AArch64::LDRHroX:
511
25
  case AArch64::LDRHui:
512
25
  case AArch64::LDRQl:
513
25
  case AArch64::LDRQroW:
514
25
  case AArch64::LDRQroX:
515
25
  case AArch64::LDRQui:
516
25
  case AArch64::LDRSBWroW:
517
25
  case AArch64::LDRSBWroX:
518
25
  case AArch64::LDRSBWui:
519
25
  case AArch64::LDRSBXroW:
520
25
  case AArch64::LDRSBXroX:
521
25
  case AArch64::LDRSBXui:
522
25
  case AArch64::LDRSHWroW:
523
25
  case AArch64::LDRSHWroX:
524
25
  case AArch64::LDRSHWui:
525
25
  case AArch64::LDRSHXroW:
526
25
  case AArch64::LDRSHXroX:
527
25
  case AArch64::LDRSHXui:
528
25
  case AArch64::LDRSWl:
529
25
  case AArch64::LDRSWroW:
530
25
  case AArch64::LDRSWroX:
531
25
  case AArch64::LDRSWui:
532
25
  case AArch64::LDRSl:
533
25
  case AArch64::LDRSroW:
534
25
  case AArch64::LDRSroX:
535
25
  case AArch64::LDRSui:
536
25
  case AArch64::LDRWl:
537
25
  case AArch64::LDRWroW:
538
25
  case AArch64::LDRWroX:
539
25
  case AArch64::LDRWui:
540
25
  case AArch64::LDRXl:
541
25
  case AArch64::LDRXroW:
542
25
  case AArch64::LDRXroX:
543
25
  case AArch64::LDRXui:
544
25
  case AArch64::LDURBBi:
545
25
  case AArch64::LDURBi:
546
25
  case AArch64::LDURDi:
547
25
  case AArch64::LDURHHi:
548
25
  case AArch64::LDURHi:
549
25
  case AArch64::LDURQi:
550
25
  case AArch64::LDURSBWi:
551
25
  case AArch64::LDURSBXi:
552
25
  case AArch64::LDURSHWi:
553
25
  case AArch64::LDURSHXi:
554
25
  case AArch64::LDURSWi:
555
25
  case AArch64::LDURSi:
556
25
  case AArch64::LDURWi:
557
25
  case AArch64::LDURXi:
558
25
    DestRegIdx = 0;
559
25
    BaseRegIdx = 1;
560
25
    OffsetIdx = 2;
561
25
    IsPrePost = false;
562
25
    break;
563
25
564
25
  case AArch64::LDRBBpost:
565
2
  case AArch64::LDRBBpre:
566
2
  case AArch64::LDRBpost:
567
2
  case AArch64::LDRBpre:
568
2
  case AArch64::LDRDpost:
569
2
  case AArch64::LDRDpre:
570
2
  case AArch64::LDRHHpost:
571
2
  case AArch64::LDRHHpre:
572
2
  case AArch64::LDRHpost:
573
2
  case AArch64::LDRHpre:
574
2
  case AArch64::LDRQpost:
575
2
  case AArch64::LDRQpre:
576
2
  case AArch64::LDRSBWpost:
577
2
  case AArch64::LDRSBWpre:
578
2
  case AArch64::LDRSBXpost:
579
2
  case AArch64::LDRSBXpre:
580
2
  case AArch64::LDRSHWpost:
581
2
  case AArch64::LDRSHWpre:
582
2
  case AArch64::LDRSHXpost:
583
2
  case AArch64::LDRSHXpre:
584
2
  case AArch64::LDRSWpost:
585
2
  case AArch64::LDRSWpre:
586
2
  case AArch64::LDRSpost:
587
2
  case AArch64::LDRSpre:
588
2
  case AArch64::LDRWpost:
589
2
  case AArch64::LDRWpre:
590
2
  case AArch64::LDRXpost:
591
2
  case AArch64::LDRXpre:
592
2
    DestRegIdx = 1;
593
2
    BaseRegIdx = 2;
594
2
    OffsetIdx = 3;
595
2
    IsPrePost = true;
596
2
    break;
597
2
598
2
  case AArch64::LDNPDi:
599
2
  case AArch64::LDNPQi:
600
2
  case AArch64::LDNPSi:
601
2
  case AArch64::LDPQi:
602
2
  case AArch64::LDPDi:
603
2
  case AArch64::LDPSi:
604
2
    DestRegIdx = -1;
605
2
    BaseRegIdx = 2;
606
2
    OffsetIdx = 3;
607
2
    IsPrePost = false;
608
2
    break;
609
2
610
10
  case AArch64::LDPSWi:
611
10
  case AArch64::LDPWi:
612
10
  case AArch64::LDPXi:
613
10
    DestRegIdx = 0;
614
10
    BaseRegIdx = 2;
615
10
    OffsetIdx = 3;
616
10
    IsPrePost = false;
617
10
    break;
618
10
619
10
  case AArch64::LDPQpost:
620
2
  case AArch64::LDPQpre:
621
2
  case AArch64::LDPDpost:
622
2
  case AArch64::LDPDpre:
623
2
  case AArch64::LDPSpost:
624
2
  case AArch64::LDPSpre:
625
2
    DestRegIdx = -1;
626
2
    BaseRegIdx = 3;
627
2
    OffsetIdx = 4;
628
2
    IsPrePost = true;
629
2
    break;
630
2
631
2
  case AArch64::LDPSWpost:
632
2
  case AArch64::LDPSWpre:
633
2
  case AArch64::LDPWpost:
634
2
  case AArch64::LDPWpre:
635
2
  case AArch64::LDPXpost:
636
2
  case AArch64::LDPXpre:
637
2
    DestRegIdx = 1;
638
2
    BaseRegIdx = 3;
639
2
    OffsetIdx = 4;
640
2
    IsPrePost = true;
641
2
    break;
642
59
  }
643
59
644
59
  // Loads from the stack pointer don't get prefetched.
645
59
  unsigned BaseReg = MI.getOperand(BaseRegIdx).getReg();
646
59
  if (BaseReg == AArch64::SP || BaseReg == AArch64::WSP)
647
1
    return None;
648
58
649
58
  LoadInfo LI;
650
58
  LI.DestReg = DestRegIdx == -1 ? Register() : MI.getOperand(DestRegIdx).getReg();
651
58
  LI.BaseReg = BaseReg;
652
58
  LI.BaseRegIdx = BaseRegIdx;
653
58
  LI.OffsetOpnd = OffsetIdx == -1 ? nullptr : &MI.getOperand(OffsetIdx);
654
58
  LI.IsPrePost = IsPrePost;
655
58
  return LI;
656
58
}
657
658
static Optional<unsigned> getTag(const TargetRegisterInfo *TRI,
659
76
                                 const MachineInstr &MI, const LoadInfo &LI) {
660
76
  unsigned Dest = LI.DestReg ? TRI->getEncodingValue(LI.DestReg) : 0;
661
76
  unsigned Base = TRI->getEncodingValue(LI.BaseReg);
662
76
  unsigned Off;
663
76
  if (LI.OffsetOpnd == nullptr)
664
12
    Off = 0;
665
64
  else if (LI.OffsetOpnd->isGlobal() || LI.OffsetOpnd->isSymbol() ||
666
64
           LI.OffsetOpnd->isCPI())
667
0
    return None;
668
64
  else if (LI.OffsetOpnd->isReg())
669
17
    Off = (1 << 5) | TRI->getEncodingValue(LI.OffsetOpnd->getReg());
670
47
  else
671
47
    Off = LI.OffsetOpnd->getImm() >> 2;
672
76
673
76
  return makeTag(Dest, Base, Off);
674
76
}
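A hedged worked example of the offset handling above: immediate offsets are scaled down by four before being packed, and only six bits survive makeTag's mask, so sufficiently distant immediates alias (the concrete numbers below are illustrative, not from the report).

#include <cassert>

// Immediate path of getTag(): Off = Imm >> 2, later masked to six bits by makeTag().
static unsigned immOffsetBits(int Imm) { return (unsigned(Imm) >> 2) & 0x3f; }

// Register path of getTag(), combined with makeTag()'s six-bit mask.
static unsigned regOffsetBits(unsigned RegEncoding) {
  return ((1u << 5) | RegEncoding) & 0x3f;
}

int main() {
  assert(immOffsetBits(8) == 2);                        // #8 scales down to 2
  assert(immOffsetBits(8) == immOffsetBits(8 + 256));   // aliases 256 bytes away
  assert(regOffsetBits(7) == 0x27);                     // hypothetical encoding 7
  return 0;
}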
675
676
19
void FalkorHWPFFix::runOnLoop(MachineLoop &L, MachineFunction &Fn) {
677
19
  // Build the initial tag map for the whole loop.
678
19
  TagMap.clear();
679
19
  for (MachineBasicBlock *MBB : L.getBlocks())
680
110
    for (MachineInstr &MI : *MBB) {
681
110
      Optional<LoadInfo> LInfo = getLoadInfo(MI);
682
110
      if (!LInfo)
683
73
        continue;
684
37
      Optional<unsigned> Tag = getTag(TRI, MI, *LInfo);
685
37
      if (!Tag)
686
0
        continue;
687
37
      TagMap[*Tag].push_back(&MI);
688
37
    }
689
19
690
19
  bool AnyCollisions = false;
691
19
  for (auto &P : TagMap) {
692
18
    auto Size = P.second.size();
693
18
    if (Size > 1) {
694
17
      for (auto *MI : P.second) {
695
17
        if (TII->isStridedAccess(*MI)) {
696
17
          AnyCollisions = true;
697
17
          break;
698
17
        }
699
17
      }
700
17
    }
701
18
    if (AnyCollisions)
702
17
      break;
703
18
  }
704
19
  // Nothing to fix.
705
19
  if (!AnyCollisions)
706
2
    return;
707
17
708
17
  MachineRegisterInfo &MRI = Fn.getRegInfo();
709
17
710
17
  // Go through all the basic blocks in the current loop and fix any streaming
711
17
  // loads to avoid collisions with any other loads.
712
17
  LiveRegUnits LR(*TRI);
713
17
  for (MachineBasicBlock *MBB : L.getBlocks()) {
714
17
    LR.clear();
715
17
    LR.addLiveOuts(*MBB);
716
133
    for (auto I = MBB->rbegin(); I != MBB->rend(); LR.stepBackward(*I), ++I) {
717
116
      MachineInstr &MI = *I;
718
116
      if (!TII->isStridedAccess(MI))
719
94
        continue;
720
22
721
22
      Optional<LoadInfo> OptLdI = getLoadInfo(MI);
722
22
      if (!OptLdI)
723
1
        continue;
724
21
      LoadInfo LdI = *OptLdI;
725
21
      Optional<unsigned> OptOldTag = getTag(TRI, MI, LdI);
726
21
      if (!OptOldTag)
727
0
        continue;
728
21
      auto &OldCollisions = TagMap[*OptOldTag];
729
21
      if (OldCollisions.size() <= 1)
730
3
        continue;
731
18
732
18
      bool Fixed = false;
733
18
      LLVM_DEBUG(dbgs() << "Attempting to fix tag collision: " << MI);
734
18
735
18
      if (!DebugCounter::shouldExecute(FixCounter)) {
736
0
        LLVM_DEBUG(dbgs() << "Skipping fix due to debug counter:\n  " << MI);
737
0
        continue;
738
0
      }
739
18
740
18
      // Add the non-base registers of MI as live so we don't use them as
741
18
      // scratch registers.
742
91
      for (unsigned OpI = 0, OpE = MI.getNumOperands(); OpI < OpE; ++OpI) {
743
73
        if (OpI == static_cast<unsigned>(LdI.BaseRegIdx))
744
18
          continue;
745
55
        MachineOperand &MO = MI.getOperand(OpI);
746
55
        if (MO.isReg() && MO.readsReg())
747
9
          LR.addReg(MO.getReg());
748
55
      }
749
18
750
43
      for (unsigned ScratchReg : AArch64::GPR64RegClass) {
751
43
        if (!LR.available(ScratchReg) || MRI.isReserved(ScratchReg))
752
25
          continue;
753
18
754
18
        LoadInfo NewLdI(LdI);
755
18
        NewLdI.BaseReg = ScratchReg;
756
18
        unsigned NewTag = *getTag(TRI, MI, NewLdI);
757
18
        // Scratch reg tag would collide too, so don't use it.
758
18
        if (TagMap.count(NewTag))
759
0
          continue;
760
18
761
18
        LLVM_DEBUG(dbgs() << "Changing base reg to: "
762
18
                          << printReg(ScratchReg, TRI) << '\n');
763
18
764
18
        // Rewrite:
765
18
        //   Xd = LOAD Xb, off
766
18
        // to:
767
18
        //   Xc = MOV Xb
768
18
        //   Xd = LOAD Xc, off
769
18
        DebugLoc DL = MI.getDebugLoc();
770
18
        BuildMI(*MBB, &MI, DL, TII->get(AArch64::ORRXrs), ScratchReg)
771
18
            .addReg(AArch64::XZR)
772
18
            .addReg(LdI.BaseReg)
773
18
            .addImm(0);
774
18
        MachineOperand &BaseOpnd = MI.getOperand(LdI.BaseRegIdx);
775
18
        BaseOpnd.setReg(ScratchReg);
776
18
777
18
        // If the load does a pre/post increment, then insert a MOV after as
778
18
        // well to update the real base register.
779
18
        if (LdI.IsPrePost) {
780
7
          LLVM_DEBUG(dbgs() << "Doing post MOV of incremented reg: "
781
7
                            << printReg(ScratchReg, TRI) << '\n');
782
7
          MI.getOperand(0).setReg(
783
7
              ScratchReg); // Change tied operand pre/post update dest.
784
7
          BuildMI(*MBB, std::next(MachineBasicBlock::iterator(MI)), DL,
785
7
                  TII->get(AArch64::ORRXrs), LdI.BaseReg)
786
7
              .addReg(AArch64::XZR)
787
7
              .addReg(ScratchReg)
788
7
              .addImm(0);
789
7
        }
790
18
791
21
        for (int I = 0, E = OldCollisions.size(); I != E; ++I)
792
21
          if (OldCollisions[I] == &MI) {
793
18
            std::swap(OldCollisions[I], OldCollisions[E - 1]);
794
18
            OldCollisions.pop_back();
795
18
            break;
796
18
          }
797
18
798
18
        // Update TagMap to reflect instruction changes to reduce the number
799
18
        // of later MOVs to be inserted.  This needs to be done after
800
18
        // OldCollisions is updated since it may be relocated by this
801
18
        // insertion.
802
18
        TagMap[NewTag].push_back(&MI);
803
18
        ++NumCollisionsAvoided;
804
18
        Fixed = true;
805
18
        Modified = true;
806
18
        break;
807
18
      }
808
18
      if (!Fixed)
809
0
        ++NumCollisionsNotAvoided;
810
18
    }
811
17
  }
812
17
}
813
814
257k
bool FalkorHWPFFix::runOnMachineFunction(MachineFunction &Fn) {
815
257k
  auto &ST = static_cast<const AArch64Subtarget &>(Fn.getSubtarget());
816
257k
  if (ST.getProcFamily() != AArch64Subtarget::Falkor)
817
257k
    return false;
818
83
819
83
  if (skipFunction(Fn.getFunction()))
820
0
    return false;
821
83
822
83
  TII = static_cast<const AArch64InstrInfo *>(ST.getInstrInfo());
823
83
  TRI = ST.getRegisterInfo();
824
83
825
83
  assert(TRI->trackLivenessAfterRegAlloc(Fn) &&
826
83
         "Register liveness not available!");
827
83
828
83
  MachineLoopInfo &LI = getAnalysis<MachineLoopInfo>();
829
83
830
83
  Modified = false;
831
83
832
83
  for (MachineLoop *I : LI)
833
38
    for (auto L = df_begin(I), LE = df_end(I); L != LE; ++L)
834
19
      // Only process inner-loops
835
19
      if (L->empty())
836
19
        runOnLoop(**L, Fn);
837
83
838
83
  return Modified;
839
83
}
840
841
8.62k
FunctionPass *llvm::createFalkorHWPFFixPass() { return new FalkorHWPFFix(); }