Coverage Report

Created: 2019-07-24 05:18

/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp
Line
Count
Source (jump to first uncovered line)
1
//===-- AArch64A57FPLoadBalancing.cpp - Balance FP ops statically on A57---===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
// For best-case performance on Cortex-A57, we should try to use a balanced
9
// mix of odd and even D-registers when performing a critical sequence of
10
// independent, non-quadword FP/ASIMD floating-point multiply or
11
// multiply-accumulate operations.
12
//
13
// This pass attempts to detect situations where the register allocation may
14
// adversely affect this load balancing and to change the registers used so as
15
// to better utilize the CPU.
16
//
17
// Ideally we'd just take each multiply or multiply-accumulate in turn and
18
// allocate it alternating even or odd registers. However, multiply-accumulates
19
// are most efficiently performed in the same functional unit as their
20
// accumulation operand. Therefore this pass tries to find maximal sequences
21
// ("Chains") of multiply-accumulates linked via their accumulation operand,
22
// and assign them all the same "color" (oddness/evenness).
23
//
24
// This optimization affects S-register and D-register floating point
25
// multiplies and FMADD/FMAs, as well as vector (floating point only) muls and
26
// FMADD/FMA. Q register instructions (and 128-bit vector instructions) are
27
// not affected.
28
//===----------------------------------------------------------------------===//
29
30
#include "AArch64.h"
31
#include "AArch64InstrInfo.h"
32
#include "AArch64Subtarget.h"
33
#include "llvm/ADT/BitVector.h"
34
#include "llvm/ADT/EquivalenceClasses.h"
35
#include "llvm/CodeGen/MachineFunction.h"
36
#include "llvm/CodeGen/MachineFunctionPass.h"
37
#include "llvm/CodeGen/MachineInstr.h"
38
#include "llvm/CodeGen/MachineInstrBuilder.h"
39
#include "llvm/CodeGen/MachineRegisterInfo.h"
40
#include "llvm/CodeGen/RegisterClassInfo.h"
41
#include "llvm/CodeGen/RegisterScavenging.h"
42
#include "llvm/Support/CommandLine.h"
43
#include "llvm/Support/Debug.h"
44
#include "llvm/Support/raw_ostream.h"
45
using namespace llvm;
46
47
#define DEBUG_TYPE "aarch64-a57-fp-load-balancing"
48
49
// Enforce the algorithm to use the scavenged register even when the original
50
// destination register is the correct color. Used for testing.
51
static cl::opt<bool>
52
TransformAll("aarch64-a57-fp-load-balancing-force-all",
53
             cl::desc("Always modify dest registers regardless of color"),
54
             cl::init(false), cl::Hidden);
55
56
// Never use the balance information obtained from chains - return a specific
57
// color always. Used for testing.
58
static cl::opt<unsigned>
59
OverrideBalance("aarch64-a57-fp-load-balancing-override",
60
              cl::desc("Ignore balance information, always return "
61
                       "(1: Even, 2: Odd)."),
62
              cl::init(0), cl::Hidden);
63
64
//===----------------------------------------------------------------------===//
65
// Helper functions
66
67
// Is the instruction a type of multiply on 64-bit (or 32-bit) FPRs?
68
1.76k
static bool isMul(MachineInstr *MI) {
69
1.76k
  switch (MI->getOpcode()) {
70
1.76k
  case AArch64::FMULSrr:
71
77
  case AArch64::FNMULSrr:
72
77
  case AArch64::FMULDrr:
73
77
  case AArch64::FNMULDrr:
74
77
    return true;
75
1.69k
  default:
76
1.69k
    return false;
77
1.76k
  }
78
1.76k
}
79
80
// Is the instruction a type of FP multiply-accumulate on 64-bit (or 32-bit) FPRs?
81
1.69k
static bool isMla(MachineInstr *MI) {
82
1.69k
  switch (MI->getOpcode()) {
83
1.69k
  case AArch64::FMSUBSrrr:
84
204
  case AArch64::FMADDSrrr:
85
204
  case AArch64::FNMSUBSrrr:
86
204
  case AArch64::FNMADDSrrr:
87
204
  case AArch64::FMSUBDrrr:
88
204
  case AArch64::FMADDDrrr:
89
204
  case AArch64::FNMSUBDrrr:
90
204
  case AArch64::FNMADDDrrr:
91
204
    return true;
92
1.48k
  default:
93
1.48k
    return false;
94
1.69k
  }
95
1.69k
}
96
97
//===----------------------------------------------------------------------===//
98
99
namespace {
100
/// A "color", which is either even or odd. Yes, these aren't really colors
101
/// but the algorithm is conceptually doing two-color graph coloring.
102
enum class Color { Even, Odd };
103
#ifndef NDEBUG
104
static const char *ColorNames[2] = { "Even", "Odd" };
105
#endif
106
107
class Chain;
108
109
class AArch64A57FPLoadBalancing : public MachineFunctionPass {
110
  MachineRegisterInfo *MRI;
111
  const TargetRegisterInfo *TRI;
112
  RegisterClassInfo RCI;
113
114
public:
115
  static char ID;
116
8.61k
  explicit AArch64A57FPLoadBalancing() : MachineFunctionPass(ID) {
117
8.61k
    initializeAArch64A57FPLoadBalancingPass(*PassRegistry::getPassRegistry());
118
8.61k
  }
119
120
  bool runOnMachineFunction(MachineFunction &F) override;
121
122
8.57k
  MachineFunctionProperties getRequiredProperties() const override {
123
8.57k
    return MachineFunctionProperties().set(
124
8.57k
        MachineFunctionProperties::Property::NoVRegs);
125
8.57k
  }
126
127
265k
  StringRef getPassName() const override {
128
265k
    return "A57 FP Anti-dependency breaker";
129
265k
  }
130
131
8.57k
  void getAnalysisUsage(AnalysisUsage &AU) const override {
132
8.57k
    AU.setPreservesCFG();
133
8.57k
    MachineFunctionPass::getAnalysisUsage(AU);
134
8.57k
  }
135
136
private:
137
  bool runOnBasicBlock(MachineBasicBlock &MBB);
138
  bool colorChainSet(std::vector<Chain*> GV, MachineBasicBlock &MBB,
139
                     int &Balance);
140
  bool colorChain(Chain *G, Color C, MachineBasicBlock &MBB);
141
  int scavengeRegister(Chain *G, Color C, MachineBasicBlock &MBB);
142
  void scanInstruction(MachineInstr *MI, unsigned Idx,
143
                       std::map<unsigned, Chain*> &Active,
144
                       std::vector<std::unique_ptr<Chain>> &AllChains);
145
  void maybeKillChain(MachineOperand &MO, unsigned Idx,
146
                      std::map<unsigned, Chain*> &RegChains);
147
  Color getColor(unsigned Register);
148
  Chain *getAndEraseNext(Color PreferredColor, std::vector<Chain*> &L);
149
};
150
}
151
152
char AArch64A57FPLoadBalancing::ID = 0;
153
154
101k
INITIALIZE_PASS_BEGIN(AArch64A57FPLoadBalancing, DEBUG_TYPE,
155
101k
                      "AArch64 A57 FP Load-Balancing", false, false)
156
101k
INITIALIZE_PASS_END(AArch64A57FPLoadBalancing, DEBUG_TYPE,
157
                    "AArch64 A57 FP Load-Balancing", false, false)
158
159
namespace {
160
/// A Chain is a sequence of instructions that are linked together by
161
/// an accumulation operand. For example:
162
///
163
///   fmul def d0, ?
164
///   fmla def d1, ?, ?, killed d0
165
///   fmla def d2, ?, ?, killed d1
166
///
167
/// There may be other instructions interleaved in the sequence that
168
/// do not belong to the chain. These other instructions must not use
169
/// the "chain" register at any point.
170
///
171
/// We currently only support chains where the "chain" operand is killed
172
/// at each link in the chain for simplicity.
173
/// A chain has three important instructions - Start, Last and Kill.
174
///   * The start instruction is the first instruction in the chain.
175
///   * Last is the final instruction in the chain.
176
///   * Kill may or may not be defined. If defined, Kill is the instruction
177
///     where the outgoing value of the Last instruction is killed.
178
///     This information is important as if we know the outgoing value is
179
///     killed with no intervening uses, we can safely change its register.
180
///
181
/// Without a kill instruction, we must assume the outgoing value escapes
182
/// beyond our model and either must not change its register or must
183
/// create a fixup FMOV to keep the old register value consistent.
184
///
185
class Chain {
186
public:
187
  /// The important (marker) instructions.
188
  MachineInstr *StartInst, *LastInst, *KillInst;
189
  /// The index, from the start of the basic block, that each marker
190
  /// appears. These are stored so we can do quick interval tests.
191
  unsigned StartInstIdx, LastInstIdx, KillInstIdx;
192
  /// All instructions in the chain.
193
  std::set<MachineInstr*> Insts;
194
  /// True if KillInst cannot be modified. If this is true,
195
  /// we cannot change LastInst's outgoing register.
196
  /// This will be true for tied values and regmasks.
197
  bool KillIsImmutable;
198
  /// The "color" of LastInst. This will be the preferred chain color,
199
  /// as changing intermediate nodes is easy but changing the last
200
  /// instruction can be more tricky.
201
  Color LastColor;
202
203
  Chain(MachineInstr *MI, unsigned Idx, Color C)
204
      : StartInst(MI), LastInst(MI), KillInst(nullptr),
205
        StartInstIdx(Idx), LastInstIdx(Idx), KillInstIdx(0),
206
119
        LastColor(C) {
207
119
    Insts.insert(MI);
208
119
  }
209
210
  /// Add a new instruction into the chain. The instruction's dest operand
211
  /// has the given color.
212
162
  void add(MachineInstr *MI, unsigned Idx, Color C) {
213
162
    LastInst = MI;
214
162
    LastInstIdx = Idx;
215
162
    LastColor = C;
216
162
    assert((KillInstIdx == 0 || LastInstIdx < KillInstIdx) &&
217
162
           "Chain: broken invariant. A Chain can only be killed after its last "
218
162
           "def");
219
162
220
162
    Insts.insert(MI);
221
162
  }
222
223
  /// Return true if MI is a member of the chain.
224
424
  bool contains(MachineInstr &MI) { return Insts.count(&MI) > 0; }
225
226
  /// Return the number of instructions in the chain.
227
423
  unsigned size() const {
228
423
    return Insts.size();
229
423
  }
230
231
  /// Inform the chain that its last active register (the dest register of
232
  /// LastInst) is killed by MI with no intervening uses or defs.
233
53
  void setKill(MachineInstr *MI, unsigned Idx, bool Immutable) {
234
53
    KillInst = MI;
235
53
    KillInstIdx = Idx;
236
53
    KillIsImmutable = Immutable;
237
53
    assert((KillInstIdx == 0 || LastInstIdx < KillInstIdx) &&
238
53
           "Chain: broken invariant. A Chain can only be killed after its last "
239
53
           "def");
240
53
  }
241
242
  /// Return the first instruction in the chain.
243
238
  MachineInstr *getStart() const { return StartInst; }
244
  /// Return the last instruction in the chain.
245
126
  MachineInstr *getLast() const { return LastInst; }
246
  /// Return the "kill" instruction (as set with setKill()) or NULL.
247
1.39k
  MachineInstr *getKill() const { return KillInst; }
248
  /// Return an instruction that can be used as an iterator for the end
249
  /// of the chain. This is the maximum of KillInst (if set) and LastInst.
250
238
  MachineBasicBlock::iterator end() const {
251
238
    return ++MachineBasicBlock::iterator(KillInst ? 
KillInst106
:
LastInst132
);
252
238
  }
253
238
  MachineBasicBlock::iterator begin() const { return getStart(); }
254
255
  /// Can the Kill instruction (assuming one exists) be modified?
256
293
  bool isKillImmutable() const { return KillIsImmutable; }
257
258
  /// Return the preferred color of this chain.
259
292
  Color getPreferredColor() {
260
292
    if (OverrideBalance != 0)
261
120
      return OverrideBalance == 1 ? 
Color::Even60
:
Color::Odd60
;
262
172
    return LastColor;
263
172
  }
264
265
  /// Return true if this chain (StartInst..KillInst) overlaps with Other.
266
44
  bool rangeOverlapsWith(const Chain &Other) const {
267
44
    unsigned End = KillInst ? 
KillInstIdx40
:
LastInstIdx4
;
268
44
    unsigned OtherEnd = Other.KillInst ?
269
40
      Other.KillInstIdx : 
Other.LastInstIdx4
;
270
44
271
44
    return StartInstIdx <= OtherEnd && 
Other.StartInstIdx <= End38
;
272
44
  }
273
274
  /// Return true if this chain starts before Other.
275
6
  bool startsBefore(const Chain *Other) const {
276
6
    return StartInstIdx < Other->StartInstIdx;
277
6
  }
278
279
  /// Return true if the group will require a fixup MOV at the end.
280
416
  bool requiresFixup() const {
281
416
    return (getKill() && 
isKillImmutable()240
) ||
!getKill()386
;
282
416
  }
283
284
  /// Return a simple string representation of the chain.
285
0
  std::string str() const {
286
0
    std::string S;
287
0
    raw_string_ostream OS(S);
288
0
289
0
    OS << "{";
290
0
    StartInst->print(OS, /* SkipOpers= */true);
291
0
    OS << " -> ";
292
0
    LastInst->print(OS, /* SkipOpers= */true);
293
0
    if (KillInst) {
294
0
      OS << " (kill @ ";
295
0
      KillInst->print(OS, /* SkipOpers= */true);
296
0
      OS << ")";
297
0
    }
298
0
    OS << "}";
299
0
300
0
    return OS.str();
301
0
  }
302
303
};
304
305
} // end anonymous namespace
306
307
//===----------------------------------------------------------------------===//
308
309
257k
bool AArch64A57FPLoadBalancing::runOnMachineFunction(MachineFunction &F) {
310
257k
  if (skipFunction(F.getFunction()))
311
16
    return false;
312
257k
313
257k
  if (!F.getSubtarget<AArch64Subtarget>().balanceFPOps())
314
256k
    return false;
315
200
316
200
  bool Changed = false;
317
200
  LLVM_DEBUG(dbgs() << "***** AArch64A57FPLoadBalancing *****\n");
318
200
319
200
  MRI = &F.getRegInfo();
320
200
  TRI = F.getRegInfo().getTargetRegisterInfo();
321
200
  RCI.runOnMachineFunction(F);
322
200
323
256
  for (auto &MBB : F) {
324
256
    Changed |= runOnBasicBlock(MBB);
325
256
  }
326
200
327
200
  return Changed;
328
200
}
329
330
256
bool AArch64A57FPLoadBalancing::runOnBasicBlock(MachineBasicBlock &MBB) {
331
256
  bool Changed = false;
332
256
  LLVM_DEBUG(dbgs() << "Running on MBB: " << MBB
333
256
                    << " - scanning instructions...\n");
334
256
335
256
  // First, scan the basic block producing a set of chains.
336
256
337
256
  // The currently "active" chains - chains that can be added to and haven't
338
256
  // been killed yet. This is keyed by register - all chains can only have one
339
256
  // "link" register between each inst in the chain.
340
256
  std::map<unsigned, Chain*> ActiveChains;
341
256
  std::vector<std::unique_ptr<Chain>> AllChains;
342
256
  unsigned Idx = 0;
343
256
  for (auto &MI : MBB)
344
1.76k
    scanInstruction(&MI, Idx++, ActiveChains, AllChains);
345
256
346
256
  LLVM_DEBUG(dbgs() << "Scan complete, " << AllChains.size()
347
256
                    << " chains created.\n");
348
256
349
256
  // Group the chains into disjoint sets based on their liveness range. This is
350
256
  // a poor-man's version of graph coloring. Ideally we'd create an interference
351
256
  // graph and perform full-on graph coloring on that, but;
352
256
  //   (a) That's rather heavyweight for only two colors.
353
256
  //   (b) We expect multiple disjoint interference regions - in practice the live
354
256
  //       range of chains is quite small and they are clustered between loads
355
256
  //       and stores.
356
256
  EquivalenceClasses<Chain*> EC;
357
256
  for (auto &I : AllChains)
358
119
    EC.insert(I.get());
359
256
360
256
  for (auto &I : AllChains)
361
119
    for (auto &J : AllChains)
362
163
      if (I != J && 
I->rangeOverlapsWith(*J)44
)
363
32
        EC.unionSets(I.get(), J.get());
364
256
  LLVM_DEBUG(dbgs() << "Created " << EC.getNumClasses() << " disjoint sets.\n");
365
256
366
256
  // Now we assume that every member of an equivalence class interferes
367
256
  // with every other member of that class, and with no members of other classes.
368
256
369
256
  // Convert the EquivalenceClasses to a simpler set of sets.
370
256
  std::vector<std::vector<Chain*> > V;
371
375
  for (auto I = EC.begin(), E = EC.end(); I != E; 
++I119
) {
372
119
    std::vector<Chain*> Cs(EC.member_begin(I), EC.member_end());
373
119
    if (Cs.empty()) 
continue16
;
374
103
    V.push_back(std::move(Cs));
375
103
  }
376
256
377
256
  // Now we have a set of sets, order them by start address so
378
256
  // we can iterate over them sequentially.
379
256
  llvm::sort(V,
380
256
             [](const std::vector<Chain *> &A, const std::vector<Chain *> &B) {
381
6
               return A.front()->startsBefore(B.front());
382
6
             });
383
256
384
256
  // As we only have two colors, we can track the global (BB-level) balance of
385
256
  // odds versus evens. We aim to keep this near zero to keep both execution
386
256
  // units fed.
387
256
  // Positive means we're even-heavy, negative we're odd-heavy.
388
256
  //
389
256
  // FIXME: If chains have interdependencies, for example:
390
256
  //   mul r0, r1, r2
391
256
  //   mul r3, r0, r1
392
256
  // We do not model this and may color each one differently, assuming we'll
393
256
  // get ILP when we obviously can't. This hasn't been seen to be a problem
394
256
  // in practice so far, so we simplify the algorithm by ignoring it.
395
256
  int Parity = 0;
396
256
397
256
  for (auto &I : V)
398
103
    Changed |= colorChainSet(std::move(I), MBB, Parity);
399
256
400
256
  return Changed;
401
256
}
402
403
Chain *AArch64A57FPLoadBalancing::getAndEraseNext(Color PreferredColor,
404
222
                                                  std::vector<Chain*> &L) {
405
222
  if (L.empty())
406
103
    return nullptr;
407
119
408
119
  // We try and get the best candidate from L to color next, given that our
409
119
  // preferred color is "PreferredColor". L is ordered from larger to smaller
410
119
  // chains. It is beneficial to color the large chains before the small chains,
411
119
  // but if we can't find a chain of the maximum length with the preferred color,
412
119
  // we fuzz the size and look for slightly smaller chains before giving up and
413
119
  // returning a chain that must be recolored.
414
119
415
119
  // FIXME: Does this need to be configurable?
416
119
  const unsigned SizeFuzz = 1;
417
119
  unsigned MinSize = L.front()->size() - SizeFuzz;
418
213
  for (auto I = L.begin(), E = L.end(); I != E; 
++I94
) {
419
129
    if ((*I)->size() <= MinSize) {
420
6
      // We've gone past the size limit. Return the previous item.
421
6
      Chain *Ch = *--I;
422
6
      L.erase(I);
423
6
      return Ch;
424
6
    }
425
123
426
123
    if ((*I)->getPreferredColor() == PreferredColor) {
427
29
      Chain *Ch = *I;
428
29
      L.erase(I);
429
29
      return Ch;
430
29
    }
431
123
  }
432
119
433
119
  // Bailout case - just return the first item.
434
119
  Chain *Ch = L.front();
435
84
  L.erase(L.begin());
436
84
  return Ch;
437
119
}
438
439
bool AArch64A57FPLoadBalancing::colorChainSet(std::vector<Chain*> GV,
440
                                              MachineBasicBlock &MBB,
441
103
                                              int &Parity) {
442
103
  bool Changed = false;
443
103
  LLVM_DEBUG(dbgs() << "colorChainSet(): #sets=" << GV.size() << "\n");
444
103
445
103
  // Sort by descending size order so that we allocate the most important
446
103
  // sets first.
447
103
  // Tie-break equivalent sizes by sorting chains requiring fixups before
448
103
  // those without fixups. The logic here is that we should look at the
449
103
  // chains that we cannot change before we look at those we can,
450
103
  // so the parity counter is updated and we know what color we should
451
103
  // change them to!
452
103
  // Final tie-break with instruction order so pass output is stable (i.e. not
453
103
  // dependent on malloc'd pointer values).
454
103
  llvm::sort(GV, [](const Chain *G1, const Chain *G2) {
455
16
    if (G1->size() != G2->size())
456
12
      return G1->size() > G2->size();
457
4
    if (G1->requiresFixup() != G2->requiresFixup())
458
4
      return G1->requiresFixup() > G2->requiresFixup();
459
0
    // Make sure startsBefore() produces a stable final order.
460
0
    assert((G1 == G2 || (G1->startsBefore(G2) ^ G2->startsBefore(G1))) &&
461
0
           "Starts before not total order!");
462
0
    return G1->startsBefore(G2);
463
0
  });
464
103
465
103
  Color PreferredColor = Parity < 0 ? 
Color::Even3
:
Color::Odd100
;
466
222
  while (Chain *G = getAndEraseNext(PreferredColor, GV)) {
467
119
    // Start off by assuming we'll color to our own preferred color.
468
119
    Color C = PreferredColor;
469
119
    if (Parity == 0)
470
97
      // But if we really don't care, use the chain's preferred color.
471
97
      C = G->getPreferredColor();
472
119
473
119
    LLVM_DEBUG(dbgs() << " - Parity=" << Parity
474
119
                      << ", Color=" << ColorNames[(int)C] << "\n");
475
119
476
119
    // If we'll need a fixup FMOV, don't bother. Testing has shown that this
477
119
    // happens infrequently and when it does it has at least a 50% chance of
478
119
    // slowing code down instead of speeding it up.
479
119
    if (G->requiresFixup() && 
C != G->getPreferredColor()72
) {
480
0
      C = G->getPreferredColor();
481
0
      LLVM_DEBUG(dbgs() << " - " << G->str()
482
0
                        << " - not worthwhile changing; "
483
0
                           "color remains "
484
0
                        << ColorNames[(int)C] << "\n");
485
0
    }
486
119
487
119
    Changed |= colorChain(G, C, MBB);
488
119
489
119
    Parity += (C == Color::Even) ? 
G->size()85
:
-G->size()34
;
490
119
    PreferredColor = Parity < 0 ? 
Color::Even34
:
Color::Odd85
;
491
119
  }
492
103
493
103
  return Changed;
494
103
}
495
496
int AArch64A57FPLoadBalancing::scavengeRegister(Chain *G, Color C,
497
119
                                                MachineBasicBlock &MBB) {
498
119
  // Can we find an appropriate register that is available throughout the life
499
119
  // of the chain? Simulate liveness backwards until the end of the chain.
500
119
  LiveRegUnits Units(*TRI);
501
119
  Units.addLiveOuts(MBB);
502
119
  MachineBasicBlock::iterator I = MBB.end();
503
119
  MachineBasicBlock::iterator ChainEnd = G->end();
504
363
  while (I != ChainEnd) {
505
244
    --I;
506
244
    Units.stepBackward(*I);
507
244
  }
508
119
509
119
  // Check which register units are alive throughout the chain.
510
119
  MachineBasicBlock::iterator ChainBegin = G->begin();
511
119
  assert(ChainBegin != ChainEnd && "Chain should contain instructions");
512
424
  do {
513
424
    --I;
514
424
    Units.accumulate(*I);
515
424
  } while (I != ChainBegin);
516
119
517
119
  // Make sure we allocate in-order, to get the cheapest registers first.
518
119
  unsigned RegClassID = ChainBegin->getDesc().OpInfo[0].RegClass;
519
119
  auto Ord = RCI.getOrder(TRI->getRegClass(RegClassID));
520
701
  for (auto Reg : Ord) {
521
701
    if (!Units.available(Reg))
522
497
      continue;
523
204
    if (C == getColor(Reg))
524
119
      return Reg;
525
204
  }
526
119
527
119
  
return -10
;
528
119
}
529
530
bool AArch64A57FPLoadBalancing::colorChain(Chain *G, Color C,
531
119
                                           MachineBasicBlock &MBB) {
532
119
  bool Changed = false;
533
119
  LLVM_DEBUG(dbgs() << " - colorChain(" << G->str() << ", "
534
119
                    << ColorNames[(int)C] << ")\n");
535
119
536
119
  // Try and obtain a free register of the right class. Without a register
537
119
  // to play with we cannot continue.
538
119
  int Reg = scavengeRegister(G, C, MBB);
539
119
  if (Reg == -1) {
540
0
    LLVM_DEBUG(dbgs() << "Scavenging (thus coloring) failed!\n");
541
0
    return false;
542
0
  }
543
119
  LLVM_DEBUG(dbgs() << " - Scavenged register: " << printReg(Reg, TRI) << "\n");
544
119
545
119
  std::map<unsigned, unsigned> Substs;
546
424
  for (MachineInstr &I : *G) {
547
424
    if (!G->contains(I) && 
(143
&I != G->getKill()143
||
G->isKillImmutable()53
))
548
96
      continue;
549
328
550
328
    // I is a member of G, or I is a mutable instruction that kills G.
551
328
552
328
    std::vector<unsigned> ToErase;
553
1.18k
    for (auto &U : I.operands()) {
554
1.18k
      if (U.isReg() && 
U.isUse()1.15k
&&
Substs.find(U.getReg()) != Substs.end()859
) {
555
210
        unsigned OrigReg = U.getReg();
556
210
        U.setReg(Substs[OrigReg]);
557
210
        if (U.isKill())
558
204
          // Don't erase straight away, because there may be other operands
559
204
          // that also reference this substitution!
560
204
          ToErase.push_back(OrigReg);
561
977
      } else if (U.isRegMask()) {
562
0
        for (auto J : Substs) {
563
0
          if (U.clobbersPhysReg(J.first))
564
0
            ToErase.push_back(J.first);
565
0
        }
566
0
      }
567
1.18k
    }
568
328
    // Now it's safe to remove the substs identified earlier.
569
328
    for (auto J : ToErase)
570
204
      Substs.erase(J);
571
328
572
328
    // Only change the def if this isn't the last instruction.
573
328
    if (&I != G->getKill()) {
574
281
      MachineOperand &MO = I.getOperand(0);
575
281
576
281
      bool Change = TransformAll || 
getColor(MO.getReg()) != C59
;
577
281
      if (G->requiresFixup() && 
&I == G->getLast()126
)
578
72
        Change = false;
579
281
580
281
      if (Change) {
581
204
        Substs[MO.getReg()] = Reg;
582
204
        MO.setReg(Reg);
583
204
584
204
        Changed = true;
585
204
      }
586
281
    }
587
328
  }
588
119
  assert(Substs.size() == 0 && "No substitutions should be left active!");
589
119
590
119
  if (G->getKill()) {
591
53
    LLVM_DEBUG(dbgs() << " - Kill instruction seen.\n");
592
66
  } else {
593
66
    // We didn't have a kill instruction, but we didn't seem to need to change
594
66
    // the destination register anyway.
595
66
    LLVM_DEBUG(dbgs() << " - Destination register not changed.\n");
596
66
  }
597
119
  return Changed;
598
119
}
599
600
void AArch64A57FPLoadBalancing::scanInstruction(
601
    MachineInstr *MI, unsigned Idx, std::map<unsigned, Chain *> &ActiveChains,
602
1.76k
    std::vector<std::unique_ptr<Chain>> &AllChains) {
603
1.76k
  // Inspect "MI", updating ActiveChains and AllChains.
604
1.76k
605
1.76k
  if (isMul(MI)) {
606
77
607
77
    for (auto &I : MI->uses())
608
154
      maybeKillChain(I, Idx, ActiveChains);
609
77
    for (auto &I : MI->defs())
610
77
      maybeKillChain(I, Idx, ActiveChains);
611
77
612
77
    // Create a new chain. Multiplies don't require forwarding so can go on any
613
77
    // unit.
614
77
    unsigned DestReg = MI->getOperand(0).getReg();
615
77
616
77
    LLVM_DEBUG(dbgs() << "New chain started for register "
617
77
                      << printReg(DestReg, TRI) << " at " << *MI);
618
77
619
77
    auto G = llvm::make_unique<Chain>(MI, Idx, getColor(DestReg));
620
77
    ActiveChains[DestReg] = G.get();
621
77
    AllChains.push_back(std::move(G));
622
77
623
1.69k
  } else if (isMla(MI)) {
624
204
625
204
    // It is beneficial to keep MLAs on the same functional unit as their
626
204
    // accumulator operand.
627
204
    unsigned DestReg  = MI->getOperand(0).getReg();
628
204
    unsigned AccumReg = MI->getOperand(3).getReg();
629
204
630
204
    maybeKillChain(MI->getOperand(1), Idx, ActiveChains);
631
204
    maybeKillChain(MI->getOperand(2), Idx, ActiveChains);
632
204
    if (DestReg != AccumReg)
633
72
      maybeKillChain(MI->getOperand(0), Idx, ActiveChains);
634
204
635
204
    if (ActiveChains.find(AccumReg) != ActiveChains.end()) {
636
162
      LLVM_DEBUG(dbgs() << "Chain found for accumulator register "
637
162
                        << printReg(AccumReg, TRI) << " in MI " << *MI);
638
162
639
162
      // For simplicity we only chain together sequences of MULs/MLAs where the
640
162
      // accumulator register is killed on each instruction. This means we don't
641
162
      // need to track other uses of the registers we want to rewrite.
642
162
      //
643
162
      // FIXME: We could extend to handle the non-kill cases for more coverage.
644
162
      if (MI->getOperand(3).isKill()) {
645
162
        // Add to chain.
646
162
        LLVM_DEBUG(dbgs() << "Instruction was successfully added to chain.\n");
647
162
        ActiveChains[AccumReg]->add(MI, Idx, getColor(DestReg));
648
162
        // Handle cases where the destination is not the same as the accumulator.
649
162
        if (DestReg != AccumReg) {
650
36
          ActiveChains[DestReg] = ActiveChains[AccumReg];
651
36
          ActiveChains.erase(AccumReg);
652
36
        }
653
162
        return;
654
162
      }
655
0
656
0
      LLVM_DEBUG(
657
0
          dbgs() << "Cannot add to chain because accumulator operand wasn't "
658
0
                 << "marked <kill>!\n");
659
0
      maybeKillChain(MI->getOperand(3), Idx, ActiveChains);
660
0
    }
661
204
662
204
    
LLVM_DEBUG42
(dbgs() << "Creating new chain for dest register "
663
42
                      << printReg(DestReg, TRI) << "\n");
664
42
    auto G = llvm::make_unique<Chain>(MI, Idx, getColor(DestReg));
665
42
    ActiveChains[DestReg] = G.get();
666
42
    AllChains.push_back(std::move(G));
667
42
668
1.48k
  } else {
669
1.48k
670
1.48k
    // Non-MUL or MLA instruction. Invalidate any chain in the uses or defs
671
1.48k
    // lists.
672
1.48k
    for (auto &I : MI->uses())
673
3.29k
      maybeKillChain(I, Idx, ActiveChains);
674
1.48k
    for (auto &I : MI->defs())
675
880
      maybeKillChain(I, Idx, ActiveChains);
676
1.48k
677
1.48k
  }
678
1.76k
}
679
680
void AArch64A57FPLoadBalancing::
681
maybeKillChain(MachineOperand &MO, unsigned Idx,
682
4.88k
               std::map<unsigned, Chain*> &ActiveChains) {
683
4.88k
  // Given an operand and the set of active chains (keyed by register),
684
4.88k
  // determine if a chain should be ended and remove from ActiveChains.
685
4.88k
  MachineInstr *MI = MO.getParent();
686
4.88k
687
4.88k
  if (MO.isReg()) {
688
3.69k
689
3.69k
    // If this is a KILL of a current chain, record it.
690
3.69k
    if (MO.isKill() && 
ActiveChains.find(MO.getReg()) != ActiveChains.end()1.19k
) {
691
47
      LLVM_DEBUG(dbgs() << "Kill seen for chain " << printReg(MO.getReg(), TRI)
692
47
                        << "\n");
693
47
      ActiveChains[MO.getReg()]->setKill(MI, Idx, /*Immutable=*/MO.isTied());
694
47
    }
695
3.69k
    ActiveChains.erase(MO.getReg());
696
3.69k
697
3.69k
  } else 
if (1.19k
MO.isRegMask()1.19k
) {
698
61
699
61
    for (auto I = ActiveChains.begin(), E = ActiveChains.end();
700
67
         I != E;) {
701
6
      if (MO.clobbersPhysReg(I->first)) {
702
6
        LLVM_DEBUG(dbgs() << "Kill (regmask) seen for chain "
703
6
                          << printReg(I->first, TRI) << "\n");
704
6
        I->second->setKill(MI, Idx, /*Immutable=*/true);
705
6
        ActiveChains.erase(I++);
706
6
      } else
707
0
        ++I;
708
6
    }
709
61
710
61
  }
711
4.88k
}
712
713
544
Color AArch64A57FPLoadBalancing::getColor(unsigned Reg) {
714
544
  if ((TRI->getEncodingValue(Reg) % 2) == 0)
715
409
    return Color::Even;
716
135
  else
717
135
    return Color::Odd;
718
544
}
719
720
// Factory function used by AArch64TargetMachine to add the pass to the passmanager.
721
8.61k
FunctionPass *llvm::createAArch64A57FPLoadBalancing() {
722
8.61k
  return new AArch64A57FPLoadBalancing();
723
8.61k
}