Coverage Report

Created: 2019-07-24 05:18

/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Target/AMDGPU/R600MachineScheduler.cpp
Line
Count
Source (jump to first uncovered line)
1
//===-- R600MachineScheduler.cpp - R600 Scheduler Interface -*- C++ -*-----===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
/// \file
10
/// R600 Machine Scheduler interface
11
//
12
//===----------------------------------------------------------------------===//
13
14
#include "R600MachineScheduler.h"
15
#include "AMDGPUSubtarget.h"
16
#include "R600InstrInfo.h"
17
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
18
#include "llvm/CodeGen/MachineRegisterInfo.h"
19
#include "llvm/IR/LegacyPassManager.h"
20
#include "llvm/Pass.h"
21
#include "llvm/Support/raw_ostream.h"
22
23
using namespace llvm;
24
25
#define DEBUG_TYPE "machine-scheduler"
26
27
2.33k
/// Bind the strategy to \p dag and reset all per-run scheduling state.
///
/// \param dag The scheduling DAG; it must provide virtual-register liveness
///            because it is downcast to ScheduleDAGMILive.
void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
  assert(dag->hasVRegLiveness() && "R600SchedStrategy needs vreg liveness");

  // Cache the DAG together with the target hooks that hang off it.
  DAG = static_cast<ScheduleDAGMILive*>(dag);
  MRI = &DAG->MRI;
  TII = static_cast<const R600InstrInfo*>(DAG->TII);
  TRI = static_cast<const R600RegisterInfo*>(DAG->TRI);

  const R600Subtarget &Subtarget = DAG->MF.getSubtarget<R600Subtarget>();
  VLIW5 = !Subtarget.hasCaymanISA();

  // Per-kind limit on how much may be emitted before a clause is "full".
  InstKindLimit[IDAlu] = TII->getMaxAlusPerClause();
  InstKindLimit[IDFetch] = Subtarget.getTexVTXClauseSize();
  InstKindLimit[IDOther] = 32;

  // Start in the "other" kind with nothing emitted. The slot mask starts with
  // all five slot bits set (31), so the first pickAlu() iteration that finds
  // no whole-group or slot candidate opens a fresh instruction group.
  CurInstKind = IDOther;
  CurEmitted = 0;
  OccupedSlotsMask = 31;
  AluInstCount = 0;
  FetchInstCount = 0;
}
44
45
void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc,
46
                                  std::vector<SUnit *> &QDst)
47
63.4k
{
48
63.4k
  QDst.insert(QDst.end(), QSrc.begin(), QSrc.end());
49
63.4k
  QSrc.clear();
50
63.4k
}
51
52
4.45k
/// Return how many wavefronts fit when each one needs \p GPRCount GPRs,
/// i.e. the integer quotient 248 / GPRCount.
///
/// \param GPRCount GPRs needed per wavefront; must be non-zero.
static unsigned getWFCountLimitedByGPR(unsigned GPRCount) {
  assert(GPRCount != 0 && "GPRCount cannot be 0");
  // NOTE(review): 248 is presumably the number of GPRs that can be shared
  // between wavefronts on this hardware -- confirm against the ISA docs.
  const unsigned GPRBudget = 248;
  return GPRBudget / GPRCount;
}
56
57
56.9k
/// Pick the next unit to schedule.
///
/// ALU instructions are tried first whenever the clause-switching rules allow
/// it, then fetch instructions, then everything else. NextInstKind records
/// the kind of the returned unit for schedNode().
///
/// \param IsTopNode Always set to false by this strategy.
/// \returns The chosen unit, or nullptr if no queue could provide one.
SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
  NextInstKind = IDOther;
  IsTopNode = false;

  // Decide whether the current clause type may be switched.
  const bool ClauseFull = CurEmitted >= InstKindLimit[CurInstKind];
  const bool AllowSwitchToAlu = ClauseFull || Available[CurInstKind].empty();
  bool AllowSwitchFromAlu =
      ClauseFull && !(Available[IDFetch].empty() && Available[IDOther].empty());

  if (CurInstKind == IDAlu && !Available[IDFetch].empty()) {
    // Heuristic from the AMD Accelerated Parallel Processing OpenCL
    // Programming Guide: the approximate number of wavefronts that lets TEX
    // instructions hide ALU instructions is
    //   500 (cycles for TEX) / (AluFetchRatio * 8 (cycles for ALU)).
    //
    // Both operands below are integers, so the quotient is truncated before
    // it is stored in the float; the "== 0" test relies on that and fires
    // whenever there is less ALU work than fetch work.
    const auto AluWork =
        AluInstCount + AvailablesAluCount() + Pending[IDAlu].size();
    const auto FetchWork = FetchInstCount + Available[IDFetch].size();
    float ALUFetchRationEstimate = AluWork / FetchWork;

    if (ALUFetchRationEstimate == 0) {
      AllowSwitchFromAlu = true;
    } else {
      unsigned NeededWF = 62.5f / ALUFetchRationEstimate;
      LLVM_DEBUG(dbgs() << NeededWF << " approx. Wavefronts Required\n");
      // The local GPR requirement is assumed to be "dominated" by the TEX
      // clause (which consumes 128 bits regs): ALU instructions before and
      // after TEX are likely to consume or generate values from/for it.
      // Fetch instructions are assumed to be either TnXYZW = TEX TnXYZW
      // (one GPR) or TmXYZW = TnXYZW (two GPRs), hence the factor of 2.
      // (TODO : use RegisterPressure)
      // If too many GPRs would be used, flush the fetch instructions to
      // lower the register pressure on 128 bits regs.
      unsigned NearRegisterRequirement = 2 * Available[IDFetch].size();
      if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement))
        AllowSwitchFromAlu = true;
    }
  }

  SUnit *SU = nullptr;

  // Stay in (or enter) an ALU clause when the rules above permit it.
  const bool TryAlu =
      (CurInstKind == IDAlu) ? !AllowSwitchFromAlu : AllowSwitchToAlu;
  if (TryAlu) {
    SU = pickAlu();
    // No regular ALU candidate: fall back to the oldest queued copy.
    if (!SU && !PhysicalRegCopy.empty()) {
      SU = PhysicalRegCopy.front();
      PhysicalRegCopy.erase(PhysicalRegCopy.begin());
    }
    if (SU) {
      // A full ALU clause restarts its emission counter.
      if (CurEmitted >= InstKindLimit[IDAlu])
        CurEmitted = 0;
      NextInstKind = IDAlu;
    }
  }

  // Next preference: a fetch instruction.
  if (!SU) {
    SU = pickOther(IDFetch);
    if (SU)
      NextInstKind = IDFetch;
  }

  // Last resort: any other instruction.
  if (!SU) {
    SU = pickOther(IDOther);
    if (SU)
      NextInstKind = IDOther;
  }

  LLVM_DEBUG(if (SU) {
    dbgs() << " ** Pick node **\n";
    DAG->dumpNode(*SU);
  } else {
    dbgs() << "NO NODE \n";
    for (const SUnit &Unscheduled : DAG->SUnits)
      if (!Unscheduled.isScheduled)
        DAG->dumpNode(Unscheduled);
  });

  return SU;
}
141
142
54.5k
/// Update the clause bookkeeping after \p SU has been scheduled.
///
/// Switches the current instruction kind to the one chosen by pickNode(),
/// accounts for what \p SU adds to CurEmitted, and makes pending fetch
/// instructions available unless a fetch was just scheduled.
void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
  const bool KindChanged = NextInstKind != CurInstKind;
  if (KindChanged) {
    LLVM_DEBUG(dbgs() << "Instruction Type Switch\n");
    // Leaving ALU scheduling marks every slot of the group as occupied.
    if (NextInstKind != IDAlu)
      OccupedSlotsMask |= 31;
    CurEmitted = 0;
    CurInstKind = NextInstKind;
  }

  if (CurInstKind != IDAlu) {
    ++CurEmitted;
  } else {
    AluInstCount ++;
    const AluKind Kind = getAluKind(SU);
    if (Kind == AluT_XYZW) {
      // A whole-group instruction is counted as four.
      CurEmitted += 4;
    } else if (Kind != AluDiscarded) {
      // One for the instruction itself, plus one for every operand that is
      // the ALU_LITERAL_X register. Discarded instructions count as nothing.
      ++CurEmitted;
      for (const MachineOperand &MO : SU->getInstr()->operands()) {
        if (MO.isReg() && MO.getReg() == R600::ALU_LITERAL_X)
          ++CurEmitted;
      }
    }
  }

  LLVM_DEBUG(dbgs() << CurEmitted << " Instructions Emitted in this clause\n");

  if (CurInstKind == IDFetch)
    FetchInstCount++;
  else
    MoveUnits(Pending[IDFetch], Available[IDFetch]);
}
180
181
static bool
182
54.5k
isPhysicalRegCopy(MachineInstr *MI) {
183
54.5k
  if (MI->getOpcode() != R600::COPY)
184
52.1k
    return false;
185
2.39k
186
2.39k
  return !TargetRegisterInfo::isVirtualRegister(MI->getOperand(1).getReg());
187
2.39k
}
188
189
10.5k
/// Top-release notification. This strategy does not queue the unit here; it
/// only logs it in debug builds.
void R600SchedStrategy::releaseTopNode(SUnit *SU) {
  LLVM_DEBUG({
    dbgs() << "Top Releasing ";
    DAG->dumpNode(*SU);
  });
}
192
193
54.5k
/// Queue \p SU now that it has been released from the bottom.
///
/// Copies from a non-virtual register go to PhysicalRegCopy; "other"
/// instructions become available immediately; everything else waits in the
/// pending queue of its instruction kind.
void R600SchedStrategy::releaseBottomNode(SUnit *SU) {
  LLVM_DEBUG({
    dbgs() << "Bottom Releasing ";
    DAG->dumpNode(*SU);
  });

  if (isPhysicalRegCopy(SU->getInstr())) {
    PhysicalRegCopy.push_back(SU);
    return;
  }

  const int Kind = getInstKind(SU);

  // There is no export clause, so an "other" instruction can be scheduled as
  // soon as it is ready.
  std::vector<SUnit *> &Queue =
      (Kind == IDOther) ? Available[IDOther] : Pending[Kind];
  Queue.push_back(SU);
}
209
210
bool R600SchedStrategy::regBelongsToClass(unsigned Reg,
211
309k
                                          const TargetRegisterClass *RC) const {
212
309k
  if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
213
0
    return RC->contains(Reg);
214
309k
  } else {
215
309k
    return MRI->getRegClass(Reg) == RC;
216
309k
  }
217
309k
}
218
219
92.6k
/// Classify the ALU instruction of \p SU by the slot(s) it has to occupy.
///
/// Checks run from the most specific to the least specific; the first one
/// that matches decides the result, and AluAny is returned when nothing
/// constrains the instruction.
R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
  MachineInstr *MI = SU->getInstr();
  const unsigned Opcode = MI->getOpcode();

  if (TII->isTransOnly(*MI))
    return AluTrans;

  // Opcodes whose classification is fixed.
  if (Opcode == R600::PRED_X)
    return AluPredX;
  if (Opcode == R600::INTERP_PAIR_XY || Opcode == R600::INTERP_PAIR_ZW ||
      Opcode == R600::INTERP_VEC_LOAD || Opcode == R600::DOT_4)
    return AluT_XYZW;
  // A COPY of an undef source will become a KILL: do not consider it in
  // scheduling.
  if (Opcode == R600::COPY && MI->getOperand(1).isUndef())
    return AluDiscarded;

  // Does the instruction take a whole IG ?
  // XXX: Is it possible to add a helper function in R600InstrInfo that can
  // be used here and in R600PacketizerList::isSoloInstruction() ?
  const bool TakesWholeGroup =
      TII->isVector(*MI) || TII->isCubeOp(Opcode) ||
      TII->isReductionOp(Opcode) || Opcode == R600::GROUP_BARRIER;
  if (TakesWholeGroup)
    return AluT_XYZW;

  if (TII->isLDSInstr(Opcode))
    return AluT_X;

  // Is the result already assigned to a channel through its sub-register ?
  switch (MI->getOperand(0).getSubReg()) {
  case R600::sub0:
    return AluT_X;
  case R600::sub1:
    return AluT_Y;
  case R600::sub2:
    return AluT_Z;
  case R600::sub3:
    return AluT_W;
  default:
    break;
  }

  // Is the result already a member of a X/Y/Z/W (or 128-bit) class ?
  // Entries are tested in order; the first matching class wins.
  const struct {
    const TargetRegisterClass *RC;
    AluKind Kind;
  } ClassKinds[] = {
      {&R600::R600_TReg32_XRegClass, AluT_X},
      {&R600::R600_AddrRegClass, AluT_X},
      {&R600::R600_TReg32_YRegClass, AluT_Y},
      {&R600::R600_TReg32_ZRegClass, AluT_Z},
      {&R600::R600_TReg32_WRegClass, AluT_W},
      {&R600::R600_Reg128RegClass, AluT_XYZW},
  };
  unsigned DestReg = MI->getOperand(0).getReg();
  for (const auto &Entry : ClassKinds) {
    if (regBelongsToClass(DestReg, Entry.RC))
      return Entry.Kind;
  }

  // LDS src registers cannot be used in the Trans slot.
  if (TII->readsLDSSrcReg(*MI))
    return AluT_XYZW;

  return AluAny;
}
292
293
52.8k
/// Map the instruction of \p SU to the clause kind it is scheduled in.
///
/// \returns IDFetch for instructions using the texture or vertex cache,
///          IDAlu for ALU instructions and a fixed set of opcodes that are
///          treated like them, IDOther for everything else.
int R600SchedStrategy::getInstKind(SUnit* SU) {
  int Opcode = SU->getInstr()->getOpcode();

  const bool IsFetch =
      TII->usesTextureCache(Opcode) || TII->usesVertexCache(Opcode);
  if (IsFetch)
    return IDFetch;

  bool TreatedAsAlu = TII->isALUInstr(Opcode);
  if (!TreatedAsAlu) {
    // Opcodes that are scheduled as ALU even though isALUInstr() said no.
    switch (Opcode) {
    case R600::PRED_X:
    case R600::COPY:
    case R600::CONST_COPY:
    case R600::INTERP_PAIR_XY:
    case R600::INTERP_PAIR_ZW:
    case R600::INTERP_VEC_LOAD:
    case R600::DOT_4:
      TreatedAsAlu = true;
      break;
    default:
      break;
    }
  }

  return TreatedAsAlu ? IDAlu : IDOther;
}
316
317
236k
/// Remove and return the unit closest to the back of \p Q that can join the
/// instruction group currently being built.
///
/// A unit qualifies when adding its instruction to InstructionsGroupCandidate
/// still satisfies the constant-read limitations and, if \p AnyALU is set,
/// the instruction is not vector-only. InstructionsGroupCandidate is left
/// unchanged.
///
/// \returns The removed unit, or nullptr if \p Q is empty or nothing fits.
SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q, bool AnyALU) {
  // Walk from the back of the queue towards the front.
  for (size_t Idx = Q.size(); Idx-- > 0;) {
    SUnit *Candidate = Q[Idx];
    MachineInstr *MI = Candidate->getInstr();

    // Tentatively add the instruction to the group to evaluate the
    // constant-read limitations, then undo the addition either way.
    InstructionsGroupCandidate.push_back(MI);
    const bool Fits =
        TII->fitsConstReadLimitations(InstructionsGroupCandidate) &&
        (!AnyALU || !TII->isVectorOnly(*MI));
    InstructionsGroupCandidate.pop_back();

    if (Fits) {
      Q.erase(Q.begin() + Idx);
      return Candidate;
    }
  }
  return nullptr;
}
335
336
21.7k
void R600SchedStrategy::LoadAlu() {
337
21.7k
  std::vector<SUnit *> &QSrc = Pending[IDAlu];
338
67.1k
  for (unsigned i = 0, e = QSrc.size(); i < e; 
++i45.4k
) {
339
45.4k
    AluKind AK = getAluKind(QSrc[i]);
340
45.4k
    AvailableAlus[AK].push_back(QSrc[i]);
341
45.4k
  }
342
21.7k
  QSrc.clear();
343
21.7k
}
344
345
21.7k
void R600SchedStrategy::PrepareNextSlot() {
346
21.7k
  LLVM_DEBUG(dbgs() << "New Slot\n");
347
21.7k
  assert (OccupedSlotsMask && "Slot wasn't filled");
348
21.7k
  OccupedSlotsMask = 0;
349
21.7k
//  if (HwGen == AMDGPUSubtarget::NORTHERN_ISLANDS)
350
21.7k
//    OccupedSlotsMask |= 16;
351
21.7k
  InstructionsGroupCandidate.clear();
352
21.7k
  LoadAlu();
353
21.7k
}
354
355
28.4k
/// Constrain the register class of the destination of \p MI so that it is
/// assigned to channel \p Slot (0 = X, 1 = Y, 2 = Z, 3 = W).
///
/// Nothing is done when \p MI has no "dst" operand, when the destination
/// register is also read by \p MI, or when \p Slot is not in 0..3.
void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) {
  const int DstIndex = TII->getOperandIdx(MI->getOpcode(), R600::OpName::dst);
  if (DstIndex == -1)
    return;

  unsigned DestReg = MI->getOperand(DstIndex).getReg();

  // PressureRegister crashes if an operand is def and used in the same inst
  // and we try to constraint its regclass, so leave such instructions alone.
  for (const MachineOperand &MO : MI->operands()) {
    if (MO.isReg() && !MO.isDef() && MO.getReg() == DestReg)
      return;
  }

  // Select the single-channel class matching the requested slot.
  const TargetRegisterClass *SlotRC = nullptr;
  switch (Slot) {
  case 0:
    SlotRC = &R600::R600_TReg32_XRegClass;
    break;
  case 1:
    SlotRC = &R600::R600_TReg32_YRegClass;
    break;
  case 2:
    SlotRC = &R600::R600_TReg32_ZRegClass;
    break;
  case 3:
    SlotRC = &R600::R600_TReg32_WRegClass;
    break;
  default:
    return;
  }
  MRI->constrainRegClass(DestReg, SlotRC);
}
386
387
125k
/// Try to find an ALU unit for channel \p Slot.
///
/// Units already bound to that channel are preferred. Otherwise an
/// unconstrained (AluAny) unit is taken and its destination register class is
/// constrained to the channel through AssignSlot().
///
/// \param Slot   Channel index; must be 0 (X), 1 (Y), 2 (Z) or 3 (W).
/// \param AnyAlu Forwarded to PopInst(); when set, vector-only instructions
///               are rejected.
/// \returns The unit removed from its queue, or nullptr if none fits.
SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot, bool AnyAlu) {
  static const AluKind IndexToID[] = {AluT_X, AluT_Y, AluT_Z, AluT_W};
  // Slot indexes the table directly, so catch out-of-range values in
  // asserts builds instead of silently reading past its end.
  assert(Slot < sizeof(IndexToID) / sizeof(IndexToID[0]) &&
         "Slot must name one of the X/Y/Z/W channels");

  SUnit *SlotedSU = PopInst(AvailableAlus[IndexToID[Slot]], AnyAlu);
  if (SlotedSU)
    return SlotedSU;

  SUnit *UnslotedSU = PopInst(AvailableAlus[AluAny], AnyAlu);
  if (UnslotedSU)
    AssignSlot(UnslotedSU->getInstr(), Slot);
  return UnslotedSU;
}
397
398
77.9k
unsigned R600SchedStrategy::AvailablesAluCount() const {
399
77.9k
  return AvailableAlus[AluAny].size() + AvailableAlus[AluT_XYZW].size() +
400
77.9k
      AvailableAlus[AluT_X].size() + AvailableAlus[AluT_Y].size() +
401
77.9k
      AvailableAlus[AluT_Z].size() + AvailableAlus[AluT_W].size() +
402
77.9k
      AvailableAlus[AluTrans].size() + AvailableAlus[AluDiscarded].size() +
403
77.9k
      AvailableAlus[AluPredX].size();
404
77.9k
}
405
406
51.7k
/// Pick the next ALU unit, filling the slots of the current instruction
/// group and opening a new group whenever no remaining slot can be filled.
///
/// OccupedSlotsMask uses bits 0..3 for the four channels that are passed to
/// AttemptFillSlot() and bit 4 (value 16) for the Trans slot.
///
/// \returns The chosen unit, or nullptr once no ALU unit is available or
///          pending.
SUnit* R600SchedStrategy::pickAlu() {
  const int AllSlots = 31;
  const int VectorSlots = 15;
  const int TransSlot = 16;

  // Mark SlotBits as occupied and pop a unit of the given kind.
  auto TakeFrom = [this](AluKind Kind, int SlotBits) -> SUnit * {
    OccupedSlotsMask |= SlotBits;
    return PopInst(AvailableAlus[Kind], false);
  };

  while (AvailablesAluCount() || !Pending[IDAlu].empty()) {
    const bool GroupIsEmpty = !OccupedSlotsMask;
    if (GroupIsEmpty) {
      // Bottom up scheduling : predX must comes first
      if (!AvailableAlus[AluPredX].empty())
        return TakeFrom(AluPredX, AllSlots);
      // Flush physical reg copies (RA will discard them)
      if (!AvailableAlus[AluDiscarded].empty())
        return TakeFrom(AluDiscarded, AllSlots);
      // If there is a T_XYZW alu available, use it
      if (!AvailableAlus[AluT_XYZW].empty())
        return TakeFrom(AluT_XYZW, VectorSlots);
    }

    // On VLIW5 targets, try to fill the Trans slot first.
    const bool TransSlotFree = !(OccupedSlotsMask & TransSlot);
    if (TransSlotFree && VLIW5) {
      if (!AvailableAlus[AluTrans].empty())
        return TakeFrom(AluTrans, TransSlot);
      if (SUnit *TransSU = AttemptFillSlot(3, true)) {
        OccupedSlotsMask |= TransSlot;
        return TransSU;
      }
    }

    // Then the channels, from index 3 down to index 0.
    for (int Chan = 3; Chan >= 0; --Chan) {
      const int ChanBit = 1 << Chan;
      if (OccupedSlotsMask & ChanBit)
        continue;
      SUnit *ChanSU = AttemptFillSlot(Chan, false);
      if (!ChanSU)
        continue;
      OccupedSlotsMask |= ChanBit;
      InstructionsGroupCandidate.push_back(ChanSU->getInstr());
      return ChanSU;
    }

    // Nothing fits in the current group: start the next one and retry.
    PrepareNextSlot();
  }
  return nullptr;
}
452
453
17.4k
/// Pop the most recently queued unit of kind \p QID.
///
/// When the available queue of that kind is empty it is first refilled from
/// the matching pending queue.
///
/// \returns The removed unit, or nullptr if both queues are empty.
SUnit* R600SchedStrategy::pickOther(int QID) {
  std::vector<SUnit *> &Ready = Available[QID];

  if (Ready.empty())
    MoveUnits(Pending[QID], Ready);

  if (Ready.empty())
    return nullptr;

  SUnit *Picked = Ready.back();
  Ready.pop_back();
  return Picked;
}