Coverage Report

Created: 2019-07-24 05:18

/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Target/X86/X86InstrInfo.cpp
Line
Count
Source
1
//===-- X86InstrInfo.cpp - X86 Instruction Information --------------------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
// This file contains the X86 implementation of the TargetInstrInfo class.
10
//
11
//===----------------------------------------------------------------------===//
12
13
#include "X86InstrInfo.h"
14
#include "X86.h"
15
#include "X86InstrBuilder.h"
16
#include "X86InstrFoldTables.h"
17
#include "X86MachineFunctionInfo.h"
18
#include "X86Subtarget.h"
19
#include "X86TargetMachine.h"
20
#include "llvm/ADT/STLExtras.h"
21
#include "llvm/ADT/Sequence.h"
22
#include "llvm/CodeGen/LivePhysRegs.h"
23
#include "llvm/CodeGen/LiveVariables.h"
24
#include "llvm/CodeGen/MachineConstantPool.h"
25
#include "llvm/CodeGen/MachineDominators.h"
26
#include "llvm/CodeGen/MachineFrameInfo.h"
27
#include "llvm/CodeGen/MachineInstrBuilder.h"
28
#include "llvm/CodeGen/MachineModuleInfo.h"
29
#include "llvm/CodeGen/MachineRegisterInfo.h"
30
#include "llvm/CodeGen/StackMaps.h"
31
#include "llvm/IR/DerivedTypes.h"
32
#include "llvm/IR/Function.h"
33
#include "llvm/IR/LLVMContext.h"
34
#include "llvm/MC/MCAsmInfo.h"
35
#include "llvm/MC/MCExpr.h"
36
#include "llvm/MC/MCInst.h"
37
#include "llvm/Support/CommandLine.h"
38
#include "llvm/Support/Debug.h"
39
#include "llvm/Support/ErrorHandling.h"
40
#include "llvm/Support/raw_ostream.h"
41
#include "llvm/Target/TargetOptions.h"
42
43
using namespace llvm;
44
45
#define DEBUG_TYPE "x86-instr-info"
46
47
#define GET_INSTRINFO_CTOR_DTOR
48
#include "X86GenInstrInfo.inc"
49
50
static cl::opt<bool>
51
    NoFusing("disable-spill-fusing",
52
             cl::desc("Disable fusing of spill code into instructions"),
53
             cl::Hidden);
54
static cl::opt<bool>
55
PrintFailedFusing("print-failed-fuse-candidates",
56
                  cl::desc("Print instructions that the allocator wants to"
57
                           " fuse, but the X86 backend currently can't"),
58
                  cl::Hidden);
59
static cl::opt<bool>
60
ReMatPICStubLoad("remat-pic-stub-load",
61
                 cl::desc("Re-materialize load from stub in PIC mode"),
62
                 cl::init(false), cl::Hidden);
63
static cl::opt<unsigned>
64
PartialRegUpdateClearance("partial-reg-update-clearance",
65
                          cl::desc("Clearance between two register writes "
66
                                   "for inserting XOR to avoid partial "
67
                                   "register update"),
68
                          cl::init(64), cl::Hidden);
69
static cl::opt<unsigned>
70
UndefRegClearance("undef-reg-clearance",
71
                  cl::desc("How many idle instructions we would like before "
72
                           "certain undef register reads"),
73
                  cl::init(128), cl::Hidden);
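These are hidden command-line knobs; any tool that links the X86 backend (llc, for example) accepts them, so overriding the defaults might look like "llc foo.ll -remat-pic-stub-load -partial-reg-update-clearance=32" (flag names taken from the declarations above; the exact invocation is illustrative only).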
74
75
76
// Pin the vtable to this file.
77
0
void X86InstrInfo::anchor() {}
78
79
X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
80
    : X86GenInstrInfo((STI.isTarget64BitLP64() ? X86::ADJCALLSTACKDOWN64
81
                                               : X86::ADJCALLSTACKDOWN32),
82
                      (STI.isTarget64BitLP64() ? X86::ADJCALLSTACKUP64
83
                                               : X86::ADJCALLSTACKUP32),
84
                      X86::CATCHRET,
85
                      (STI.is64Bit() ? X86::RETQ : X86::RETL)),
86
15.2k
      Subtarget(STI), RI(STI.getTargetTriple()) {
87
15.2k
}
88
89
bool
90
X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
91
                                    unsigned &SrcReg, unsigned &DstReg,
92
2.99M
                                    unsigned &SubIdx) const {
93
2.99M
  switch (MI.getOpcode()) {
94
2.99M
  
  default: break;
95
2.99M
  case X86::MOVSX16rr8:
96
6.91k
  case X86::MOVZX16rr8:
97
6.91k
  case X86::MOVSX32rr8:
98
6.91k
  case X86::MOVZX32rr8:
99
6.91k
  case X86::MOVSX64rr8:
100
6.91k
    if (!Subtarget.is64Bit())
101
1.88k
      // It's not always legal to reference the low 8-bit of the larger
102
1.88k
      // register in 32-bit mode.
103
1.88k
      return false;
104
5.02k
    LLVM_FALLTHROUGH;
105
7.70k
  case X86::MOVSX32rr16:
106
7.70k
  case X86::MOVZX32rr16:
107
7.70k
  case X86::MOVSX64rr16:
108
7.70k
  case X86::MOVSX64rr32: {
109
7.70k
    if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
110
0
      // Be conservative.
111
0
      return false;
112
7.70k
    SrcReg = MI.getOperand(1).getReg();
113
7.70k
    DstReg = MI.getOperand(0).getReg();
114
7.70k
    switch (MI.getOpcode()) {
115
7.70k
    
    default: llvm_unreachable("Unreachable!");
116
7.70k
    case X86::MOVSX16rr8:
117
5.02k
    case X86::MOVZX16rr8:
118
5.02k
    case X86::MOVSX32rr8:
119
5.02k
    case X86::MOVZX32rr8:
120
5.02k
    case X86::MOVSX64rr8:
121
5.02k
      SubIdx = X86::sub_8bit;
122
5.02k
      break;
123
5.02k
    case X86::MOVSX32rr16:
124
1.48k
    case X86::MOVZX32rr16:
125
1.48k
    case X86::MOVSX64rr16:
126
1.48k
      SubIdx = X86::sub_16bit;
127
1.48k
      break;
128
1.48k
    case X86::MOVSX64rr32:
129
1.19k
      SubIdx = X86::sub_32bit;
130
1.19k
      break;
131
7.70k
    }
132
7.70k
    return true;
133
7.70k
  }
134
2.98M
  }
135
2.98M
  return false;
136
2.98M
}
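A hedged illustration (not part of the file) of what this hook reports: for a machine instruction such as
  %1:gr32 = MOVZX32rr16 %0:gr16
it sets SrcReg = %0, DstReg = %1 and SubIdx = X86::sub_16bit, telling the register coalescer it may treat %0 as the low 16 bits of %1.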
137
138
11.3k
int X86InstrInfo::getSPAdjust(const MachineInstr &MI) const {
139
11.3k
  const MachineFunction *MF = MI.getParent()->getParent();
140
11.3k
  const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
141
11.3k
142
11.3k
  if (isFrameInstr(MI)) {
143
2.78k
    unsigned StackAlign = TFI->getStackAlignment();
144
2.78k
    int SPAdj = alignTo(getFrameSize(MI), StackAlign);
145
2.78k
    SPAdj -= getFrameAdjustment(MI);
146
2.78k
    if (!isFrameSetup(MI))
147
1.39k
      SPAdj = -SPAdj;
148
2.78k
    return SPAdj;
149
2.78k
  }
150
8.52k
151
8.52k
  // To know whether a call adjusts the stack, we need information
152
8.52k
  // that is bound to the following ADJCALLSTACKUP pseudo.
153
8.52k
  // Look for the next ADJCALLSTACKUP that follows the call.
154
8.52k
  if (MI.isCall()) {
155
1.38k
    const MachineBasicBlock *MBB = MI.getParent();
156
1.38k
    auto I = ++MachineBasicBlock::const_iterator(MI);
157
1.39k
    for (auto E = MBB->end(); I != E; ++I) {
158
1.39k
      if (I->getOpcode() == getCallFrameDestroyOpcode() ||
159
1.39k
          
          I->isCall())
160
1.38k
        break;
161
1.39k
    }
162
1.38k
163
1.38k
    // If we could not find a frame destroy opcode, then it has already
164
1.38k
    // been simplified, so we don't care.
165
1.38k
    if (I->getOpcode() != getCallFrameDestroyOpcode())
166
0
      return 0;
167
1.38k
168
1.38k
    return -(I->getOperand(1).getImm());
169
1.38k
  }
170
7.14k
171
7.14k
  // Currently handle only PUSHes we can reasonably expect to see
172
7.14k
  // in call sequences
173
7.14k
  switch (MI.getOpcode()) {
174
7.14k
  default:
175
3.76k
    return 0;
176
7.14k
  case X86::PUSH32i8:
177
3.27k
  case X86::PUSH32r:
178
3.27k
  case X86::PUSH32rmm:
179
3.27k
  case X86::PUSH32rmr:
180
3.27k
  case X86::PUSHi32:
181
3.27k
    return 4;
182
3.27k
  case X86::PUSH64i8:
183
108
  case X86::PUSH64r:
184
108
  case X86::PUSH64rmm:
185
108
  case X86::PUSH64rmr:
186
108
  case X86::PUSH64i32:
187
108
    return 8;
188
7.14k
  }
189
7.14k
}
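An illustrative reading of the cases above (the values follow directly from the code; the instructions shown are a sketch):
  // PUSH64r $rbx   -> getSPAdjust() == 8
  // PUSH32r $ebx   -> getSPAdjust() == 4
A frame-setup pseudo returns its frame size rounded up to the stack alignment (minus any recorded adjustment), the matching frame-destroy pseudo returns the negated value, and a CALL takes its value from the ADJCALLSTACKUP that follows it.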
190
191
/// Return true and the FrameIndex if the specified
192
/// operand and following operands form a reference to the stack frame.
193
bool X86InstrInfo::isFrameOperand(const MachineInstr &MI, unsigned int Op,
194
467k
                                  int &FrameIndex) const {
195
467k
  if (MI.getOperand(Op + X86::AddrBaseReg).isFI() &&
196
467k
      
      MI.getOperand(Op + X86::AddrScaleAmt).isImm() &&
197
467k
      MI.getOperand(Op + X86::AddrIndexReg).isReg() &&
198
467k
      MI.getOperand(Op + X86::AddrDisp).isImm() &&
199
467k
      MI.getOperand(Op + X86::AddrScaleAmt).getImm() == 1 &&
200
467k
      MI.getOperand(Op + X86::AddrIndexReg).getReg() == 0 &&
201
467k
      MI.getOperand(Op + X86::AddrDisp).getImm() == 0) {
202
110k
    FrameIndex = MI.getOperand(Op + X86::AddrBaseReg).getIndex();
203
110k
    return true;
204
110k
  }
205
357k
  return false;
206
357k
}
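For reference, a sketch of the five-operand memory reference this helper walks (operand offsets per the X86::Addr* constants in X86BaseInfo.h):
  // Op + X86::AddrBaseReg    (0)  base register or frame index
  // Op + X86::AddrScaleAmt   (1)  scale immediate (1, 2, 4 or 8)
  // Op + X86::AddrIndexReg   (2)  index register
  // Op + X86::AddrDisp       (3)  displacement
  // Op + X86::AddrSegmentReg (4)  segment register
isFrameOperand() accepts only the simple form "frame-index + scale 1 + no index + displacement 0".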
207
208
2.16M
static bool isFrameLoadOpcode(int Opcode, unsigned &MemBytes) {
209
2.16M
  switch (Opcode) {
210
2.16M
  default:
211
1.70M
    return false;
212
2.16M
  case X86::MOV8rm:
213
23.5k
  case X86::KMOVBkm:
214
23.5k
    MemBytes = 1;
215
23.5k
    return true;
216
23.5k
  case X86::MOV16rm:
217
3.58k
  case X86::KMOVWkm:
218
3.58k
    MemBytes = 2;
219
3.58k
    return true;
220
166k
  case X86::MOV32rm:
221
166k
  case X86::MOVSSrm:
222
166k
  case X86::MOVSSrm_alt:
223
166k
  case X86::VMOVSSrm:
224
166k
  case X86::VMOVSSrm_alt:
225
166k
  case X86::VMOVSSZrm:
226
166k
  case X86::VMOVSSZrm_alt:
227
166k
  case X86::KMOVDkm:
228
166k
    MemBytes = 4;
229
166k
    return true;
230
185k
  case X86::MOV64rm:
231
185k
  case X86::LD_Fp64m:
232
185k
  case X86::MOVSDrm:
233
185k
  case X86::MOVSDrm_alt:
234
185k
  case X86::VMOVSDrm:
235
185k
  case X86::VMOVSDrm_alt:
236
185k
  case X86::VMOVSDZrm:
237
185k
  case X86::VMOVSDZrm_alt:
238
185k
  case X86::MMX_MOVD64rm:
239
185k
  case X86::MMX_MOVQ64rm:
240
185k
  case X86::KMOVQkm:
241
185k
    MemBytes = 8;
242
185k
    return true;
243
185k
  case X86::MOVAPSrm:
244
62.6k
  case X86::MOVUPSrm:
245
62.6k
  case X86::MOVAPDrm:
246
62.6k
  case X86::MOVUPDrm:
247
62.6k
  case X86::MOVDQArm:
248
62.6k
  case X86::MOVDQUrm:
249
62.6k
  case X86::VMOVAPSrm:
250
62.6k
  case X86::VMOVUPSrm:
251
62.6k
  case X86::VMOVAPDrm:
252
62.6k
  case X86::VMOVUPDrm:
253
62.6k
  case X86::VMOVDQArm:
254
62.6k
  case X86::VMOVDQUrm:
255
62.6k
  case X86::VMOVAPSZ128rm:
256
62.6k
  case X86::VMOVUPSZ128rm:
257
62.6k
  case X86::VMOVAPSZ128rm_NOVLX:
258
62.6k
  case X86::VMOVUPSZ128rm_NOVLX:
259
62.6k
  case X86::VMOVAPDZ128rm:
260
62.6k
  case X86::VMOVUPDZ128rm:
261
62.6k
  case X86::VMOVDQU8Z128rm:
262
62.6k
  case X86::VMOVDQU16Z128rm:
263
62.6k
  case X86::VMOVDQA32Z128rm:
264
62.6k
  case X86::VMOVDQU32Z128rm:
265
62.6k
  case X86::VMOVDQA64Z128rm:
266
62.6k
  case X86::VMOVDQU64Z128rm:
267
62.6k
    MemBytes = 16;
268
62.6k
    return true;
269
62.6k
  case X86::VMOVAPSYrm:
270
20.9k
  case X86::VMOVUPSYrm:
271
20.9k
  case X86::VMOVAPDYrm:
272
20.9k
  case X86::VMOVUPDYrm:
273
20.9k
  case X86::VMOVDQAYrm:
274
20.9k
  case X86::VMOVDQUYrm:
275
20.9k
  case X86::VMOVAPSZ256rm:
276
20.9k
  case X86::VMOVUPSZ256rm:
277
20.9k
  case X86::VMOVAPSZ256rm_NOVLX:
278
20.9k
  case X86::VMOVUPSZ256rm_NOVLX:
279
20.9k
  case X86::VMOVAPDZ256rm:
280
20.9k
  case X86::VMOVUPDZ256rm:
281
20.9k
  case X86::VMOVDQU8Z256rm:
282
20.9k
  case X86::VMOVDQU16Z256rm:
283
20.9k
  case X86::VMOVDQA32Z256rm:
284
20.9k
  case X86::VMOVDQU32Z256rm:
285
20.9k
  case X86::VMOVDQA64Z256rm:
286
20.9k
  case X86::VMOVDQU64Z256rm:
287
20.9k
    MemBytes = 32;
288
20.9k
    return true;
289
20.9k
  case X86::VMOVAPSZrm:
290
3.42k
  case X86::VMOVUPSZrm:
291
3.42k
  case X86::VMOVAPDZrm:
292
3.42k
  case X86::VMOVUPDZrm:
293
3.42k
  case X86::VMOVDQU8Zrm:
294
3.42k
  case X86::VMOVDQU16Zrm:
295
3.42k
  case X86::VMOVDQA32Zrm:
296
3.42k
  case X86::VMOVDQU32Zrm:
297
3.42k
  case X86::VMOVDQA64Zrm:
298
3.42k
  case X86::VMOVDQU64Zrm:
299
3.42k
    MemBytes = 64;
300
3.42k
    return true;
301
2.16M
  }
302
2.16M
}
303
304
1.33M
static bool isFrameStoreOpcode(int Opcode, unsigned &MemBytes) {
305
1.33M
  switch (Opcode) {
306
1.33M
  default:
307
1.21M
    return false;
308
1.33M
  case X86::MOV8mr:
309
7.34k
  case X86::KMOVBmk:
310
7.34k
    MemBytes = 1;
311
7.34k
    return true;
312
7.34k
  case X86::MOV16mr:
313
5.99k
  case X86::KMOVWmk:
314
5.99k
    MemBytes = 2;
315
5.99k
    return true;
316
36.7k
  case X86::MOV32mr:
317
36.7k
  case X86::MOVSSmr:
318
36.7k
  case X86::VMOVSSmr:
319
36.7k
  case X86::VMOVSSZmr:
320
36.7k
  case X86::KMOVDmk:
321
36.7k
    MemBytes = 4;
322
36.7k
    return true;
323
36.7k
  case X86::MOV64mr:
324
27.9k
  case X86::ST_FpP64m:
325
27.9k
  case X86::MOVSDmr:
326
27.9k
  case X86::VMOVSDmr:
327
27.9k
  case X86::VMOVSDZmr:
328
27.9k
  case X86::MMX_MOVD64mr:
329
27.9k
  case X86::MMX_MOVQ64mr:
330
27.9k
  case X86::MMX_MOVNTQmr:
331
27.9k
  case X86::KMOVQmk:
332
27.9k
    MemBytes = 8;
333
27.9k
    return true;
334
27.9k
  case X86::MOVAPSmr:
335
23.8k
  case X86::MOVUPSmr:
336
23.8k
  case X86::MOVAPDmr:
337
23.8k
  case X86::MOVUPDmr:
338
23.8k
  case X86::MOVDQAmr:
339
23.8k
  case X86::MOVDQUmr:
340
23.8k
  case X86::VMOVAPSmr:
341
23.8k
  case X86::VMOVUPSmr:
342
23.8k
  case X86::VMOVAPDmr:
343
23.8k
  case X86::VMOVUPDmr:
344
23.8k
  case X86::VMOVDQAmr:
345
23.8k
  case X86::VMOVDQUmr:
346
23.8k
  case X86::VMOVUPSZ128mr:
347
23.8k
  case X86::VMOVAPSZ128mr:
348
23.8k
  case X86::VMOVUPSZ128mr_NOVLX:
349
23.8k
  case X86::VMOVAPSZ128mr_NOVLX:
350
23.8k
  case X86::VMOVUPDZ128mr:
351
23.8k
  case X86::VMOVAPDZ128mr:
352
23.8k
  case X86::VMOVDQA32Z128mr:
353
23.8k
  case X86::VMOVDQU32Z128mr:
354
23.8k
  case X86::VMOVDQA64Z128mr:
355
23.8k
  case X86::VMOVDQU64Z128mr:
356
23.8k
  case X86::VMOVDQU8Z128mr:
357
23.8k
  case X86::VMOVDQU16Z128mr:
358
23.8k
    MemBytes = 16;
359
23.8k
    return true;
360
23.8k
  case X86::VMOVUPSYmr:
361
7.16k
  case X86::VMOVAPSYmr:
362
7.16k
  case X86::VMOVUPDYmr:
363
7.16k
  case X86::VMOVAPDYmr:
364
7.16k
  case X86::VMOVDQUYmr:
365
7.16k
  case X86::VMOVDQAYmr:
366
7.16k
  case X86::VMOVUPSZ256mr:
367
7.16k
  case X86::VMOVAPSZ256mr:
368
7.16k
  case X86::VMOVUPSZ256mr_NOVLX:
369
7.16k
  case X86::VMOVAPSZ256mr_NOVLX:
370
7.16k
  case X86::VMOVUPDZ256mr:
371
7.16k
  case X86::VMOVAPDZ256mr:
372
7.16k
  case X86::VMOVDQU8Z256mr:
373
7.16k
  case X86::VMOVDQU16Z256mr:
374
7.16k
  case X86::VMOVDQA32Z256mr:
375
7.16k
  case X86::VMOVDQU32Z256mr:
376
7.16k
  case X86::VMOVDQA64Z256mr:
377
7.16k
  case X86::VMOVDQU64Z256mr:
378
7.16k
    MemBytes = 32;
379
7.16k
    return true;
380
7.16k
  case X86::VMOVUPSZmr:
381
1.81k
  case X86::VMOVAPSZmr:
382
1.81k
  case X86::VMOVUPDZmr:
383
1.81k
  case X86::VMOVAPDZmr:
384
1.81k
  case X86::VMOVDQU8Zmr:
385
1.81k
  case X86::VMOVDQU16Zmr:
386
1.81k
  case X86::VMOVDQA32Zmr:
387
1.81k
  case X86::VMOVDQU32Zmr:
388
1.81k
  case X86::VMOVDQA64Zmr:
389
1.81k
  case X86::VMOVDQU64Zmr:
390
1.81k
    MemBytes = 64;
391
1.81k
    return true;
392
0
  }
393
0
  return false;
394
0
}
395
396
unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
397
802k
                                           int &FrameIndex) const {
398
802k
  unsigned Dummy;
399
802k
  return X86InstrInfo::isLoadFromStackSlot(MI, FrameIndex, Dummy);
400
802k
}
401
402
unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
403
                                           int &FrameIndex,
404
1.31M
                                           unsigned &MemBytes) const {
405
1.31M
  if (isFrameLoadOpcode(MI.getOpcode(), MemBytes))
406
401k
    if (MI.getOperand(0).getSubReg() == 0 && isFrameOperand(MI, 1, FrameIndex))
407
103k
      return MI.getOperand(0).getReg();
408
1.21M
  return 0;
409
1.21M
}
410
411
unsigned X86InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr &MI,
412
851k
                                                 int &FrameIndex) const {
413
851k
  unsigned Dummy;
414
851k
  if (isFrameLoadOpcode(MI.getOpcode(), Dummy)) {
415
65.0k
    unsigned Reg;
416
65.0k
    if ((Reg = isLoadFromStackSlot(MI, FrameIndex)))
417
0
      return Reg;
418
65.0k
    // Check for post-frame index elimination operations
419
65.0k
    SmallVector<const MachineMemOperand *, 1> Accesses;
420
65.0k
    if (hasLoadFromStackSlot(MI, Accesses)) {
421
29.5k
      FrameIndex =
422
29.5k
          cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
423
29.5k
              ->getFrameIndex();
424
29.5k
      return 1;
425
29.5k
    }
426
821k
  }
427
821k
  return 0;
428
821k
}
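A sketch of why the post-frame-index-elimination variant exists (the MIR shown is illustrative): before elimination a reload is still written against the frame index,
  // %0:gr32 = MOV32rm %stack.0, 1, $noreg, 0, $noreg
and isLoadFromStackSlot() recognizes it directly; after elimination the base has become a physical register plus an offset,
  // $eax = MOV32rm $rbp, 1, $noreg, -8, $noreg :: (load 4 from %stack.0)
so this function falls back to the FixedStackPseudoSourceValue attached to the memory operand.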
429
430
unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
431
502k
                                          int &FrameIndex) const {
432
502k
  unsigned Dummy;
433
502k
  return X86InstrInfo::isStoreToStackSlot(MI, FrameIndex, Dummy);
434
502k
}
435
436
unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
437
                                          int &FrameIndex,
438
528k
                                          unsigned &MemBytes) const {
439
528k
  if (isFrameStoreOpcode(MI.getOpcode(), MemBytes))
440
73.4k
    if (MI.getOperand(X86::AddrNumOperands).getSubReg() == 0 &&
441
73.4k
        
        isFrameOperand(MI, 0, FrameIndex))
442
6.66k
      return MI.getOperand(X86::AddrNumOperands).getReg();
443
521k
  return 0;
444
521k
}
445
446
unsigned X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI,
447
802k
                                                int &FrameIndex) const {
448
802k
  unsigned Dummy;
449
802k
  if (isFrameStoreOpcode(MI.getOpcode(), Dummy)) {
450
37.4k
    unsigned Reg;
451
37.4k
    if ((Reg = isStoreToStackSlot(MI, FrameIndex)))
452
0
      return Reg;
453
37.4k
    // Check for post-frame index elimination operations
454
37.4k
    SmallVector<const MachineMemOperand *, 1> Accesses;
455
37.4k
    if (hasStoreToStackSlot(MI, Accesses)) {
456
16.1k
      FrameIndex =
457
16.1k
          cast<FixedStackPseudoSourceValue>(Accesses.front()->getPseudoValue())
458
16.1k
              ->getFrameIndex();
459
16.1k
      return 1;
460
16.1k
    }
461
786k
  }
462
786k
  return 0;
463
786k
}
464
465
/// Return true if register is PIC base, i.e. defined by X86::MOVPC32r.
466
68.5k
static bool regIsPICBase(unsigned BaseReg, const MachineRegisterInfo &MRI) {
467
68.5k
  // Don't waste compile time scanning use-def chains of physregs.
468
68.5k
  if (!TargetRegisterInfo::isVirtualRegister(BaseReg))
469
42.1k
    return false;
470
26.4k
  bool isPICBase = false;
471
26.4k
  for (MachineRegisterInfo::def_instr_iterator I = MRI.def_instr_begin(BaseReg),
472
34.6k
         E = MRI.def_instr_end(); I != E; ++I) {
473
26.4k
    MachineInstr *DefMI = &*I;
474
26.4k
    if (DefMI->getOpcode() != X86::MOVPC32r)
475
18.3k
      return false;
476
8.18k
    assert(!isPICBase && "More than one PIC base?");
477
8.18k
    isPICBase = true;
478
8.18k
  }
479
26.4k
  
return isPICBase8.11k
;
480
26.4k
}
481
482
bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
483
388k
                                                     AliasAnalysis *AA) const {
484
388k
  switch (MI.getOpcode()) {
485
388k
  
  default: break;
486
388k
  case X86::MOV8rm:
487
147k
  case X86::MOV8rm_NOREX:
488
147k
  case X86::MOV16rm:
489
147k
  case X86::MOV32rm:
490
147k
  case X86::MOV64rm:
491
147k
  case X86::MOVSSrm:
492
147k
  case X86::MOVSSrm_alt:
493
147k
  case X86::MOVSDrm:
494
147k
  case X86::MOVSDrm_alt:
495
147k
  case X86::MOVAPSrm:
496
147k
  case X86::MOVUPSrm:
497
147k
  case X86::MOVAPDrm:
498
147k
  case X86::MOVUPDrm:
499
147k
  case X86::MOVDQArm:
500
147k
  case X86::MOVDQUrm:
501
147k
  case X86::VMOVSSrm:
502
147k
  case X86::VMOVSSrm_alt:
503
147k
  case X86::VMOVSDrm:
504
147k
  case X86::VMOVSDrm_alt:
505
147k
  case X86::VMOVAPSrm:
506
147k
  case X86::VMOVUPSrm:
507
147k
  case X86::VMOVAPDrm:
508
147k
  case X86::VMOVUPDrm:
509
147k
  case X86::VMOVDQArm:
510
147k
  case X86::VMOVDQUrm:
511
147k
  case X86::VMOVAPSYrm:
512
147k
  case X86::VMOVUPSYrm:
513
147k
  case X86::VMOVAPDYrm:
514
147k
  case X86::VMOVUPDYrm:
515
147k
  case X86::VMOVDQAYrm:
516
147k
  case X86::VMOVDQUYrm:
517
147k
  case X86::MMX_MOVD64rm:
518
147k
  case X86::MMX_MOVQ64rm:
519
147k
  // AVX-512
520
147k
  case X86::VMOVSSZrm:
521
147k
  case X86::VMOVSSZrm_alt:
522
147k
  case X86::VMOVSDZrm:
523
147k
  case X86::VMOVSDZrm_alt:
524
147k
  case X86::VMOVAPDZ128rm:
525
147k
  case X86::VMOVAPDZ256rm:
526
147k
  case X86::VMOVAPDZrm:
527
147k
  case X86::VMOVAPSZ128rm:
528
147k
  case X86::VMOVAPSZ256rm:
529
147k
  case X86::VMOVAPSZ128rm_NOVLX:
530
147k
  case X86::VMOVAPSZ256rm_NOVLX:
531
147k
  case X86::VMOVAPSZrm:
532
147k
  case X86::VMOVDQA32Z128rm:
533
147k
  case X86::VMOVDQA32Z256rm:
534
147k
  case X86::VMOVDQA32Zrm:
535
147k
  case X86::VMOVDQA64Z128rm:
536
147k
  case X86::VMOVDQA64Z256rm:
537
147k
  case X86::VMOVDQA64Zrm:
538
147k
  case X86::VMOVDQU16Z128rm:
539
147k
  case X86::VMOVDQU16Z256rm:
540
147k
  case X86::VMOVDQU16Zrm:
541
147k
  case X86::VMOVDQU32Z128rm:
542
147k
  case X86::VMOVDQU32Z256rm:
543
147k
  case X86::VMOVDQU32Zrm:
544
147k
  case X86::VMOVDQU64Z128rm:
545
147k
  case X86::VMOVDQU64Z256rm:
546
147k
  case X86::VMOVDQU64Zrm:
547
147k
  case X86::VMOVDQU8Z128rm:
548
147k
  case X86::VMOVDQU8Z256rm:
549
147k
  case X86::VMOVDQU8Zrm:
550
147k
  case X86::VMOVUPDZ128rm:
551
147k
  case X86::VMOVUPDZ256rm:
552
147k
  case X86::VMOVUPDZrm:
553
147k
  case X86::VMOVUPSZ128rm:
554
147k
  case X86::VMOVUPSZ256rm:
555
147k
  case X86::VMOVUPSZ128rm_NOVLX:
556
147k
  case X86::VMOVUPSZ256rm_NOVLX:
557
147k
  case X86::VMOVUPSZrm: {
558
147k
    // Loads from constant pools are trivially rematerializable.
559
147k
    if (MI.getOperand(1 + X86::AddrBaseReg).isReg() &&
560
147k
        
        MI.getOperand(1 + X86::AddrScaleAmt).isImm() &&
561
147k
        MI.getOperand(1 + X86::AddrIndexReg).isReg() &&
562
147k
        MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
563
147k
        MI.isDereferenceableInvariantLoad(AA)) {
564
24.4k
      unsigned BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
565
24.4k
      if (BaseReg == 0 || BaseReg == X86::RIP)
566
22.0k
        return true;
567
2.38k
      // Allow re-materialization of PIC load.
568
2.38k
      if (!ReMatPICStubLoad && MI.getOperand(1 + X86::AddrDisp).isGlobal())
569
2.11k
        return false;
570
263
      const MachineFunction &MF = *MI.getParent()->getParent();
571
263
      const MachineRegisterInfo &MRI = MF.getRegInfo();
572
263
      return regIsPICBase(BaseReg, MRI);
573
263
    }
574
123k
    return false;
575
123k
  }
576
123k
577
123k
  case X86::LEA32r:
578
112k
  case X86::LEA64r: {
579
112k
    if (MI.getOperand(1 + X86::AddrScaleAmt).isImm() &&
580
112k
        MI.getOperand(1 + X86::AddrIndexReg).isReg() &&
581
112k
        MI.getOperand(1 + X86::AddrIndexReg).getReg() == 0 &&
582
112k
        
        !MI.getOperand(1 + X86::AddrDisp).isReg()) {
583
99.8k
      // lea fi#, lea GV, etc. are all rematerializable.
584
99.8k
      if (!MI.getOperand(1 + X86::AddrBaseReg).isReg())
585
31.5k
        return true;
586
68.3k
      unsigned BaseReg = MI.getOperand(1 + X86::AddrBaseReg).getReg();
587
68.3k
      if (BaseReg == 0)
588
8
        return true;
589
68.3k
      // Allow re-materialization of lea PICBase + x.
590
68.3k
      const MachineFunction &MF = *MI.getParent()->getParent();
591
68.3k
      const MachineRegisterInfo &MRI = MF.getRegInfo();
592
68.3k
      return regIsPICBase(BaseReg, MRI);
593
68.3k
    }
594
12.3k
    return false;
595
12.3k
  }
596
128k
  }
597
128k
598
128k
  // All other instructions marked M_REMATERIALIZABLE are always trivially
599
128k
  // rematerializable.
600
128k
  return true;
601
128k
}
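An illustrative pair of cases for the load opcodes handled above: a RIP-relative load from the constant pool, e.g.
  // %0:vr128 = MOVAPSrm $rip, 1, $noreg, %const.0, $noreg
is invariant and dereferenceable, so it can simply be re-executed at the new point; the same opcode with an arbitrary pointer argument as base is not provably invariant and is rejected.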
602
603
void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
604
                                 MachineBasicBlock::iterator I,
605
                                 unsigned DestReg, unsigned SubIdx,
606
                                 const MachineInstr &Orig,
607
86.8k
                                 const TargetRegisterInfo &TRI) const {
608
86.8k
  bool ClobbersEFLAGS = Orig.modifiesRegister(X86::EFLAGS, &TRI);
609
86.8k
  if (ClobbersEFLAGS && !isSafeToClobberEFLAGS(MBB, I)) {
610
1.00k
    // The instruction clobbers EFLAGS. Re-materialize as MOV32ri to avoid side
611
1.00k
    // effects.
612
1.00k
    int Value;
613
1.00k
    switch (Orig.getOpcode()) {
614
1.00k
    
    case X86::MOV32r0: Value = 0; break;
615
1.00k
    case X86::MOV32r1: Value = 1; break;
616
1.00k
    case X86::MOV32r_1: Value = -1; break;
617
1.00k
    default:
618
0
      llvm_unreachable("Unexpected instruction!");
619
1.00k
    }
620
1.00k
621
1.00k
    const DebugLoc &DL = Orig.getDebugLoc();
622
1.00k
    BuildMI(MBB, I, DL, get(X86::MOV32ri))
623
1.00k
        .add(Orig.getOperand(0))
624
1.00k
        .addImm(Value);
625
85.8k
  } else {
626
85.8k
    MachineInstr *MI = MBB.getParent()->CloneMachineInstr(&Orig);
627
85.8k
    MBB.insert(I, MI);
628
85.8k
  }
629
86.8k
630
86.8k
  MachineInstr &NewMI = *std::prev(I);
631
86.8k
  NewMI.substituteRegister(Orig.getOperand(0).getReg(), DestReg, SubIdx, TRI);
632
86.8k
}
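A sketch of the EFLAGS special case above: MOV32r0 is normally expanded to "xor %eax, %eax", which writes EFLAGS, so rematerializing it between a compare and its conditional branch would corrupt the flags; in that situation the code emits "mov $0, %eax" (MOV32ri 0) instead, which is longer but flag-neutral.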
633
634
/// True if MI has a condition code def, e.g. EFLAGS, that is not marked dead.
635
27.6k
bool X86InstrInfo::hasLiveCondCodeDef(MachineInstr &MI) const {
636
131k
  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
637
104k
    MachineOperand &MO = MI.getOperand(i);
638
104k
    if (MO.isReg() && MO.isDef() &&
639
104k
        MO.getReg() == X86::EFLAGS && !MO.isDead()) {
640
959
      return true;
641
959
    }
642
104k
  }
643
27.6k
  
  return false;
644
27.6k
}
645
646
/// Return the shift count for a machine operand, truncated to the width the
/// hardware actually uses.
647
inline static unsigned getTruncatedShiftCount(const MachineInstr &MI,
648
3.50k
                                              unsigned ShiftAmtOperandIdx) {
649
3.50k
  // The shift count is six bits with the REX.W prefix and five bits without.
650
3.50k
  unsigned ShiftCountMask = (MI.getDesc().TSFlags & X86II::REX_W) ? 63 : 31;
651
3.50k
  unsigned Imm = MI.getOperand(ShiftAmtOperandIdx).getImm();
652
3.50k
  return Imm & ShiftCountMask;
653
3.50k
}
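A worked example of the masking above: the hardware truncates shift counts to 6 bits for 64-bit (REX.W) operations and to 5 bits otherwise, so
  // SHL64ri %x, 65   ->  65 & 63 == 1   (effective shift of 1)
  // SHL32ri %x, 33   ->  33 & 31 == 1
which is why the helper applies the same mask before any LEA-based rewrite is considered.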
654
655
/// Check whether the given shift count is appropriate, i.e. whether it
656
/// can be represented by a LEA instruction.
657
3.03k
inline static bool isTruncatedShiftCountForLEA(unsigned ShAmt) {
658
3.03k
  // Left shift instructions can be transformed into load-effective-address
659
3.03k
  // instructions if we can encode them appropriately.
660
3.03k
  // A LEA instruction utilizes a SIB byte to encode its scale factor.
661
3.03k
  // The SIB.scale field is two bits wide which means that we can encode any
662
3.03k
  // shift amount less than 4.
663
3.03k
  return ShAmt < 4 && ShAmt > 0;
664
3.03k
}
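A concrete reading of the SIB constraint: the 2-bit scale field encodes factors 1, 2, 4 and 8, so left shifts by 1 through 3 map onto an LEA scale, e.g.
  // shlq $3, %rax        ==>   leaq (,%rax,8), %rcx
while a shift by 4 would need scale 16, which SIB cannot encode; hence the ShAmt < 4 test.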
665
666
bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
667
                                  unsigned Opc, bool AllowSP, unsigned &NewSrc,
668
                                  bool &isKill, MachineOperand &ImplicitOp,
669
14.1k
                                  LiveVariables *LV) const {
670
14.1k
  MachineFunction &MF = *MI.getParent()->getParent();
671
14.1k
  const TargetRegisterClass *RC;
672
14.1k
  if (AllowSP) {
673
5.07k
    RC = Opc != X86::LEA32r ? &X86::GR64RegClass : &X86::GR32RegClass;
674
9.03k
  } else {
675
9.03k
    RC = Opc != X86::LEA32r ?
676
7.70k
      &X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass;
677
9.03k
  }
678
14.1k
  unsigned SrcReg = Src.getReg();
679
14.1k
680
14.1k
  // For both LEA64 and LEA32 the register already has essentially the right
681
14.1k
  // type (32-bit or 64-bit); we may just need to forbid SP.
682
14.1k
  if (Opc != X86::LEA64_32r) {
683
11.4k
    NewSrc = SrcReg;
684
11.4k
    isKill = Src.isKill();
685
11.4k
    assert(!Src.isUndef() && "Undef op doesn't need optimization");
686
11.4k
687
11.4k
    if (TargetRegisterInfo::isVirtualRegister(NewSrc) &&
688
11.4k
        
        !MF.getRegInfo().constrainRegClass(NewSrc, RC))
689
0
      return false;
690
11.4k
691
11.4k
    return true;
692
11.4k
  }
693
2.68k
694
2.68k
  // This is for an LEA64_32r and incoming registers are 32-bit. One way or
695
2.68k
  // another we need to add 64-bit registers to the final MI.
696
2.68k
  if (TargetRegisterInfo::isPhysicalRegister(SrcReg)) {
697
0
    ImplicitOp = Src;
698
0
    ImplicitOp.setImplicit();
699
0
700
0
    NewSrc = getX86SubSuperRegister(Src.getReg(), 64);
701
0
    isKill = Src.isKill();
702
0
    assert(!Src.isUndef() && "Undef op doesn't need optimization");
703
2.68k
  } else {
704
2.68k
    // Virtual register of the wrong class, we have to create a temporary 64-bit
705
2.68k
    // vreg to feed into the LEA.
706
2.68k
    NewSrc = MF.getRegInfo().createVirtualRegister(RC);
707
2.68k
    MachineInstr *Copy =
708
2.68k
        BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(TargetOpcode::COPY))
709
2.68k
            .addReg(NewSrc, RegState::Define | RegState::Undef, X86::sub_32bit)
710
2.68k
            .add(Src);
711
2.68k
712
2.68k
    // Which is obviously going to be dead after we're done with it.
713
2.68k
    isKill = true;
714
2.68k
715
2.68k
    if (LV)
716
2.68k
      LV->replaceKillInstruction(SrcReg, MI, *Copy);
717
2.68k
  }
718
2.68k
719
2.68k
  // We've set all the parameters without issue.
720
2.68k
  return true;
721
2.68k
}
722
723
MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(
724
    unsigned MIOpc, MachineFunction::iterator &MFI, MachineInstr &MI,
725
1.04k
    LiveVariables *LV, bool Is8BitOp) const {
726
1.04k
  // We handle 8-bit adds and various 16-bit opcodes in the switch below.
727
1.04k
  MachineRegisterInfo &RegInfo = MFI->getParent()->getRegInfo();
728
1.04k
  assert((Is8BitOp || RegInfo.getTargetRegisterInfo()->getRegSizeInBits(
729
1.04k
              *RegInfo.getRegClass(MI.getOperand(0).getReg())) == 16) &&
730
1.04k
         "Unexpected type for LEA transform");
731
1.04k
732
1.04k
  // TODO: For a 32-bit target, we need to adjust the LEA variables with
733
1.04k
  // something like this:
734
1.04k
  //   Opcode = X86::LEA32r;
735
1.04k
  //   InRegLEA = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
736
1.04k
  //   OutRegLEA =
737
1.04k
  //       Is8BitOp ? RegInfo.createVirtualRegister(&X86::GR32ABCD_RegClass)
738
1.04k
  //                : RegInfo.createVirtualRegister(&X86::GR32RegClass);
739
1.04k
  if (!Subtarget.is64Bit())
740
188
    return nullptr;
741
858
742
858
  unsigned Opcode = X86::LEA64_32r;
743
858
  unsigned InRegLEA = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
744
858
  unsigned OutRegLEA = RegInfo.createVirtualRegister(&X86::GR32RegClass);
745
858
746
858
  // Build and insert into an implicit UNDEF value. This is OK because
747
858
  // we will be shifting and then extracting the lower 8/16-bits.
748
858
  // This has the potential to cause partial register stall. e.g.
749
858
  //   movw    (%rbp,%rcx,2), %dx
750
858
  //   leal    -65(%rdx), %esi
751
858
  // But testing has shown this *does* help performance in 64-bit mode (at
752
858
  // least on modern x86 machines).
753
858
  MachineBasicBlock::iterator MBBI = MI.getIterator();
754
858
  unsigned Dest = MI.getOperand(0).getReg();
755
858
  unsigned Src = MI.getOperand(1).getReg();
756
858
  bool IsDead = MI.getOperand(0).isDead();
757
858
  bool IsKill = MI.getOperand(1).isKill();
758
858
  unsigned SubReg = Is8BitOp ? X86::sub_8bit : X86::sub_16bit;
759
858
  assert(!MI.getOperand(1).isUndef() && "Undef op doesn't need optimization");
760
858
  BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), InRegLEA);
761
858
  MachineInstr *InsMI =
762
858
      BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
763
858
          .addReg(InRegLEA, RegState::Define, SubReg)
764
858
          .addReg(Src, getKillRegState(IsKill));
765
858
766
858
  MachineInstrBuilder MIB =
767
858
      BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(Opcode), OutRegLEA);
768
858
  switch (MIOpc) {
769
858
  
  default: llvm_unreachable("Unreachable!");
770
858
  case X86::SHL8ri:
771
29
  case X86::SHL16ri: {
772
29
    unsigned ShAmt = MI.getOperand(2).getImm();
773
29
    MIB.addReg(0).addImm(1ULL << ShAmt)
774
29
       .addReg(InRegLEA, RegState::Kill).addImm(0).addReg(0);
775
29
    break;
776
29
  }
777
75
  case X86::INC8r:
778
75
  case X86::INC16r:
779
75
    addRegOffset(MIB, InRegLEA, true, 1);
780
75
    break;
781
75
  case X86::DEC8r:
782
0
  case X86::DEC16r:
783
0
    addRegOffset(MIB, InRegLEA, true, -1);
784
0
    break;
785
431
  case X86::ADD8ri:
786
431
  case X86::ADD8ri_DB:
787
431
  case X86::ADD16ri:
788
431
  case X86::ADD16ri8:
789
431
  case X86::ADD16ri_DB:
790
431
  case X86::ADD16ri8_DB:
791
431
    addRegOffset(MIB, InRegLEA, true, MI.getOperand(2).getImm());
792
431
    break;
793
431
  case X86::ADD8rr:
794
323
  case X86::ADD8rr_DB:
795
323
  case X86::ADD16rr:
796
323
  case X86::ADD16rr_DB: {
797
323
    unsigned Src2 = MI.getOperand(2).getReg();
798
323
    bool IsKill2 = MI.getOperand(2).isKill();
799
323
    assert(!MI.getOperand(2).isUndef() && "Undef op doesn't need optimization");
800
323
    unsigned InRegLEA2 = 0;
801
323
    MachineInstr *InsMI2 = nullptr;
802
323
    if (Src == Src2) {
803
285
      // ADD8rr/ADD16rr killed %reg1028, %reg1028
804
285
      // just a single insert_subreg.
805
285
      addRegReg(MIB, InRegLEA, true, InRegLEA, false);
806
285
    } else {
807
38
      if (Subtarget.is64Bit())
808
38
        InRegLEA2 = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass);
809
0
      else
810
0
        InRegLEA2 = RegInfo.createVirtualRegister(&X86::GR32_NOSPRegClass);
811
38
      // Build and insert into an implicit UNDEF value. This is OK because
812
38
      // we will be shifting and then extracting the lower 8/16-bits.
813
38
      BuildMI(*MFI, &*MIB, MI.getDebugLoc(), get(X86::IMPLICIT_DEF), InRegLEA2);
814
38
      InsMI2 = BuildMI(*MFI, &*MIB, MI.getDebugLoc(), get(TargetOpcode::COPY))
815
38
                   .addReg(InRegLEA2, RegState::Define, SubReg)
816
38
                   .addReg(Src2, getKillRegState(IsKill2));
817
38
      addRegReg(MIB, InRegLEA, true, InRegLEA2, true);
818
38
    }
819
323
    if (LV && IsKill2 && InsMI2)
820
35
      LV->replaceKillInstruction(Src2, MI, *InsMI2);
821
323
    break;
822
858
  }
823
858
  }
824
858
825
858
  MachineInstr *NewMI = MIB;
826
858
  MachineInstr *ExtMI =
827
858
      BuildMI(*MFI, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY))
828
858
          .addReg(Dest, RegState::Define | getDeadRegState(IsDead))
829
858
          .addReg(OutRegLEA, RegState::Kill, SubReg);
830
858
831
858
  if (LV) {
832
858
    // Update live variables.
833
858
    LV->getVarInfo(InRegLEA).Kills.push_back(NewMI);
834
858
    LV->getVarInfo(OutRegLEA).Kills.push_back(ExtMI);
835
858
    if (IsKill)
836
623
      LV->replaceKillInstruction(Src, MI, *InsMI);
837
858
    if (IsDead)
838
0
      LV->replaceKillInstruction(Dest, MI, *ExtMI);
839
858
  }
840
858
841
858
  return ExtMI;
842
858
}
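A sketch of the resulting sequence for an 8-bit immediate add (register names and MIR syntax are illustrative, not taken from the file): a two-address
  // %d:gr8 = ADD8ri %d(tied), 7
becomes roughly
  // %t:gr64_nosp = IMPLICIT_DEF
  // %t.sub_8bit  = COPY %src
  // %out:gr32    = LEA64_32r %t, 1, $noreg, 7, $noreg   ; add done by LEA, EFLAGS untouched
  // %d           = COPY %out.sub_8bit
i.e. the narrow value is widened into an undef 64-bit register, the arithmetic is performed by LEA, and the low bits are extracted again.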
843
844
/// This method must be implemented by targets that
845
/// set the M_CONVERTIBLE_TO_3_ADDR flag.  When this flag is set, the target
846
/// may be able to convert a two-address instruction into a true
847
/// three-address instruction on demand.  This allows the X86 target (for
848
/// example) to convert ADD and SHL instructions into LEA instructions if they
849
/// would require register copies due to two-addressness.
850
///
851
/// This method returns a null pointer if the transformation cannot be
852
/// performed, otherwise it returns the new instruction.
853
///
854
MachineInstr *
855
X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
856
27.6k
                                    MachineInstr &MI, LiveVariables *LV) const {
857
27.6k
  // The following opcodes also sets the condition code register(s). Only
858
27.6k
  // convert them to equivalent lea if the condition code register def's
859
27.6k
  // are dead!
860
27.6k
  if (hasLiveCondCodeDef(MI))
861
959
    return nullptr;
862
26.7k
863
26.7k
  MachineFunction &MF = *MI.getParent()->getParent();
864
26.7k
  // All instructions input are two-addr instructions.  Get the known operands.
865
26.7k
  const MachineOperand &Dest = MI.getOperand(0);
866
26.7k
  const MachineOperand &Src = MI.getOperand(1);
867
26.7k
868
26.7k
  // Ideally, operations with undef should be folded before we get here, but we
869
26.7k
  // can't guarantee it. Bail out because optimizing undefs is a waste of time.
870
26.7k
  // Without this, we have to forward undef state to new register operands to
871
26.7k
  // avoid machine verifier errors.
872
26.7k
  if (Src.isUndef())
873
0
    return nullptr;
874
26.7k
  if (MI.getNumOperands() > 2)
875
26.7k
    if (MI.getOperand(2).isReg() && MI.getOperand(2).isUndef())
876
3
      return nullptr;
877
26.6k
878
26.6k
  MachineInstr *NewMI = nullptr;
879
26.6k
  bool Is64Bit = Subtarget.is64Bit();
880
26.6k
881
26.6k
  bool Is8BitOp = false;
882
26.6k
  unsigned MIOpc = MI.getOpcode();
883
26.6k
  switch (MIOpc) {
884
26.6k
  
  default: llvm_unreachable("Unreachable!");
885
26.6k
  case X86::SHL64ri: {
886
2.28k
    assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
887
2.28k
    unsigned ShAmt = getTruncatedShiftCount(MI, 2);
888
2.28k
    if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;
889
987
890
987
    // LEA can't handle RSP.
891
987
    if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) &&
892
987
        !MF.getRegInfo().constrainRegClass(Src.getReg(),
893
987
                                           &X86::GR64_NOSPRegClass))
894
0
      return nullptr;
895
987
896
987
    NewMI = BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r))
897
987
                .add(Dest)
898
987
                .addReg(0)
899
987
                .addImm(1ULL << ShAmt)
900
987
                .add(Src)
901
987
                .addImm(0)
902
987
                .addReg(0);
903
987
    break;
904
987
  }
905
987
  case X86::SHL32ri: {
906
672
    assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
907
672
    unsigned ShAmt = getTruncatedShiftCount(MI, 2);
908
672
    if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;
909
217
910
217
    unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
911
217
912
217
    // LEA can't handle ESP.
913
217
    bool isKill;
914
217
    unsigned SrcReg;
915
217
    MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
916
217
    if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
917
217
                        SrcReg, isKill, ImplicitOp, LV))
918
0
      return nullptr;
919
217
920
217
    MachineInstrBuilder MIB =
921
217
        BuildMI(MF, MI.getDebugLoc(), get(Opc))
922
217
            .add(Dest)
923
217
            .addReg(0)
924
217
            .addImm(1ULL << ShAmt)
925
217
            .addReg(SrcReg, getKillRegState(isKill))
926
217
            .addImm(0)
927
217
            .addReg(0);
928
217
    if (ImplicitOp.getReg() != 0)
929
0
      MIB.add(ImplicitOp);
930
217
    NewMI = MIB;
931
217
932
217
    break;
933
217
  }
934
217
  case X86::SHL8ri:
935
69
    Is8BitOp = true;
936
69
    LLVM_FALLTHROUGH;
937
71
  case X86::SHL16ri: {
938
71
    assert(MI.getNumOperands() >= 3 && "Unknown shift instruction!");
939
71
    unsigned ShAmt = getTruncatedShiftCount(MI, 2);
940
71
    if (!isTruncatedShiftCountForLEA(ShAmt))
941
42
      return nullptr;
942
29
    return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV, Is8BitOp);
943
29
  }
944
3.44k
  case X86::INC64r:
945
3.44k
  case X86::INC32r: {
946
3.44k
    assert(MI.getNumOperands() >= 2 && "Unknown inc instruction!");
947
3.44k
    unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r :
948
3.44k
        (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
949
3.44k
    bool isKill;
950
3.44k
    unsigned SrcReg;
951
3.44k
    MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
952
3.44k
    if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, SrcReg, isKill,
953
3.44k
                        ImplicitOp, LV))
954
0
      return nullptr;
955
3.44k
956
3.44k
    MachineInstrBuilder MIB =
957
3.44k
        BuildMI(MF, MI.getDebugLoc(), get(Opc))
958
3.44k
            .add(Dest)
959
3.44k
            .addReg(SrcReg, getKillRegState(isKill));
960
3.44k
    if (ImplicitOp.getReg() != 0)
961
0
      MIB.add(ImplicitOp);
962
3.44k
963
3.44k
    NewMI = addOffset(MIB, 1);
964
3.44k
    break;
965
3.44k
  }
966
3.44k
  case X86::DEC64r:
967
2.49k
  case X86::DEC32r: {
968
2.49k
    assert(MI.getNumOperands() >= 2 && "Unknown dec instruction!");
969
2.49k
    unsigned Opc = MIOpc == X86::DEC64r ? X86::LEA64r
970
2.49k
        : (Is64Bit ? X86::LEA64_32r : X86::LEA32r);
971
2.49k
972
2.49k
    bool isKill;
973
2.49k
    unsigned SrcReg;
974
2.49k
    MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
975
2.49k
    if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false, SrcReg, isKill,
976
2.49k
                        ImplicitOp, LV))
977
0
      return nullptr;
978
2.49k
979
2.49k
    MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
980
2.49k
                                  .add(Dest)
981
2.49k
                                  .addReg(SrcReg, getKillRegState(isKill));
982
2.49k
    if (ImplicitOp.getReg() != 0)
983
0
      MIB.add(ImplicitOp);
984
2.49k
985
2.49k
    NewMI = addOffset(MIB, -1);
986
2.49k
987
2.49k
    break;
988
2.49k
  }
989
2.49k
  case X86::DEC8r:
990
126
  case X86::INC8r:
991
126
    Is8BitOp = true;
992
126
    LLVM_FALLTHROUGH;
993
126
  case X86::DEC16r:
994
126
  case X86::INC16r:
995
126
    return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV, Is8BitOp);
996
2.87k
  case X86::ADD64rr:
997
2.87k
  case X86::ADD64rr_DB:
998
2.87k
  case X86::ADD32rr:
999
2.87k
  case X86::ADD32rr_DB: {
1000
2.87k
    assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
1001
2.87k
    unsigned Opc;
1002
2.87k
    if (MIOpc == X86::ADD64rr || MIOpc == X86::ADD64rr_DB)
1003
2.07k
      Opc = X86::LEA64r;
1004
809
    else
1005
809
      Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1006
2.87k
1007
2.87k
    bool isKill;
1008
2.87k
    unsigned SrcReg;
1009
2.87k
    MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1010
2.87k
    if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
1011
2.87k
                        SrcReg, isKill, ImplicitOp, LV))
1012
0
      return nullptr;
1013
2.87k
1014
2.87k
    const MachineOperand &Src2 = MI.getOperand(2);
1015
2.87k
    bool isKill2;
1016
2.87k
    unsigned SrcReg2;
1017
2.87k
    MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false);
1018
2.87k
    if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/ false,
1019
2.87k
                        SrcReg2, isKill2, ImplicitOp2, LV))
1020
0
      return nullptr;
1021
2.87k
1022
2.87k
    MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)).add(Dest);
1023
2.87k
    if (ImplicitOp.getReg() != 0)
1024
0
      MIB.add(ImplicitOp);
1025
2.87k
    if (ImplicitOp2.getReg() != 0)
1026
0
      MIB.add(ImplicitOp2);
1027
2.87k
1028
2.87k
    NewMI = addRegReg(MIB, SrcReg, isKill, SrcReg2, isKill2);
1029
2.87k
    if (LV && Src2.isKill())
1030
218
      LV->replaceKillInstruction(SrcReg2, MI, *NewMI);
1031
2.87k
    break;
1032
2.87k
  }
1033
2.87k
  case X86::ADD8rr:
1034
327
  case X86::ADD8rr_DB:
1035
327
    Is8BitOp = true;
1036
327
    LLVM_FALLTHROUGH;
1037
329
  case X86::ADD16rr:
1038
329
  case X86::ADD16rr_DB:
1039
329
    return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV, Is8BitOp);
1040
11.0k
  case X86::ADD64ri32:
1041
11.0k
  case X86::ADD64ri8:
1042
11.0k
  case X86::ADD64ri32_DB:
1043
11.0k
  case X86::ADD64ri8_DB:
1044
11.0k
    assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
1045
11.0k
    NewMI = addOffset(
1046
11.0k
        BuildMI(MF, MI.getDebugLoc(), get(X86::LEA64r)).add(Dest).add(Src),
1047
11.0k
        MI.getOperand(2));
1048
11.0k
    break;
1049
11.0k
  case X86::ADD32ri:
1050
2.17k
  case X86::ADD32ri8:
1051
2.17k
  case X86::ADD32ri_DB:
1052
2.17k
  case X86::ADD32ri8_DB: {
1053
2.17k
    assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
1054
2.17k
    unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1055
2.17k
1056
2.17k
    bool isKill;
1057
2.17k
    unsigned SrcReg;
1058
2.17k
    MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1059
2.17k
    if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
1060
2.17k
                        SrcReg, isKill, ImplicitOp, LV))
1061
0
      return nullptr;
1062
2.17k
1063
2.17k
    MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1064
2.17k
                                  .add(Dest)
1065
2.17k
                                  .addReg(SrcReg, getKillRegState(isKill));
1066
2.17k
    if (ImplicitOp.getReg() != 0)
1067
0
      MIB.add(ImplicitOp);
1068
2.17k
1069
2.17k
    NewMI = addOffset(MIB, MI.getOperand(2));
1070
2.17k
    break;
1071
2.17k
  }
1072
2.17k
  case X86::ADD8ri:
1073
562
  case X86::ADD8ri_DB:
1074
562
    Is8BitOp = true;
1075
562
    LLVM_FALLTHROUGH;
1076
562
  case X86::ADD16ri:
1077
562
  case X86::ADD16ri8:
1078
562
  case X86::ADD16ri_DB:
1079
562
  case X86::ADD16ri8_DB:
1080
562
    return convertToThreeAddressWithLEA(MIOpc, MFI, MI, LV, Is8BitOp);
1081
562
  case X86::SUB8ri:
1082
0
  case X86::SUB16ri8:
1083
0
  case X86::SUB16ri:
1084
0
    /// FIXME: Support these similar to ADD8ri/ADD16ri*.
1085
0
    return nullptr;
1086
24
  case X86::SUB32ri8:
1087
24
  case X86::SUB32ri: {
1088
24
    int64_t Imm = MI.getOperand(2).getImm();
1089
24
    if (!isInt<32>(-Imm))
1090
0
      return nullptr;
1091
24
1092
24
    assert(MI.getNumOperands() >= 3 && "Unknown add instruction!");
1093
24
    unsigned Opc = Is64Bit ? X86::LEA64_32r : X86::LEA32r;
1094
24
1095
24
    bool isKill;
1096
24
    unsigned SrcReg;
1097
24
    MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
1098
24
    if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
1099
24
                        SrcReg, isKill, ImplicitOp, LV))
1100
0
      return nullptr;
1101
24
1102
24
    MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1103
24
                                  .add(Dest)
1104
24
                                  .addReg(SrcReg, getKillRegState(isKill));
1105
24
    if (ImplicitOp.getReg() != 0)
1106
0
      MIB.add(ImplicitOp);
1107
24
1108
24
    NewMI = addOffset(MIB, -Imm);
1109
24
    break;
1110
24
  }
1111
24
1112
92
  case X86::SUB64ri8:
1113
92
  case X86::SUB64ri32: {
1114
92
    int64_t Imm = MI.getOperand(2).getImm();
1115
92
    if (!isInt<32>(-Imm))
1116
2
      return nullptr;
1117
90
1118
90
    assert(MI.getNumOperands() >= 3 && "Unknown sub instruction!");
1119
90
1120
90
    MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(),
1121
90
                                      get(X86::LEA64r)).add(Dest).add(Src);
1122
90
    NewMI = addOffset(MIB, -Imm);
1123
90
    break;
1124
90
  }
1125
90
1126
90
  case X86::VMOVDQU8Z128rmk:
1127
76
  case X86::VMOVDQU8Z256rmk:
1128
76
  case X86::VMOVDQU8Zrmk:
1129
76
  case X86::VMOVDQU16Z128rmk:
1130
76
  case X86::VMOVDQU16Z256rmk:
1131
76
  case X86::VMOVDQU16Zrmk:
1132
76
  case X86::VMOVDQU32Z128rmk: case X86::VMOVDQA32Z128rmk:
1133
76
  case X86::VMOVDQU32Z256rmk: case X86::VMOVDQA32Z256rmk:
1134
76
  case X86::VMOVDQU32Zrmk:    case X86::VMOVDQA32Zrmk:
1135
76
  case X86::VMOVDQU64Z128rmk: case X86::VMOVDQA64Z128rmk:
1136
76
  case X86::VMOVDQU64Z256rmk: case X86::VMOVDQA64Z256rmk:
1137
76
  case X86::VMOVDQU64Zrmk:    case X86::VMOVDQA64Zrmk:
1138
76
  case X86::VMOVUPDZ128rmk:   case X86::VMOVAPDZ128rmk:
1139
76
  case X86::VMOVUPDZ256rmk:   case X86::VMOVAPDZ256rmk:
1140
76
  case X86::VMOVUPDZrmk:      case X86::VMOVAPDZrmk:
1141
76
  case X86::VMOVUPSZ128rmk:   case X86::VMOVAPSZ128rmk:
1142
76
  case X86::VMOVUPSZ256rmk:   case X86::VMOVAPSZ256rmk:
1143
76
  case X86::VMOVUPSZrmk:      case X86::VMOVAPSZrmk: {
1144
76
    unsigned Opc;
1145
76
    switch (MIOpc) {
1146
76
    
    default: llvm_unreachable("Unreachable!");
1147
76
    
case X86::VMOVDQU8Z128rmk: Opc = X86::VPBLENDMBZ128rmk; break1
;
1148
76
    
case X86::VMOVDQU8Z256rmk: Opc = X86::VPBLENDMBZ256rmk; break2
;
1149
76
    
case X86::VMOVDQU8Zrmk: Opc = X86::VPBLENDMBZrmk; break1
;
1150
76
    
case X86::VMOVDQU16Z128rmk: Opc = X86::VPBLENDMWZ128rmk; break1
;
1151
76
    
case X86::VMOVDQU16Z256rmk: Opc = X86::VPBLENDMWZ256rmk; break1
;
1152
76
    
case X86::VMOVDQU16Zrmk: Opc = X86::VPBLENDMWZrmk; break1
;
1153
76
    
case X86::VMOVDQU32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break3
;
1154
76
    
case X86::VMOVDQU32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break2
;
1155
76
    
case X86::VMOVDQU32Zrmk: Opc = X86::VPBLENDMDZrmk; break2
;
1156
76
    
case X86::VMOVDQU64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break2
;
1157
76
    
case X86::VMOVDQU64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break2
;
1158
76
    
case X86::VMOVDQU64Zrmk: Opc = X86::VPBLENDMQZrmk; break16
;
1159
76
    
case X86::VMOVUPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break2
;
1160
76
    
case X86::VMOVUPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break2
;
1161
76
    
case X86::VMOVUPDZrmk: Opc = X86::VBLENDMPDZrmk; break25
;
1162
76
    
case X86::VMOVUPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break4
;
1163
76
    
case X86::VMOVUPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break0
;
1164
76
    
case X86::VMOVUPSZrmk: Opc = X86::VBLENDMPSZrmk; break5
;
1165
76
    
case X86::VMOVDQA32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break0
;
1166
76
    
case X86::VMOVDQA32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break0
;
1167
76
    
case X86::VMOVDQA32Zrmk: Opc = X86::VPBLENDMDZrmk; break0
;
1168
76
    
case X86::VMOVDQA64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break0
;
1169
76
    
case X86::VMOVDQA64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break0
;
1170
76
    
case X86::VMOVDQA64Zrmk: Opc = X86::VPBLENDMQZrmk; break0
;
1171
76
    
case X86::VMOVAPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break0
;
1172
76
    
case X86::VMOVAPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break2
;
1173
76
    
case X86::VMOVAPDZrmk: Opc = X86::VBLENDMPDZrmk; break0
;
1174
76
    
case X86::VMOVAPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break0
;
1175
76
    
case X86::VMOVAPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break2
;
1176
76
    
case X86::VMOVAPSZrmk: Opc = X86::VBLENDMPSZrmk; break0
;
1177
76
    }
1178
76
1179
76
    NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1180
76
              .add(Dest)
1181
76
              .add(MI.getOperand(2))
1182
76
              .add(Src)
1183
76
              .add(MI.getOperand(3))
1184
76
              .add(MI.getOperand(4))
1185
76
              .add(MI.getOperand(5))
1186
76
              .add(MI.getOperand(6))
1187
76
              .add(MI.getOperand(7));
1188
76
    break;
1189
76
  }
1190
455
  case X86::VMOVDQU8Z128rrk:
1191
455
  case X86::VMOVDQU8Z256rrk:
1192
455
  case X86::VMOVDQU8Zrrk:
1193
455
  case X86::VMOVDQU16Z128rrk:
1194
455
  case X86::VMOVDQU16Z256rrk:
1195
455
  case X86::VMOVDQU16Zrrk:
1196
455
  case X86::VMOVDQU32Z128rrk: case X86::VMOVDQA32Z128rrk:
1197
455
  case X86::VMOVDQU32Z256rrk: case X86::VMOVDQA32Z256rrk:
1198
455
  case X86::VMOVDQU32Zrrk:    case X86::VMOVDQA32Zrrk:
1199
455
  case X86::VMOVDQU64Z128rrk: case X86::VMOVDQA64Z128rrk:
1200
455
  case X86::VMOVDQU64Z256rrk: case X86::VMOVDQA64Z256rrk:
1201
455
  case X86::VMOVDQU64Zrrk:    case X86::VMOVDQA64Zrrk:
1202
455
  case X86::VMOVUPDZ128rrk:   case X86::VMOVAPDZ128rrk:
1203
455
  case X86::VMOVUPDZ256rrk:   case X86::VMOVAPDZ256rrk:
1204
455
  case X86::VMOVUPDZrrk:      case X86::VMOVAPDZrrk:
1205
455
  case X86::VMOVUPSZ128rrk:   case X86::VMOVAPSZ128rrk:
1206
455
  case X86::VMOVUPSZ256rrk:   case X86::VMOVAPSZ256rrk:
1207
455
  case X86::VMOVUPSZrrk:      case X86::VMOVAPSZrrk: {
1208
455
    unsigned Opc;
1209
455
    switch (MIOpc) {
1210
455
    
    default: llvm_unreachable("Unreachable!");
1211
455
    
case X86::VMOVDQU8Z128rrk: Opc = X86::VPBLENDMBZ128rrk; break10
;
1212
455
    
case X86::VMOVDQU8Z256rrk: Opc = X86::VPBLENDMBZ256rrk; break13
;
1213
455
    
case X86::VMOVDQU8Zrrk: Opc = X86::VPBLENDMBZrrk; break32
;
1214
455
    
case X86::VMOVDQU16Z128rrk: Opc = X86::VPBLENDMWZ128rrk; break18
;
1215
455
    
case X86::VMOVDQU16Z256rrk: Opc = X86::VPBLENDMWZ256rrk; break13
;
1216
455
    
case X86::VMOVDQU16Zrrk: Opc = X86::VPBLENDMWZrrk; break13
;
1217
455
    
case X86::VMOVDQU32Z128rrk: Opc = X86::VPBLENDMDZ128rrk; break0
;
1218
455
    
case X86::VMOVDQU32Z256rrk: Opc = X86::VPBLENDMDZ256rrk; break0
;
1219
455
    
case X86::VMOVDQU32Zrrk: Opc = X86::VPBLENDMDZrrk; break0
;
1220
455
    
case X86::VMOVDQU64Z128rrk: Opc = X86::VPBLENDMQZ128rrk; break0
;
1221
455
    
case X86::VMOVDQU64Z256rrk: Opc = X86::VPBLENDMQZ256rrk; break0
;
1222
455
    
case X86::VMOVDQU64Zrrk: Opc = X86::VPBLENDMQZrrk; break0
;
1223
455
    
case X86::VMOVUPDZ128rrk: Opc = X86::VBLENDMPDZ128rrk; break0
;
1224
455
    
case X86::VMOVUPDZ256rrk: Opc = X86::VBLENDMPDZ256rrk; break0
;
1225
455
    
case X86::VMOVUPDZrrk: Opc = X86::VBLENDMPDZrrk; break0
;
1226
455
    
case X86::VMOVUPSZ128rrk: Opc = X86::VBLENDMPSZ128rrk; break0
;
1227
455
    
case X86::VMOVUPSZ256rrk: Opc = X86::VBLENDMPSZ256rrk; break0
;
1228
455
    
case X86::VMOVUPSZrrk: Opc = X86::VBLENDMPSZrrk; break0
;
1229
455
    
case X86::VMOVDQA32Z128rrk: Opc = X86::VPBLENDMDZ128rrk; break32
;
1230
455
    
case X86::VMOVDQA32Z256rrk: Opc = X86::VPBLENDMDZ256rrk; break32
;
1231
455
    
case X86::VMOVDQA32Zrrk: Opc = X86::VPBLENDMDZrrk; break83
;
1232
455
    
case X86::VMOVDQA64Z128rrk: Opc = X86::VPBLENDMQZ128rrk; break16
;
1233
455
    
case X86::VMOVDQA64Z256rrk: Opc = X86::VPBLENDMQZ256rrk; break18
;
1234
455
    
case X86::VMOVDQA64Zrrk: Opc = X86::VPBLENDMQZrrk; break42
;
1235
455
    
case X86::VMOVAPDZ128rrk: Opc = X86::VBLENDMPDZ128rrk; break10
;
1236
455
    
case X86::VMOVAPDZ256rrk: Opc = X86::VBLENDMPDZ256rrk; break16
;
1237
455
    
case X86::VMOVAPDZrrk: Opc = X86::VBLENDMPDZrrk; break46
;
1238
455
    
case X86::VMOVAPSZ128rrk: Opc = X86::VBLENDMPSZ128rrk; break15
;
1239
455
    
case X86::VMOVAPSZ256rrk: Opc = X86::VBLENDMPSZ256rrk; break11
;
1240
455
    
case X86::VMOVAPSZrrk: Opc = X86::VBLENDMPSZrrk; break35
;
1241
455
    }
1242
455
1243
455
    NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc))
1244
455
              .add(Dest)
1245
455
              .add(MI.getOperand(2))
1246
455
              .add(Src)
1247
455
              .add(MI.getOperand(3));
1248
455
    break;
1249
455
  }
1250
23.8k
  }
1251
23.8k
1252
23.8k
  if (!NewMI) return nullptr;
1253
23.8k
1254
23.8k
  if (LV) {  // Update live variables
1255
23.8k
    if (Src.isKill())
1256
2.24k
      LV->replaceKillInstruction(Src.getReg(), MI, *NewMI);
1257
23.8k
    if (Dest.isDead())
1258
0
      LV->replaceKillInstruction(Dest.getReg(), MI, *NewMI);
1259
23.8k
  }
1260
23.8k
1261
23.8k
  MFI->insert(MI.getIterator(), NewMI); // Insert the new inst
1262
23.8k
  return NewMI;
1263
23.8k
}
1264
1265
/// This determines which of three possible cases of a three source commute
1266
/// the source indexes correspond to taking into account any mask operands.
1267
/// All prevents commuting a passthru operand. Returns -1 if the commute isn't
1268
/// possible.
1269
/// Case 0 - Possible to commute the first and second operands.
1270
/// Case 1 - Possible to commute the first and third operands.
1271
/// Case 2 - Possible to commute the second and third operands.
1272
static unsigned getThreeSrcCommuteCase(uint64_t TSFlags, unsigned SrcOpIdx1,
1273
7.30k
                                       unsigned SrcOpIdx2) {
1274
7.30k
  // Put the lowest index to SrcOpIdx1 to simplify the checks below.
1275
7.30k
  if (SrcOpIdx1 > SrcOpIdx2)
1276
180
    std::swap(SrcOpIdx1, SrcOpIdx2);
1277
7.30k
1278
7.30k
  unsigned Op1 = 1, Op2 = 2, Op3 = 3;
1279
7.30k
  if (X86II::isKMasked(TSFlags)) {
1280
2.82k
    Op2++;
1281
2.82k
    Op3++;
1282
2.82k
  }
1283
7.30k
1284
7.30k
  if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op2)
1285
1.87k
    return 0;
1286
5.42k
  if (SrcOpIdx1 == Op1 && SrcOpIdx2 == Op3)
1287
196
    return 1;
1288
5.22k
  if (SrcOpIdx1 == Op2 && SrcOpIdx2 == Op3)
1289
5.22k
    return 2;
1290
0
  llvm_unreachable("Unknown three src commute case.");
1291
0
}
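A short illustration of the index shift for masked forms: an unmasked three-source instruction carries its sources at operands 1, 2 and 3, while a k-masked one inserts the mask operand after the tied source, so the second and third sources move to operands 3 and 4; asking to commute operands 3 and 4 of a masked FMA therefore classifies as Case 2, exactly as commuting operands 2 and 3 would for the unmasked form.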
unsigned X86InstrInfo::getFMA3OpcodeToCommuteOperands(
    const MachineInstr &MI, unsigned SrcOpIdx1, unsigned SrcOpIdx2,
    const X86InstrFMA3Group &FMA3Group) const {

  unsigned Opc = MI.getOpcode();

  // TODO: Commuting the 1st operand of FMA*_Int requires some additional
  // analysis. The commute optimization is legal only if all users of FMA*_Int
  // use only the lowest element of the FMA*_Int instruction. Such analysis are
  // not implemented yet. So, just return 0 in that case.
  // When such analysis are available this place will be the right place for
  // calling it.
  assert(!(FMA3Group.isIntrinsic() && (SrcOpIdx1 == 1 || SrcOpIdx2 == 1)) &&
         "Intrinsic instructions can't commute operand 1");

  // Determine which case this commute is or if it can't be done.
  unsigned Case = getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1,
                                         SrcOpIdx2);
  assert(Case < 3 && "Unexpected case number!");

  // Define the FMA forms mapping array that helps to map input FMA form
  // to output FMA form to preserve the operation semantics after
  // commuting the operands.
  const unsigned Form132Index = 0;
  const unsigned Form213Index = 1;
  const unsigned Form231Index = 2;
  static const unsigned FormMapping[][3] = {
    // 0: SrcOpIdx1 == 1 && SrcOpIdx2 == 2;
    // FMA132 A, C, b; ==> FMA231 C, A, b;
    // FMA213 B, A, c; ==> FMA213 A, B, c;
    // FMA231 C, A, b; ==> FMA132 A, C, b;
    { Form231Index, Form213Index, Form132Index },
    // 1: SrcOpIdx1 == 1 && SrcOpIdx2 == 3;
    // FMA132 A, c, B; ==> FMA132 B, c, A;
    // FMA213 B, a, C; ==> FMA231 C, a, B;
    // FMA231 C, a, B; ==> FMA213 B, a, C;
    { Form132Index, Form231Index, Form213Index },
    // 2: SrcOpIdx1 == 2 && SrcOpIdx2 == 3;
    // FMA132 a, C, B; ==> FMA213 a, B, C;
    // FMA213 b, A, C; ==> FMA132 b, C, A;
    // FMA231 c, A, B; ==> FMA231 c, B, A;
    { Form213Index, Form132Index, Form231Index }
  };

  unsigned FMAForms[3];
  FMAForms[0] = FMA3Group.get132Opcode();
  FMAForms[1] = FMA3Group.get213Opcode();
  FMAForms[2] = FMA3Group.get231Opcode();
  unsigned FormIndex;
  for (FormIndex = 0; FormIndex < 3; FormIndex++)
    if (Opc == FMAForms[FormIndex])
      break;

  // Everything is ready, just adjust the FMA opcode and return it.
  FormIndex = FormMapping[Case][FormIndex];
  return FMAForms[FormIndex];
}
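The FormMapping table can be sanity-checked numerically: each FMA form is only a different assignment of the two multiplicands and the addend to the three operand slots, so after exchanging two operand values the remapped form must compute the same product-and-sum. A small self-contained check over plain doubles (illustration only; it models the arithmetic of the forms, not the LLVM opcodes):

#include <cassert>

// Arithmetic meaning of the three FMA forms over an operand list (x, y, z).
static double fma132(double x, double y, double z) { return x * z + y; }
static double fma213(double x, double y, double z) { return y * x + z; }
static double fma231(double x, double y, double z) { return y * z + x; }

int main() {
  double A = 1.5, B = -2.25, C = 4.0;

  // Case 0 (swap operands 1 and 2): FMA132 A, C, B  ==>  FMA231 C, A, B.
  assert(fma132(A, C, B) == fma231(C, A, B));
  // Case 1 (swap operands 1 and 3): FMA213 B, A, C  ==>  FMA231 C, A, B.
  assert(fma213(B, A, C) == fma231(C, A, B));
  // Case 2 (swap operands 2 and 3): FMA213 B, A, C  ==>  FMA132 B, C, A.
  assert(fma213(B, A, C) == fma132(B, C, A));
  return 0;
}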
static void commuteVPTERNLOG(MachineInstr &MI, unsigned SrcOpIdx1,
                             unsigned SrcOpIdx2) {
  // Determine which case this commute is or if it can't be done.
  unsigned Case = getThreeSrcCommuteCase(MI.getDesc().TSFlags, SrcOpIdx1,
                                         SrcOpIdx2);
  assert(Case < 3 && "Unexpected case value!");

  // For each case we need to swap two pairs of bits in the final immediate.
  static const uint8_t SwapMasks[3][4] = {
    { 0x04, 0x10, 0x08, 0x20 }, // Swap bits 2/4 and 3/5.
    { 0x02, 0x10, 0x08, 0x40 }, // Swap bits 1/4 and 3/6.
    { 0x02, 0x04, 0x20, 0x40 }, // Swap bits 1/2 and 5/6.
  };

  uint8_t Imm = MI.getOperand(MI.getNumOperands()-1).getImm();
  // Clear out the bits we are swapping.
  uint8_t NewImm = Imm & ~(SwapMasks[Case][0] | SwapMasks[Case][1] |
                           SwapMasks[Case][2] | SwapMasks[Case][3]);
  // If the immediate had a bit of the pair set, then set the opposite bit.
  if (Imm & SwapMasks[Case][0]) NewImm |= SwapMasks[Case][1];
  if (Imm & SwapMasks[Case][1]) NewImm |= SwapMasks[Case][0];
  if (Imm & SwapMasks[Case][2]) NewImm |= SwapMasks[Case][3];
  if (Imm & SwapMasks[Case][3]) NewImm |= SwapMasks[Case][2];
  MI.getOperand(MI.getNumOperands()-1).setImm(NewImm);
}
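A concrete instance (illustration only, not LLVM code): the familiar select immediate 0xCA encodes dst = a ? b : c. Commuting source operands 2 and 3 swaps the roles of b and c, which is exactly the case-2 bit-pair swap above and yields 0xAC, i.e. dst = a ? c : b:

#include <cassert>
#include <cstdint>

// Swap the truth-table bits of a VPTERNLOG immediate for "case 2"
// (commuting source operands 2 and 3), mirroring SwapMasks[2] above.
static uint8_t swapTernlogImmCase2(uint8_t Imm) {
  const uint8_t M[4] = {0x02, 0x04, 0x20, 0x40}; // bit pairs 1/2 and 5/6
  uint8_t New = Imm & ~(M[0] | M[1] | M[2] | M[3]);
  if (Imm & M[0]) New |= M[1];
  if (Imm & M[1]) New |= M[0];
  if (Imm & M[2]) New |= M[3];
  if (Imm & M[3]) New |= M[2];
  return New;
}

int main() {
  // 0xCA is "a ? b : c"; with b and c swapped it becomes "a ? c : b" = 0xAC.
  assert(swapTernlogImmCase2(0xCA) == 0xAC);
  // The swap is an involution.
  assert(swapTernlogImmCase2(0xAC) == 0xCA);
  return 0;
}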
1376
1377
// Returns true if this is a VPERMI2 or VPERMT2 instruction that can be
1378
// commuted.
1379
405k
static bool isCommutableVPERMV3Instruction(unsigned Opcode) {
1380
405k
#define VPERM_CASES(Suffix) \
1381
405k
  
case X86::VPERMI2##Suffix##128rr: 22.4k
case X86::VPERMT2##Suffix##128rr: \
1382
22.4k
  case X86::VPERMI2##Suffix##256rr:    case X86::VPERMT2##Suffix##256rr:    \
1383
22.4k
  case X86::VPERMI2##Suffix##rr:       case X86::VPERMT2##Suffix##rr:       \
1384
22.4k
  case X86::VPERMI2##Suffix##128rm:    case X86::VPERMT2##Suffix##128rm:    \
1385
22.4k
  case X86::VPERMI2##Suffix##256rm:    case X86::VPERMT2##Suffix##256rm:    \
1386
22.4k
  case X86::VPERMI2##Suffix##rm:       case X86::VPERMT2##Suffix##rm:       \
1387
22.4k
  case X86::VPERMI2##Suffix##128rrkz:  case X86::VPERMT2##Suffix##128rrkz:  \
1388
22.4k
  case X86::VPERMI2##Suffix##256rrkz:  case X86::VPERMT2##Suffix##256rrkz:  \
1389
22.4k
  case X86::VPERMI2##Suffix##rrkz:     case X86::VPERMT2##Suffix##rrkz:     \
1390
22.4k
  case X86::VPERMI2##Suffix##128rmkz:  case X86::VPERMT2##Suffix##128rmkz:  \
1391
22.4k
  case X86::VPERMI2##Suffix##256rmkz:  case X86::VPERMT2##Suffix##256rmkz:  \
1392
22.4k
  case X86::VPERMI2##Suffix##rmkz:     case X86::VPERMT2##Suffix##rmkz:
1393
405k
1394
405k
#define VPERM_CASES_BROADCAST(Suffix) \
1395
405k
  
VPERM_CASES14.9k
(Suffix) \
1396
14.9k
  case X86::VPERMI2##Suffix##128rmb:   case X86::VPERMT2##Suffix##128rmb:   \
1397
14.9k
  case X86::VPERMI2##Suffix##256rmb:   case X86::VPERMT2##Suffix##256rmb:   \
1398
14.9k
  case X86::VPERMI2##Suffix##rmb:      case X86::VPERMT2##Suffix##rmb:      \
1399
14.9k
  case X86::VPERMI2##Suffix##128rmbkz: case X86::VPERMT2##Suffix##128rmbkz: \
1400
14.9k
  case X86::VPERMI2##Suffix##256rmbkz: case X86::VPERMT2##Suffix##256rmbkz: \
1401
14.9k
  case X86::VPERMI2##Suffix##rmbkz:    case X86::VPERMT2##Suffix##rmbkz:
1402
405k
1403
405k
  switch (Opcode) {
1404
405k
  
default: return false401k
;
1405
405k
  
VPERM_CASES3.74k
(B)
1406
134k
  
VPERM_CASES_BROADCAST3.74k
(D)
1407
134k
  
VPERM_CASES_BROADCAST3.74k
(PD)
1408
134k
  
VPERM_CASES_BROADCAST3.74k
(PS)
1409
134k
  
VPERM_CASES_BROADCAST3.74k
(Q)
1410
134k
  
VPERM_CASES3.74k
(W)
1411
89.9k
    return true;
1412
405k
  }
1413
405k
#undef VPERM_CASES_BROADCAST
1414
405k
#undef VPERM_CASES
1415
405k
}
1416
1417
// Returns commuted opcode for VPERMI2 and VPERMT2 instructions by switching
1418
// from the I opcode to the T opcode and vice versa.
1419
3.74k
static unsigned getCommutedVPERMV3Opcode(unsigned Opcode) {
1420
3.74k
#define VPERM_CASES(Orig, New) \
1421
3.74k
  
case X86::Orig##128rr: return X86::New449
##128rr; \
1422
95
  
case X86::Orig##128rrkz: return X86::New10
##128rrkz; \
1423
95
  
case X86::Orig##128rm: return X86::New57
##128rm; \
1424
107
  
case X86::Orig##128rmkz: return X86::New9
##128rmkz; \
1425
264
  case X86::Orig##256rr:    return X86::New##256rr;   \
1426
1.06k
  
case X86::Orig##256rrkz: return X86::New34
##256rrkz; \
1427
114
  
case X86::Orig##256rm: return X86::New59
##256rm; \
1428
261
  
case X86::Orig##256rmkz: return X86::New16
##256rmkz; \
1429
134
  
case X86::Orig##rr: return X86::New25
##rr; \
1430
794
  
case X86::Orig##rrkz: return X86::New18
##rrkz; \
1431
184
  
case X86::Orig##rm: return X86::New0
##rm; \
1432
8
  
case X86::Orig##rmkz: return X86::New0
##rmkz;
1433
3.74k
1434
3.74k
#define VPERM_CASES_BROADCAST(Orig, New) \
1435
3.74k
  
VPERM_CASES188
(Orig, New) \
1436
1
  
case X86::Orig##128rmb: return X86::New0
##128rmb; \
1437
1
  case X86::Orig##128rmbkz: return X86::New##128rmbkz; \
1438
1
  
case X86::Orig##256rmb: return X86::New0
##256rmb; \
1439
0
  case X86::Orig##256rmbkz: return X86::New##256rmbkz; \
1440
0
  case X86::Orig##rmb:      return X86::New##rmb;      \
1441
1
  case X86::Orig##rmbkz:    return X86::New##rmbkz;
1442
3.74k
1443
3.74k
  switch (Opcode) {
1444
3.74k
  
VPERM_CASES46
(VPERMI2B, VPERMT2B)
1445
3.74k
  
VPERM_CASES_BROADCAST0
(VPERMI2D, VPERMT2D)
1446
3.74k
  
VPERM_CASES_BROADCAST0
(VPERMI2PD, VPERMT2PD)
1447
3.74k
  
VPERM_CASES_BROADCAST0
(VPERMI2PS, VPERMT2PS)
1448
3.74k
  
VPERM_CASES_BROADCAST0
(VPERMI2Q, VPERMT2Q)
1449
3.74k
  
VPERM_CASES77
(VPERMI2W, VPERMT2W)
1450
3.74k
  
VPERM_CASES53
(VPERMT2B, VPERMI2B)
1451
3.74k
  
VPERM_CASES_BROADCAST0
(VPERMT2D, VPERMI2D)
1452
3.74k
  
VPERM_CASES_BROADCAST0
(VPERMT2PD, VPERMI2PD)
1453
3.74k
  
VPERM_CASES_BROADCAST1
(VPERMT2PS, VPERMI2PS)
1454
3.74k
  
VPERM_CASES_BROADCAST0
(VPERMT2Q, VPERMI2Q)
1455
3.74k
  
VPERM_CASES85
(VPERMT2W, VPERMI2W)
1456
3.74k
  }
1457
3.74k
1458
3.74k
  
llvm_unreachable0
("Unreachable!");
1459
3.74k
#undef VPERM_CASES_BROADCAST
1460
3.74k
#undef VPERM_CASES
1461
3.74k
}
1462
1463
MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
1464
                                                   unsigned OpIdx1,
1465
445k
                                                   unsigned OpIdx2) const {
1466
445k
  auto cloneIfNew = [NewMI](MachineInstr &MI) -> MachineInstr & {
1467
49.4k
    if (NewMI)
1468
0
      return *MI.getParent()->getParent()->CloneMachineInstr(&MI);
1469
49.4k
    return MI;
1470
49.4k
  };
1471
445k
1472
445k
  switch (MI.getOpcode()) {
1473
445k
  case X86::SHRD16rri8: // A = SHRD16rri8 B, C, I -> A = SHLD16rri8 C, B, (16-I)
1474
1.37k
  case X86::SHLD16rri8: // A = SHLD16rri8 B, C, I -> A = SHRD16rri8 C, B, (16-I)
1475
1.37k
  case X86::SHRD32rri8: // A = SHRD32rri8 B, C, I -> A = SHLD32rri8 C, B, (32-I)
1476
1.37k
  case X86::SHLD32rri8: // A = SHLD32rri8 B, C, I -> A = SHRD32rri8 C, B, (32-I)
1477
1.37k
  case X86::SHRD64rri8: // A = SHRD64rri8 B, C, I -> A = SHLD64rri8 C, B, (64-I)
1478
1.37k
  case X86::SHLD64rri8:{// A = SHLD64rri8 B, C, I -> A = SHRD64rri8 C, B, (64-I)
1479
1.37k
    unsigned Opc;
1480
1.37k
    unsigned Size;
1481
1.37k
    switch (MI.getOpcode()) {
1482
1.37k
    
default: 0
llvm_unreachable0
("Unreachable!");
1483
1.37k
    
case X86::SHRD16rri8: Size = 16; Opc = X86::SHLD16rri8; break16
;
1484
1.37k
    
case X86::SHLD16rri8: Size = 16; Opc = X86::SHRD16rri8; break16
;
1485
1.37k
    
case X86::SHRD32rri8: Size = 32; Opc = X86::SHLD32rri8; break480
;
1486
1.37k
    
case X86::SHLD32rri8: Size = 32; Opc = X86::SHRD32rri8; break530
;
1487
1.37k
    
case X86::SHRD64rri8: Size = 64; Opc = X86::SHLD64rri8; break150
;
1488
1.37k
    
case X86::SHLD64rri8: Size = 64; Opc = X86::SHRD64rri8; break183
;
1489
1.37k
    }
1490
1.37k
    unsigned Amt = MI.getOperand(3).getImm();
1491
1.37k
    auto &WorkingMI = cloneIfNew(MI);
1492
1.37k
    WorkingMI.setDesc(get(Opc));
1493
1.37k
    WorkingMI.getOperand(3).setImm(Size - Amt);
1494
1.37k
    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
1495
1.37k
                                                   OpIdx1, OpIdx2);
1496
1.37k
  }
1497
1.37k
  case X86::PFSUBrr:
1498
24
  case X86::PFSUBRrr: {
1499
24
    // PFSUB  x, y: x = x - y
1500
24
    // PFSUBR x, y: x = y - x
1501
24
    unsigned Opc =
1502
24
        (X86::PFSUBRrr == MI.getOpcode() ? 
X86::PFSUBrr12
:
X86::PFSUBRrr12
);
1503
24
    auto &WorkingMI = cloneIfNew(MI);
1504
24
    WorkingMI.setDesc(get(Opc));
1505
24
    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
1506
24
                                                   OpIdx1, OpIdx2);
1507
24
  }
1508
1.19k
  case X86::BLENDPDrri:
1509
1.19k
  case X86::BLENDPSrri:
1510
1.19k
  case X86::VBLENDPDrri:
1511
1.19k
  case X86::VBLENDPSrri:
1512
1.19k
    // If we're optimizing for size, try to use MOVSD/MOVSS.
1513
1.19k
    if (MI.getParent()->getParent()->getFunction().hasOptSize()) {
1514
16
      unsigned Mask, Opc;
1515
16
      switch (MI.getOpcode()) {
1516
16
      
default: 0
llvm_unreachable0
("Unreachable!");
1517
16
      
case X86::BLENDPDrri: Opc = X86::MOVSDrr; Mask = 0x03; break4
;
1518
16
      
case X86::BLENDPSrri: Opc = X86::MOVSSrr; Mask = 0x0F; break2
;
1519
16
      
case X86::VBLENDPDrri: Opc = X86::VMOVSDrr; Mask = 0x03; break2
;
1520
16
      
case X86::VBLENDPSrri: Opc = X86::VMOVSSrr; Mask = 0x0F; break8
;
1521
16
      }
1522
16
      if ((MI.getOperand(3).getImm() ^ Mask) == 1) {
1523
16
        auto &WorkingMI = cloneIfNew(MI);
1524
16
        WorkingMI.setDesc(get(Opc));
1525
16
        WorkingMI.RemoveOperand(3);
1526
16
        return TargetInstrInfo::commuteInstructionImpl(WorkingMI,
1527
16
                                                       /*NewMI=*/false,
1528
16
                                                       OpIdx1, OpIdx2);
1529
16
      }
1530
1.18k
    }
1531
1.18k
    LLVM_FALLTHROUGH;
1532
10.4k
  case X86::PBLENDWrri:
1533
10.4k
  case X86::VBLENDPDYrri:
1534
10.4k
  case X86::VBLENDPSYrri:
1535
10.4k
  case X86::VPBLENDDrri:
1536
10.4k
  case X86::VPBLENDWrri:
1537
10.4k
  case X86::VPBLENDDYrri:
1538
10.4k
  case X86::VPBLENDWYrri:{
1539
10.4k
    int8_t Mask;
1540
10.4k
    switch (MI.getOpcode()) {
1541
10.4k
    
default: 0
llvm_unreachable0
("Unreachable!");
1542
10.4k
    
case X86::BLENDPDrri: Mask = (int8_t)0x03; break147
;
1543
10.4k
    
case X86::BLENDPSrri: Mask = (int8_t)0x0F; break269
;
1544
10.4k
    
case X86::PBLENDWrri: Mask = (int8_t)0xFF; break2.44k
;
1545
10.4k
    
case X86::VBLENDPDrri: Mask = (int8_t)0x03; break267
;
1546
10.4k
    
case X86::VBLENDPSrri: Mask = (int8_t)0x0F; break498
;
1547
10.4k
    
case X86::VBLENDPDYrri: Mask = (int8_t)0x0F; break362
;
1548
10.4k
    
case X86::VBLENDPSYrri: Mask = (int8_t)0xFF; break668
;
1549
10.4k
    
case X86::VPBLENDDrri: Mask = (int8_t)0x0F; break1.40k
;
1550
10.4k
    
case X86::VPBLENDWrri: Mask = (int8_t)0xFF; break3.08k
;
1551
10.4k
    
case X86::VPBLENDDYrri: Mask = (int8_t)0xFF; break736
;
1552
10.4k
    
case X86::VPBLENDWYrri: Mask = (int8_t)0xFF; break518
;
1553
10.4k
    }
1554
10.4k
    // Only the least significant bits of Imm are used.
1555
10.4k
    // Using int8_t to ensure it will be sign extended to the int64_t that
1556
10.4k
    // setImm takes in order to match isel behavior.
1557
10.4k
    int8_t Imm = MI.getOperand(3).getImm() & Mask;
1558
10.4k
    auto &WorkingMI = cloneIfNew(MI);
1559
10.4k
    WorkingMI.getOperand(3).setImm(Mask ^ Imm);
1560
10.4k
    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
1561
10.4k
                                                   OpIdx1, OpIdx2);
1562
10.4k
  }
1563
10.4k
  case X86::INSERTPSrr:
1564
1.64k
  case X86::VINSERTPSrr:
1565
1.64k
  case X86::VINSERTPSZrr: {
1566
1.64k
    unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm();
1567
1.64k
    unsigned ZMask = Imm & 15;
1568
1.64k
    unsigned DstIdx = (Imm >> 4) & 3;
1569
1.64k
    unsigned SrcIdx = (Imm >> 6) & 3;
1570
1.64k
1571
1.64k
    // We can commute insertps if we zero 2 of the elements, the insertion is
1572
1.64k
    // "inline" and we don't override the insertion with a zero.
1573
1.64k
    if (DstIdx == SrcIdx && 
(ZMask & (1 << DstIdx)) == 075
&&
1574
1.64k
        
countPopulation(ZMask) == 275
) {
1575
60
      unsigned AltIdx = findFirstSet((ZMask | (1 << DstIdx)) ^ 15);
1576
60
      assert(AltIdx < 4 && "Illegal insertion index");
1577
60
      unsigned AltImm = (AltIdx << 6) | (AltIdx << 4) | ZMask;
1578
60
      auto &WorkingMI = cloneIfNew(MI);
1579
60
      WorkingMI.getOperand(MI.getNumOperands() - 1).setImm(AltImm);
1580
60
      return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
1581
60
                                                     OpIdx1, OpIdx2);
1582
60
    }
1583
1.58k
    return nullptr;
1584
1.58k
  }
1585
1.58k
  case X86::MOVSDrr:
1586
707
  case X86::MOVSSrr:
1587
707
  case X86::VMOVSDrr:
1588
707
  case X86::VMOVSSrr:{
1589
707
    // On SSE41 or later we can commute a MOVSS/MOVSD to a BLENDPS/BLENDPD.
1590
707
    if (Subtarget.hasSSE41()) {
1591
16
      unsigned Mask, Opc;
1592
16
      switch (MI.getOpcode()) {
1593
16
      
default: 0
llvm_unreachable0
("Unreachable!");
1594
16
      
case X86::MOVSDrr: Opc = X86::BLENDPDrri; Mask = 0x02; break4
;
1595
16
      
case X86::MOVSSrr: Opc = X86::BLENDPSrri; Mask = 0x0E; break2
;
1596
16
      
case X86::VMOVSDrr: Opc = X86::VBLENDPDrri; Mask = 0x02; break2
;
1597
16
      
case X86::VMOVSSrr: Opc = X86::VBLENDPSrri; Mask = 0x0E; break8
;
1598
16
      }
1599
16
1600
16
      auto &WorkingMI = cloneIfNew(MI);
1601
16
      WorkingMI.setDesc(get(Opc));
1602
16
      WorkingMI.addOperand(MachineOperand::CreateImm(Mask));
1603
16
      return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
1604
16
                                                     OpIdx1, OpIdx2);
1605
16
    }
1606
691
1607
691
    // Convert to SHUFPD.
1608
691
    assert(MI.getOpcode() == X86::MOVSDrr &&
1609
691
           "Can only commute MOVSDrr without SSE4.1");
1610
691
1611
691
    auto &WorkingMI = cloneIfNew(MI);
1612
691
    WorkingMI.setDesc(get(X86::SHUFPDrri));
1613
691
    WorkingMI.addOperand(MachineOperand::CreateImm(0x02));
1614
691
    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
1615
691
                                                   OpIdx1, OpIdx2);
1616
691
  }
1617
691
  case X86::SHUFPDrri: {
1618
620
    // Commute to MOVSD.
1619
620
    assert(MI.getOperand(3).getImm() == 0x02 && "Unexpected immediate!");
1620
620
    auto &WorkingMI = cloneIfNew(MI);
1621
620
    WorkingMI.setDesc(get(X86::MOVSDrr));
1622
620
    WorkingMI.RemoveOperand(3);
1623
620
    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
1624
620
                                                   OpIdx1, OpIdx2);
1625
691
  }
1626
691
  case X86::PCLMULQDQrr:
1627
62
  case X86::VPCLMULQDQrr:
1628
62
  case X86::VPCLMULQDQYrr:
1629
62
  case X86::VPCLMULQDQZrr:
1630
62
  case X86::VPCLMULQDQZ128rr:
1631
62
  case X86::VPCLMULQDQZ256rr: {
1632
62
    // SRC1 64bits = Imm[0] ? SRC1[127:64] : SRC1[63:0]
1633
62
    // SRC2 64bits = Imm[4] ? SRC2[127:64] : SRC2[63:0]
1634
62
    unsigned Imm = MI.getOperand(3).getImm();
1635
62
    unsigned Src1Hi = Imm & 0x01;
1636
62
    unsigned Src2Hi = Imm & 0x10;
1637
62
    auto &WorkingMI = cloneIfNew(MI);
1638
62
    WorkingMI.getOperand(3).setImm((Src1Hi << 4) | (Src2Hi >> 4));
1639
62
    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
1640
62
                                                   OpIdx1, OpIdx2);
1641
62
  }
1642
2.15k
  case X86::VPCMPBZ128rri:  case X86::VPCMPUBZ128rri:
1643
2.15k
  case X86::VPCMPBZ256rri:  case X86::VPCMPUBZ256rri:
1644
2.15k
  case X86::VPCMPBZrri:     case X86::VPCMPUBZrri:
1645
2.15k
  case X86::VPCMPDZ128rri:  case X86::VPCMPUDZ128rri:
1646
2.15k
  case X86::VPCMPDZ256rri:  case X86::VPCMPUDZ256rri:
1647
2.15k
  case X86::VPCMPDZrri:     case X86::VPCMPUDZrri:
1648
2.15k
  case X86::VPCMPQZ128rri:  case X86::VPCMPUQZ128rri:
1649
2.15k
  case X86::VPCMPQZ256rri:  case X86::VPCMPUQZ256rri:
1650
2.15k
  case X86::VPCMPQZrri:     case X86::VPCMPUQZrri:
1651
2.15k
  case X86::VPCMPWZ128rri:  case X86::VPCMPUWZ128rri:
1652
2.15k
  case X86::VPCMPWZ256rri:  case X86::VPCMPUWZ256rri:
1653
2.15k
  case X86::VPCMPWZrri:     case X86::VPCMPUWZrri:
1654
2.15k
  case X86::VPCMPBZ128rrik: case X86::VPCMPUBZ128rrik:
1655
2.15k
  case X86::VPCMPBZ256rrik: case X86::VPCMPUBZ256rrik:
1656
2.15k
  case X86::VPCMPBZrrik:    case X86::VPCMPUBZrrik:
1657
2.15k
  case X86::VPCMPDZ128rrik: case X86::VPCMPUDZ128rrik:
1658
2.15k
  case X86::VPCMPDZ256rrik: case X86::VPCMPUDZ256rrik:
1659
2.15k
  case X86::VPCMPDZrrik:    case X86::VPCMPUDZrrik:
1660
2.15k
  case X86::VPCMPQZ128rrik: case X86::VPCMPUQZ128rrik:
1661
2.15k
  case X86::VPCMPQZ256rrik: case X86::VPCMPUQZ256rrik:
1662
2.15k
  case X86::VPCMPQZrrik:    case X86::VPCMPUQZrrik:
1663
2.15k
  case X86::VPCMPWZ128rrik: case X86::VPCMPUWZ128rrik:
1664
2.15k
  case X86::VPCMPWZ256rrik: case X86::VPCMPUWZ256rrik:
1665
2.15k
  case X86::VPCMPWZrrik:    case X86::VPCMPUWZrrik: {
1666
2.15k
    // Flip comparison mode immediate (if necessary).
1667
2.15k
    unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm() & 0x7;
1668
2.15k
    Imm = X86::getSwappedVPCMPImm(Imm);
1669
2.15k
    auto &WorkingMI = cloneIfNew(MI);
1670
2.15k
    WorkingMI.getOperand(MI.getNumOperands() - 1).setImm(Imm);
1671
2.15k
    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
1672
2.15k
                                                   OpIdx1, OpIdx2);
1673
2.15k
  }
1674
2.15k
  
case X86::VPCOMBri: 796
case X86::VPCOMUBri:
1675
796
  case X86::VPCOMDri: case X86::VPCOMUDri:
1676
796
  case X86::VPCOMQri: case X86::VPCOMUQri:
1677
796
  case X86::VPCOMWri: case X86::VPCOMUWri: {
1678
796
    // Flip comparison mode immediate (if necessary).
1679
796
    unsigned Imm = MI.getOperand(3).getImm() & 0x7;
1680
796
    Imm = X86::getSwappedVPCOMImm(Imm);
1681
796
    auto &WorkingMI = cloneIfNew(MI);
1682
796
    WorkingMI.getOperand(3).setImm(Imm);
1683
796
    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
1684
796
                                                   OpIdx1, OpIdx2);
1685
796
  }
1686
796
  case X86::VPERM2F128rr:
1687
590
  case X86::VPERM2I128rr: {
1688
590
    // Flip permute source immediate.
1689
590
    // Imm & 0x02: lo = if set, select Op1.lo/hi else Op0.lo/hi.
1690
590
    // Imm & 0x20: hi = if set, select Op1.lo/hi else Op0.lo/hi.
1691
590
    int8_t Imm = MI.getOperand(3).getImm() & 0xFF;
1692
590
    auto &WorkingMI = cloneIfNew(MI);
1693
590
    WorkingMI.getOperand(3).setImm(Imm ^ 0x22);
1694
590
    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
1695
590
                                                   OpIdx1, OpIdx2);
1696
590
  }
1697
1.97k
  case X86::MOVHLPSrr:
1698
1.97k
  case X86::UNPCKHPDrr:
1699
1.97k
  case X86::VMOVHLPSrr:
1700
1.97k
  case X86::VUNPCKHPDrr:
1701
1.97k
  case X86::VMOVHLPSZrr:
1702
1.97k
  case X86::VUNPCKHPDZ128rr: {
1703
1.97k
    assert(Subtarget.hasSSE2() && "Commuting MOVHLP/UNPCKHPD requires SSE2!");
1704
1.97k
1705
1.97k
    unsigned Opc = MI.getOpcode();
1706
1.97k
    switch (Opc) {
1707
1.97k
    
default: 0
llvm_unreachable0
("Unreachable!");
1708
1.97k
    
case X86::MOVHLPSrr: Opc = X86::UNPCKHPDrr; break795
;
1709
1.97k
    
case X86::UNPCKHPDrr: Opc = X86::MOVHLPSrr; break980
;
1710
1.97k
    
case X86::VMOVHLPSrr: Opc = X86::VUNPCKHPDrr; break71
;
1711
1.97k
    
case X86::VUNPCKHPDrr: Opc = X86::VMOVHLPSrr; break79
;
1712
1.97k
    
case X86::VMOVHLPSZrr: Opc = X86::VUNPCKHPDZ128rr; break26
;
1713
1.97k
    
case X86::VUNPCKHPDZ128rr: Opc = X86::VMOVHLPSZrr; break27
;
1714
1.97k
    }
1715
1.97k
    auto &WorkingMI = cloneIfNew(MI);
1716
1.97k
    WorkingMI.setDesc(get(Opc));
1717
1.97k
    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
1718
1.97k
                                                   OpIdx1, OpIdx2);
1719
1.97k
  }
1720
19.5k
  case X86::CMOV16rr:  case X86::CMOV32rr:  case X86::CMOV64rr: {
1721
19.5k
    auto &WorkingMI = cloneIfNew(MI);
1722
19.5k
    unsigned OpNo = MI.getDesc().getNumOperands() - 1;
1723
19.5k
    X86::CondCode CC = static_cast<X86::CondCode>(MI.getOperand(OpNo).getImm());
1724
19.5k
    WorkingMI.getOperand(OpNo).setImm(X86::GetOppositeBranchCondition(CC));
1725
19.5k
    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
1726
19.5k
                                                   OpIdx1, OpIdx2);
1727
19.5k
  }
1728
19.5k
  
case X86::VPTERNLOGDZrri: 332
case X86::VPTERNLOGDZrmi:
1729
332
  case X86::VPTERNLOGDZ128rri:   case X86::VPTERNLOGDZ128rmi:
1730
332
  case X86::VPTERNLOGDZ256rri:   case X86::VPTERNLOGDZ256rmi:
1731
332
  case X86::VPTERNLOGQZrri:      case X86::VPTERNLOGQZrmi:
1732
332
  case X86::VPTERNLOGQZ128rri:   case X86::VPTERNLOGQZ128rmi:
1733
332
  case X86::VPTERNLOGQZ256rri:   case X86::VPTERNLOGQZ256rmi:
1734
332
  case X86::VPTERNLOGDZrrik:
1735
332
  case X86::VPTERNLOGDZ128rrik:
1736
332
  case X86::VPTERNLOGDZ256rrik:
1737
332
  case X86::VPTERNLOGQZrrik:
1738
332
  case X86::VPTERNLOGQZ128rrik:
1739
332
  case X86::VPTERNLOGQZ256rrik:
1740
332
  case X86::VPTERNLOGDZrrikz:    case X86::VPTERNLOGDZrmikz:
1741
332
  case X86::VPTERNLOGDZ128rrikz: case X86::VPTERNLOGDZ128rmikz:
1742
332
  case X86::VPTERNLOGDZ256rrikz: case X86::VPTERNLOGDZ256rmikz:
1743
332
  case X86::VPTERNLOGQZrrikz:    case X86::VPTERNLOGQZrmikz:
1744
332
  case X86::VPTERNLOGQZ128rrikz: case X86::VPTERNLOGQZ128rmikz:
1745
332
  case X86::VPTERNLOGQZ256rrikz: case X86::VPTERNLOGQZ256rmikz:
1746
332
  case X86::VPTERNLOGDZ128rmbi:
1747
332
  case X86::VPTERNLOGDZ256rmbi:
1748
332
  case X86::VPTERNLOGDZrmbi:
1749
332
  case X86::VPTERNLOGQZ128rmbi:
1750
332
  case X86::VPTERNLOGQZ256rmbi:
1751
332
  case X86::VPTERNLOGQZrmbi:
1752
332
  case X86::VPTERNLOGDZ128rmbikz:
1753
332
  case X86::VPTERNLOGDZ256rmbikz:
1754
332
  case X86::VPTERNLOGDZrmbikz:
1755
332
  case X86::VPTERNLOGQZ128rmbikz:
1756
332
  case X86::VPTERNLOGQZ256rmbikz:
1757
332
  case X86::VPTERNLOGQZrmbikz: {
1758
332
    auto &WorkingMI = cloneIfNew(MI);
1759
332
    commuteVPTERNLOG(WorkingMI, OpIdx1, OpIdx2);
1760
332
    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
1761
332
                                                   OpIdx1, OpIdx2);
1762
332
  }
1763
405k
  default: {
1764
405k
    if (isCommutableVPERMV3Instruction(MI.getOpcode())) {
1765
3.74k
      unsigned Opc = getCommutedVPERMV3Opcode(MI.getOpcode());
1766
3.74k
      auto &WorkingMI = cloneIfNew(MI);
1767
3.74k
      WorkingMI.setDesc(get(Opc));
1768
3.74k
      return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
1769
3.74k
                                                     OpIdx1, OpIdx2);
1770
3.74k
    }
1771
401k
1772
401k
    const X86InstrFMA3Group *FMA3Group = getFMA3Group(MI.getOpcode(),
1773
401k
                                                      MI.getDesc().TSFlags);
1774
401k
    if (FMA3Group) {
1775
6.97k
      unsigned Opc =
1776
6.97k
        getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2, *FMA3Group);
1777
6.97k
      auto &WorkingMI = cloneIfNew(MI);
1778
6.97k
      WorkingMI.setDesc(get(Opc));
1779
6.97k
      return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
1780
6.97k
                                                     OpIdx1, OpIdx2);
1781
6.97k
    }
1782
394k
1783
394k
    return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
1784
394k
  }
1785
445k
  }
1786
445k
}
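The SHRD/SHLD rewrite at the top of the switch above relies on the identity SHLD(a, b, i) == SHRD(b, a, size - i) (and the symmetric one for SHRD) for shift amounts 0 < i < size. A minimal standalone check of that identity on 32-bit values (illustration only, not LLVM code):

#include <cassert>
#include <cstdint>

// Double-precision shifts on 32-bit values, as the x86 SHLD/SHRD
// instructions compute them for a shift amount 0 < I < 32.
static uint32_t shld32(uint32_t Dst, uint32_t Src, unsigned I) {
  return (Dst << I) | (Src >> (32 - I));
}
static uint32_t shrd32(uint32_t Dst, uint32_t Src, unsigned I) {
  return (Dst >> I) | (Src << (32 - I));
}

int main() {
  uint32_t A = 0xDEADBEEF, B = 0x01234567;
  for (unsigned I = 1; I < 32; ++I) {
    // A = SHLD32 A, B, I  <==>  A = SHRD32 B, A, (32 - I)
    assert(shld32(A, B, I) == shrd32(B, A, 32 - I));
    // A = SHRD32 A, B, I  <==>  A = SHLD32 B, A, (32 - I)
    assert(shrd32(A, B, I) == shld32(B, A, 32 - I));
  }
  return 0;
}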
1787
1788
bool
1789
X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI,
1790
                                            unsigned &SrcOpIdx1,
1791
                                            unsigned &SrcOpIdx2,
1792
16.0k
                                            bool IsIntrinsic) const {
1793
16.0k
  uint64_t TSFlags = MI.getDesc().TSFlags;
1794
16.0k
1795
16.0k
  unsigned FirstCommutableVecOp = 1;
1796
16.0k
  unsigned LastCommutableVecOp = 3;
1797
16.0k
  unsigned KMaskOp = -1U;
1798
16.0k
  if (X86II::isKMasked(TSFlags)) {
1799
6.61k
    // For k-zero-masked operations it is Ok to commute the first vector
1800
6.61k
    // operand.
1801
6.61k
    // For regular k-masked operations a conservative choice is done as the
1802
6.61k
    // elements of the first vector operand, for which the corresponding bit
1803
6.61k
    // in the k-mask operand is set to 0, are copied to the result of the
1804
6.61k
    // instruction.
1805
6.61k
    // TODO/FIXME: The commute still may be legal if it is known that the
1806
6.61k
    // k-mask operand is set to either all ones or all zeroes.
1807
6.61k
    // It is also Ok to commute the 1st operand if all users of MI use only
1808
6.61k
    // the elements enabled by the k-mask operand. For example,
1809
6.61k
    //   v4 = VFMADD213PSZrk v1, k, v2, v3; // v1[i] = k[i] ? v2[i]*v1[i]+v3[i]
1810
6.61k
    //                                                     : v1[i];
1811
6.61k
    //   VMOVAPSZmrk <mem_addr>, k, v4; // this is the ONLY user of v4 ->
1812
6.61k
    //                                  // Ok, to commute v1 in FMADD213PSZrk.
1813
6.61k
1814
6.61k
    // The k-mask operand has index = 2 for masked and zero-masked operations.
1815
6.61k
    KMaskOp = 2;
1816
6.61k
1817
6.61k
    // The operand with index = 1 is used as a source for those elements for
1818
6.61k
    // which the corresponding bit in the k-mask is set to 0.
1819
6.61k
    if (X86II::isKMergeMasked(TSFlags))
1820
4.69k
      FirstCommutableVecOp = 3;
1821
6.61k
1822
6.61k
    LastCommutableVecOp++;
1823
9.39k
  } else if (IsIntrinsic) {
1824
1.17k
    // Commuting the first operand of an intrinsic instruction isn't possible
1825
1.17k
    // unless we can prove that only the lowest element of the result is used.
1826
1.17k
    FirstCommutableVecOp = 2;
1827
1.17k
  }
1828
16.0k
1829
16.0k
  if (isMem(MI, LastCommutableVecOp))
1830
2.44k
    LastCommutableVecOp--;
1831
16.0k
1832
16.0k
  // Only the first RegOpsNum operands are commutable.
1833
16.0k
  // Also, the value 'CommuteAnyOperandIndex' is valid here as it means
1834
16.0k
  // that the operand is not specified/fixed.
1835
16.0k
  if (SrcOpIdx1 != CommuteAnyOperandIndex &&
1836
16.0k
      
(9.98k
SrcOpIdx1 < FirstCommutableVecOp9.98k
||
SrcOpIdx1 > LastCommutableVecOp6.33k
||
1837
9.98k
       
SrcOpIdx1 == KMaskOp6.31k
))
1838
3.67k
    return false;
1839
12.3k
  if (SrcOpIdx2 != CommuteAnyOperandIndex &&
1840
12.3k
      
(6.12k
SrcOpIdx2 < FirstCommutableVecOp6.12k
||
SrcOpIdx2 > LastCommutableVecOp6.12k
||
1841
6.12k
       
SrcOpIdx2 == KMaskOp4.94k
))
1842
1.56k
    return false;
1843
10.7k
1844
10.7k
  // Look for two different register operands assumed to be commutable
1845
10.7k
  // regardless of the FMA opcode. The FMA opcode is adjusted later.
1846
10.7k
  if (SrcOpIdx1 == CommuteAnyOperandIndex ||
1847
10.7k
      
SrcOpIdx2 == CommuteAnyOperandIndex4.74k
) {
1848
6.20k
    unsigned CommutableOpIdx2 = SrcOpIdx2;
1849
6.20k
1850
6.20k
    // At least one of operands to be commuted is not specified and
1851
6.20k
    // this method is free to choose appropriate commutable operands.
1852
6.20k
    if (SrcOpIdx1 == SrcOpIdx2)
1853
6.02k
      // Both of operands are not fixed. By default set one of commutable
1854
6.02k
      // operands to the last register operand of the instruction.
1855
6.02k
      CommutableOpIdx2 = LastCommutableVecOp;
1856
183
    else if (SrcOpIdx2 == CommuteAnyOperandIndex)
1857
183
      // Only one of operands is not fixed.
1858
183
      CommutableOpIdx2 = SrcOpIdx1;
1859
6.20k
1860
6.20k
    // CommutableOpIdx2 is well defined now. Let's choose another commutable
1861
6.20k
    // operand and assign its index to CommutableOpIdx1.
1862
6.20k
    unsigned Op2Reg = MI.getOperand(CommutableOpIdx2).getReg();
1863
6.20k
1864
6.20k
    unsigned CommutableOpIdx1;
1865
6.20k
    for (CommutableOpIdx1 = LastCommutableVecOp;
1866
13.4k
         CommutableOpIdx1 >= FirstCommutableVecOp; 
CommutableOpIdx1--7.28k
) {
1867
12.9k
      // Just ignore and skip the k-mask operand.
1868
12.9k
      if (CommutableOpIdx1 == KMaskOp)
1869
0
        continue;
1870
12.9k
1871
12.9k
      // The commuted operands must have different registers.
1872
12.9k
      // Otherwise, the commute transformation does not change anything and
1873
12.9k
      // is useless then.
1874
12.9k
      if (Op2Reg != MI.getOperand(CommutableOpIdx1).getReg())
1875
5.67k
        break;
1876
12.9k
    }
1877
6.20k
1878
6.20k
    // No appropriate commutable operands were found.
1879
6.20k
    if (CommutableOpIdx1 < FirstCommutableVecOp)
1880
531
      return false;
1881
5.67k
1882
5.67k
    // Assign the found pair of commutable indices to SrcOpIdx1 and SrcOpidx2
1883
5.67k
    // to return those values.
1884
5.67k
    if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2,
1885
5.67k
                              CommutableOpIdx1, CommutableOpIdx2))
1886
0
      return false;
1887
10.2k
  }
1888
10.2k
1889
10.2k
  return true;
1890
10.2k
}
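To make the index bookkeeping above concrete: for an AVX-512 merge-masked three-source operation the operand list is (dst, src1/passthru tied to dst, k-mask, src2, src3), so only indices 3 and 4 are freely commutable, while a zero-masked form has no live passthru and index 1 is commutable as well. A minimal sketch of that range computation (the two booleans are hypothetical stand-ins for the X86II::isKMasked/isKMergeMasked queries):

#include <cassert>
#include <utility>

// Returns {FirstCommutableVecOp, LastCommutableVecOp} for a three-source
// operation; IsKMasked/IsKMergeMasked stand in for the TSFlags queries.
static std::pair<unsigned, unsigned> commutableRange(bool IsKMasked,
                                                     bool IsKMergeMasked) {
  unsigned First = 1, Last = 3;
  if (IsKMasked) {
    if (IsKMergeMasked)
      First = 3; // Operand 1 is the passthru, operand 2 the k-mask.
    ++Last;      // All vector sources shift right past the k-mask.
  }
  return {First, Last};
}

int main() {
  assert(commutableRange(false, false) == std::make_pair(1u, 3u)); // unmasked
  assert(commutableRange(true, false)  == std::make_pair(1u, 4u)); // zero-masked
  assert(commutableRange(true, true)   == std::make_pair(3u, 4u)); // merge-masked
  return 0;
}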
1891
1892
bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
1893
668k
                                         unsigned &SrcOpIdx2) const {
1894
668k
  const MCInstrDesc &Desc = MI.getDesc();
1895
668k
  if (!Desc.isCommutable())
1896
132k
    return false;
1897
535k
1898
535k
  switch (MI.getOpcode()) {
1899
535k
  case X86::CMPSDrr:
1900
6.49k
  case X86::CMPSSrr:
1901
6.49k
  case X86::CMPPDrri:
1902
6.49k
  case X86::CMPPSrri:
1903
6.49k
  case X86::VCMPSDrr:
1904
6.49k
  case X86::VCMPSSrr:
1905
6.49k
  case X86::VCMPPDrri:
1906
6.49k
  case X86::VCMPPSrri:
1907
6.49k
  case X86::VCMPPDYrri:
1908
6.49k
  case X86::VCMPPSYrri:
1909
6.49k
  case X86::VCMPSDZrr:
1910
6.49k
  case X86::VCMPSSZrr:
1911
6.49k
  case X86::VCMPPDZrri:
1912
6.49k
  case X86::VCMPPSZrri:
1913
6.49k
  case X86::VCMPPDZ128rri:
1914
6.49k
  case X86::VCMPPSZ128rri:
1915
6.49k
  case X86::VCMPPDZ256rri:
1916
6.49k
  case X86::VCMPPSZ256rri:
1917
6.49k
  case X86::VCMPPDZrrik:
1918
6.49k
  case X86::VCMPPSZrrik:
1919
6.49k
  case X86::VCMPPDZ128rrik:
1920
6.49k
  case X86::VCMPPSZ128rrik:
1921
6.49k
  case X86::VCMPPDZ256rrik:
1922
6.49k
  case X86::VCMPPSZ256rrik: {
1923
6.49k
    unsigned OpOffset = X86II::isKMasked(Desc.TSFlags) ? 
1173
:
06.32k
;
1924
6.49k
1925
6.49k
    // Float comparison can be safely commuted for
1926
6.49k
    // Ordered/Unordered/Equal/NotEqual tests
1927
6.49k
    unsigned Imm = MI.getOperand(3 + OpOffset).getImm() & 0x7;
1928
6.49k
    switch (Imm) {
1929
6.49k
    case 0x00: // EQUAL
1930
4.42k
    case 0x03: // UNORDERED
1931
4.42k
    case 0x04: // NOT EQUAL
1932
4.42k
    case 0x07: // ORDERED
1933
4.42k
      // The indices of the commutable operands are 1 and 2 (or 2 and 3
1934
4.42k
      // when masked).
1935
4.42k
      // Assign them to the returned operand indices here.
1936
4.42k
      return fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 1 + OpOffset,
1937
4.42k
                                  2 + OpOffset);
1938
2.07k
    }
1939
2.07k
    return false;
1940
2.07k
  }
1941
2.07k
  case X86::MOVSSrr:
1942
372
    // X86::MOVSDrr is always commutable. MOVSS is only commutable if we can
1943
372
    // form sse4.1 blend. We assume VMOVSSrr/VMOVSDrr is always commutable since
1944
372
    // AVX implies sse4.1.
1945
372
    if (Subtarget.hasSSE41())
1946
4
      return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
1947
368
    return false;
1948
708
  case X86::SHUFPDrri:
1949
708
    // We can commute this to MOVSD.
1950
708
    if (MI.getOperand(3).getImm() == 0x02)
1951
622
      return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
1952
86
    return false;
1953
2.92k
  case X86::MOVHLPSrr:
1954
2.92k
  case X86::UNPCKHPDrr:
1955
2.92k
  case X86::VMOVHLPSrr:
1956
2.92k
  case X86::VUNPCKHPDrr:
1957
2.92k
  case X86::VMOVHLPSZrr:
1958
2.92k
  case X86::VUNPCKHPDZ128rr:
1959
2.92k
    if (Subtarget.hasSSE2())
1960
2.80k
      return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
1961
117
    return false;
1962
2.41k
  case X86::VPTERNLOGDZrri:      case X86::VPTERNLOGDZrmi:
1963
2.41k
  case X86::VPTERNLOGDZ128rri:   case X86::VPTERNLOGDZ128rmi:
1964
2.41k
  case X86::VPTERNLOGDZ256rri:   case X86::VPTERNLOGDZ256rmi:
1965
2.41k
  case X86::VPTERNLOGQZrri:      case X86::VPTERNLOGQZrmi:
1966
2.41k
  case X86::VPTERNLOGQZ128rri:   case X86::VPTERNLOGQZ128rmi:
1967
2.41k
  case X86::VPTERNLOGQZ256rri:   case X86::VPTERNLOGQZ256rmi:
1968
2.41k
  case X86::VPTERNLOGDZrrik:
1969
2.41k
  case X86::VPTERNLOGDZ128rrik:
1970
2.41k
  case X86::VPTERNLOGDZ256rrik:
1971
2.41k
  case X86::VPTERNLOGQZrrik:
1972
2.41k
  case X86::VPTERNLOGQZ128rrik:
1973
2.41k
  case X86::VPTERNLOGQZ256rrik:
1974
2.41k
  case X86::VPTERNLOGDZrrikz:    case X86::VPTERNLOGDZrmikz:
1975
2.41k
  case X86::VPTERNLOGDZ128rrikz: case X86::VPTERNLOGDZ128rmikz:
1976
2.41k
  case X86::VPTERNLOGDZ256rrikz: case X86::VPTERNLOGDZ256rmikz:
1977
2.41k
  case X86::VPTERNLOGQZrrikz:    case X86::VPTERNLOGQZrmikz:
1978
2.41k
  case X86::VPTERNLOGQZ128rrikz: case X86::VPTERNLOGQZ128rmikz:
1979
2.41k
  case X86::VPTERNLOGQZ256rrikz: case X86::VPTERNLOGQZ256rmikz:
1980
2.41k
  case X86::VPTERNLOGDZ128rmbi:
1981
2.41k
  case X86::VPTERNLOGDZ256rmbi:
1982
2.41k
  case X86::VPTERNLOGDZrmbi:
1983
2.41k
  case X86::VPTERNLOGQZ128rmbi:
1984
2.41k
  case X86::VPTERNLOGQZ256rmbi:
1985
2.41k
  case X86::VPTERNLOGQZrmbi:
1986
2.41k
  case X86::VPTERNLOGDZ128rmbikz:
1987
2.41k
  case X86::VPTERNLOGDZ256rmbikz:
1988
2.41k
  case X86::VPTERNLOGDZrmbikz:
1989
2.41k
  case X86::VPTERNLOGQZ128rmbikz:
1990
2.41k
  case X86::VPTERNLOGQZ256rmbikz:
1991
2.41k
  case X86::VPTERNLOGQZrmbikz:
1992
2.41k
    return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
1993
2.41k
  case X86::VPMADD52HUQZ128r:
1994
1.13k
  case X86::VPMADD52HUQZ128rk:
1995
1.13k
  case X86::VPMADD52HUQZ128rkz:
1996
1.13k
  case X86::VPMADD52HUQZ256r:
1997
1.13k
  case X86::VPMADD52HUQZ256rk:
1998
1.13k
  case X86::VPMADD52HUQZ256rkz:
1999
1.13k
  case X86::VPMADD52HUQZr:
2000
1.13k
  case X86::VPMADD52HUQZrk:
2001
1.13k
  case X86::VPMADD52HUQZrkz:
2002
1.13k
  case X86::VPMADD52LUQZ128r:
2003
1.13k
  case X86::VPMADD52LUQZ128rk:
2004
1.13k
  case X86::VPMADD52LUQZ128rkz:
2005
1.13k
  case X86::VPMADD52LUQZ256r:
2006
1.13k
  case X86::VPMADD52LUQZ256rk:
2007
1.13k
  case X86::VPMADD52LUQZ256rkz:
2008
1.13k
  case X86::VPMADD52LUQZr:
2009
1.13k
  case X86::VPMADD52LUQZrk:
2010
1.13k
  case X86::VPMADD52LUQZrkz: {
2011
1.13k
    unsigned CommutableOpIdx1 = 2;
2012
1.13k
    unsigned CommutableOpIdx2 = 3;
2013
1.13k
    if (X86II::isKMasked(Desc.TSFlags)) {
2014
880
      // Skip the mask register.
2015
880
      ++CommutableOpIdx1;
2016
880
      ++CommutableOpIdx2;
2017
880
    }
2018
1.13k
    if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2,
2019
1.13k
                              CommutableOpIdx1, CommutableOpIdx2))
2020
656
      return false;
2021
480
    if (!MI.getOperand(SrcOpIdx1).isReg() ||
2022
480
        !MI.getOperand(SrcOpIdx2).isReg())
2023
0
      // No idea.
2024
0
      return false;
2025
480
    return true;
2026
480
  }
2027
480
2028
521k
  default:
2029
521k
    const X86InstrFMA3Group *FMA3Group = getFMA3Group(MI.getOpcode(),
2030
521k
                                                      MI.getDesc().TSFlags);
2031
521k
    if (FMA3Group)
2032
13.5k
      return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2,
2033
13.5k
                                           FMA3Group->isIntrinsic());
2034
507k
2035
507k
    // Handled masked instructions since we need to skip over the mask input
2036
507k
    // and the preserved input.
2037
507k
    if (X86II::isKMasked(Desc.TSFlags)) {
2038
6.14k
      // First assume that the first input is the mask operand and skip past it.
2039
6.14k
      unsigned CommutableOpIdx1 = Desc.getNumDefs() + 1;
2040
6.14k
      unsigned CommutableOpIdx2 = Desc.getNumDefs() + 2;
2041
6.14k
      // Check if the first input is tied. If there isn't one then we only
2042
6.14k
      // need to skip the mask operand which we did above.
2043
6.14k
      if ((MI.getDesc().getOperandConstraint(Desc.getNumDefs(),
2044
6.14k
                                             MCOI::TIED_TO) != -1)) {
2045
4.08k
        // If this is zero masking instruction with a tied operand, we need to
2046
4.08k
        // move the first index back to the first input since this must
2047
4.08k
        // be a 3 input instruction and we want the first two non-mask inputs.
2048
4.08k
        // Otherwise this is a 2 input instruction with a preserved input and
2049
4.08k
        // mask, so we need to move the indices to skip one more input.
2050
4.08k
        if (X86II::isKMergeMasked(Desc.TSFlags)) {
2051
2.88k
          ++CommutableOpIdx1;
2052
2.88k
          ++CommutableOpIdx2;
2053
2.88k
        } else {
2054
1.20k
          --CommutableOpIdx1;
2055
1.20k
        }
2056
4.08k
      }
2057
6.14k
2058
6.14k
      if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2,
2059
6.14k
                                CommutableOpIdx1, CommutableOpIdx2))
2060
2.25k
        return false;
2061
3.89k
2062
3.89k
      if (!MI.getOperand(SrcOpIdx1).isReg() ||
2063
3.89k
          !MI.getOperand(SrcOpIdx2).isReg())
2064
0
        // No idea.
2065
0
        return false;
2066
3.89k
      return true;
2067
3.89k
    }
2068
501k
2069
501k
    return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
2070
0
  }
2071
0
  return false;
2072
0
}
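Only the four symmetric predicates are accepted in the VCMP case above because EQ, NE, ORD and UNORD are the only tests whose result is unchanged when the two inputs are exchanged; LT, for example, is not. A small standalone illustration with scalar doubles (the predicate helpers below are models written for this example, not LLVM code):

#include <cassert>
#include <cmath>

// Scalar models of a few VCMP predicates (immediate values 0x0..0x7).
static bool cmpEQ(double A, double B)    { return A == B; }               // 0x00
static bool cmpLT(double A, double B)    { return A < B; }                // 0x01
static bool cmpUNORD(double A, double B) { return std::isnan(A) || std::isnan(B); } // 0x03
static bool cmpNEQ(double A, double B)   { return !(A == B); }            // 0x04
static bool cmpORD(double A, double B)   { return !std::isnan(A) && !std::isnan(B); } // 0x07

int main() {
  double A = 1.0, B = 2.0, N = std::nan("");
  // Symmetric predicates: swapping the operands never changes the result.
  assert(cmpEQ(A, B) == cmpEQ(B, A) && cmpEQ(A, N) == cmpEQ(N, A));
  assert(cmpNEQ(A, B) == cmpNEQ(B, A));
  assert(cmpUNORD(A, N) == cmpUNORD(N, A));
  assert(cmpORD(A, B) == cmpORD(B, A));
  // A non-symmetric predicate: LT flips, so it is rejected for commuting.
  assert(cmpLT(A, B) != cmpLT(B, A));
  return 0;
}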
X86::CondCode X86::getCondFromBranch(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default: return X86::COND_INVALID;
  case X86::JCC_1:
    return static_cast<X86::CondCode>(
        MI.getOperand(MI.getDesc().getNumOperands() - 1).getImm());
  }
}

/// Return condition code of a SETCC opcode.
X86::CondCode X86::getCondFromSETCC(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default: return X86::COND_INVALID;
  case X86::SETCCr: case X86::SETCCm:
    return static_cast<X86::CondCode>(
        MI.getOperand(MI.getDesc().getNumOperands() - 1).getImm());
  }
}

/// Return condition code of a CMov opcode.
X86::CondCode X86::getCondFromCMov(const MachineInstr &MI) {
  switch (MI.getOpcode()) {
  default: return X86::COND_INVALID;
  case X86::CMOV16rr: case X86::CMOV32rr: case X86::CMOV64rr:
  case X86::CMOV16rm: case X86::CMOV32rm: case X86::CMOV64rm:
    return static_cast<X86::CondCode>(
        MI.getOperand(MI.getDesc().getNumOperands() - 1).getImm());
  }
}
/// Return the inverse of the specified condition,
/// e.g. turning COND_E to COND_NE.
X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) {
  switch (CC) {
  default: llvm_unreachable("Illegal condition code!");
  case X86::COND_E:  return X86::COND_NE;
  case X86::COND_NE: return X86::COND_E;
  case X86::COND_L:  return X86::COND_GE;
  case X86::COND_LE: return X86::COND_G;
  case X86::COND_G:  return X86::COND_LE;
  case X86::COND_GE: return X86::COND_L;
  case X86::COND_B:  return X86::COND_AE;
  case X86::COND_BE: return X86::COND_A;
  case X86::COND_A:  return X86::COND_BE;
  case X86::COND_AE: return X86::COND_B;
  case X86::COND_S:  return X86::COND_NS;
  case X86::COND_NS: return X86::COND_S;
  case X86::COND_P:  return X86::COND_NP;
  case X86::COND_NP: return X86::COND_P;
  case X86::COND_O:  return X86::COND_NO;
  case X86::COND_NO: return X86::COND_O;
  case X86::COND_NE_OR_P:  return X86::COND_E_AND_NP;
  case X86::COND_E_AND_NP: return X86::COND_NE_OR_P;
  }
}
/// Assuming the flags are set by MI(a,b), return the condition code if we
/// modify the instructions such that flags are set by MI(b,a).
static X86::CondCode getSwappedCondition(X86::CondCode CC) {
  switch (CC) {
  default: return X86::COND_INVALID;
  case X86::COND_E:  return X86::COND_E;
  case X86::COND_NE: return X86::COND_NE;
  case X86::COND_L:  return X86::COND_G;
  case X86::COND_LE: return X86::COND_GE;
  case X86::COND_G:  return X86::COND_L;
  case X86::COND_GE: return X86::COND_LE;
  case X86::COND_B:  return X86::COND_A;
  case X86::COND_BE: return X86::COND_AE;
  case X86::COND_A:  return X86::COND_B;
  case X86::COND_AE: return X86::COND_BE;
  }
}
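The distinction between GetOppositeBranchCondition and getSwappedCondition is easy to conflate: the former negates the predicate, the latter keeps the predicate but assumes the compare's two operands were exchanged. A short standalone illustration (not LLVM code):

#include <cassert>

int main() {
  int A = 3, B = 7;
  // "Opposite" negates the predicate: !(a < b) is (a >= b).
  assert((A < B) == !(A >= B));
  // "Swapped" keeps the predicate but exchanges the operands that produced
  // the flags: (a < b) is the same test as (b > a), not (b >= a) in general.
  assert((A < B) == (B > A));
  assert((A < A) != (A >= A)); // The opposite form differs on equal inputs...
  assert((A < A) == (A > A));  // ...while the swapped form agrees.
  return 0;
}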
std::pair<X86::CondCode, bool>
X86::getX86ConditionCode(CmpInst::Predicate Predicate) {
  X86::CondCode CC = X86::COND_INVALID;
  bool NeedSwap = false;
  switch (Predicate) {
  default: break;
  // Floating-point Predicates
  case CmpInst::FCMP_UEQ: CC = X86::COND_E;       break;
  case CmpInst::FCMP_OLT: NeedSwap = true;        LLVM_FALLTHROUGH;
  case CmpInst::FCMP_OGT: CC = X86::COND_A;       break;
  case CmpInst::FCMP_OLE: NeedSwap = true;        LLVM_FALLTHROUGH;
  case CmpInst::FCMP_OGE: CC = X86::COND_AE;      break;
  case CmpInst::FCMP_UGT: NeedSwap = true;        LLVM_FALLTHROUGH;
  case CmpInst::FCMP_ULT: CC = X86::COND_B;       break;
  case CmpInst::FCMP_UGE: NeedSwap = true;        LLVM_FALLTHROUGH;
  case CmpInst::FCMP_ULE: CC = X86::COND_BE;      break;
  case CmpInst::FCMP_ONE: CC = X86::COND_NE;      break;
  case CmpInst::FCMP_UNO: CC = X86::COND_P;       break;
  case CmpInst::FCMP_ORD: CC = X86::COND_NP;      break;
  case CmpInst::FCMP_OEQ: LLVM_FALLTHROUGH;
  case CmpInst::FCMP_UNE: CC = X86::COND_INVALID; break;

  // Integer Predicates
  case CmpInst::ICMP_EQ:  CC = X86::COND_E;       break;
  case CmpInst::ICMP_NE:  CC = X86::COND_NE;      break;
  case CmpInst::ICMP_UGT: CC = X86::COND_A;       break;
  case CmpInst::ICMP_UGE: CC = X86::COND_AE;      break;
  case CmpInst::ICMP_ULT: CC = X86::COND_B;       break;
  case CmpInst::ICMP_ULE: CC = X86::COND_BE;      break;
  case CmpInst::ICMP_SGT: CC = X86::COND_G;       break;
  case CmpInst::ICMP_SGE: CC = X86::COND_GE;      break;
  case CmpInst::ICMP_SLT: CC = X86::COND_L;       break;
  case CmpInst::ICMP_SLE: CC = X86::COND_LE;      break;
  }

  return std::make_pair(CC, NeedSwap);
}
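The NeedSwap flag returned here tells the caller to exchange the compare operands before testing the condition. FCMP_OLT, for instance, cannot simply use COND_B, because the flags UCOMISS/UCOMISD produce for an unordered pair would also satisfy "below"; instead the operands are swapped and COND_A is used. A small standalone illustration of the ordered-compare behaviour this relies on (not LLVM code):

#include <cassert>
#include <cmath>

int main() {
  double A = 1.0, B = 2.0, N = std::nan("");
  // FCMP_OLT maps to {COND_A, NeedSwap=true}: "a <o b" is tested as "b >o a".
  assert((A < B) == (B > A));
  // Both ordered forms are false on unordered inputs, so the swapped "above"
  // test stays correct even when one operand is NaN.
  assert(!(A < N) && !(N > A));
  return 0;
}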
/// Return a setcc opcode based on whether it has memory operand.
unsigned X86::getSETOpc(bool HasMemoryOperand) {
  return HasMemoryOperand ? X86::SETCCr : X86::SETCCm;
}

/// Return a cmov opcode for the given register size in bytes, and operand type.
unsigned X86::getCMovOpcode(unsigned RegBytes, bool HasMemoryOperand) {
  switch(RegBytes) {
  default: llvm_unreachable("Illegal register size!");
  case 2: return HasMemoryOperand ? X86::CMOV16rm : X86::CMOV16rr;
  case 4: return HasMemoryOperand ? X86::CMOV32rm : X86::CMOV32rr;
  case 8: return HasMemoryOperand ? X86::CMOV32rm : X86::CMOV64rr;
  }
}
/// Get the VPCMP immediate for the given condition.
unsigned X86::getVPCMPImmForCond(ISD::CondCode CC) {
  switch (CC) {
  default: llvm_unreachable("Unexpected SETCC condition");
  case ISD::SETNE:  return 4;
  case ISD::SETEQ:  return 0;
  case ISD::SETULT:
  case ISD::SETLT:  return 1;
  case ISD::SETUGT:
  case ISD::SETGT:  return 6;
  case ISD::SETUGE:
  case ISD::SETGE:  return 5;
  case ISD::SETULE:
  case ISD::SETLE:  return 2;
  }
}

/// Get the VPCMP immediate if the opcodes are swapped.
unsigned X86::getSwappedVPCMPImm(unsigned Imm) {
  switch (Imm) {
  default: llvm_unreachable("Unreachable!");
  case 0x01: Imm = 0x06; break; // LT -> NLE
  case 0x02: Imm = 0x05; break; // LE -> NLT
  case 0x05: Imm = 0x02; break; // NLT -> LE
  case 0x06: Imm = 0x01; break; // NLE -> LT
  case 0x00: // EQ
  case 0x03: // FALSE
  case 0x04: // NE
  case 0x07: // TRUE
    break;
  }

  return Imm;
}
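The table above is just the observation that, with both operands exchanged, "less than" must become "not less-or-equal" (greater than) and so on, while EQ/NE/FALSE/TRUE are symmetric and stay put. A standalone spot-check of the two swapped pairs using plain integers (illustration only):

#include <cassert>

// Integer models of the swapped VPCMP predicates used above.
static bool lt(int A, int B)  { return A <  B; }    // imm 0x01
static bool le(int A, int B)  { return A <= B; }    // imm 0x02
static bool nlt(int A, int B) { return !(A <  B); } // imm 0x05
static bool nle(int A, int B) { return !(A <= B); } // imm 0x06

int main() {
  for (int A = -2; A <= 2; ++A)
    for (int B = -2; B <= 2; ++B) {
      assert(lt(A, B) == nle(B, A)); // 0x01 <-> 0x06
      assert(le(A, B) == nlt(B, A)); // 0x02 <-> 0x05
    }
  return 0;
}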
/// Get the VPCOM immediate if the opcodes are swapped.
unsigned X86::getSwappedVPCOMImm(unsigned Imm) {
  switch (Imm) {
  default: llvm_unreachable("Unreachable!");
  case 0x00: Imm = 0x02; break; // LT -> GT
  case 0x01: Imm = 0x03; break; // LE -> GE
  case 0x02: Imm = 0x00; break; // GT -> LT
  case 0x03: Imm = 0x01; break; // GE -> LE
  case 0x04: // EQ
  case 0x05: // NE
  case 0x06: // FALSE
  case 0x07: // TRUE
    break;
  }

  return Imm;
}
bool X86InstrInfo::isUnpredicatedTerminator(const MachineInstr &MI) const {
  if (!MI.isTerminator()) return false;

  // Conditional branch is a special case.
  if (MI.isBranch() && !MI.isBarrier())
    return true;
  if (!MI.isPredicable())
    return true;
  return !isPredicated(MI);
}

bool X86InstrInfo::isUnconditionalTailCall(const MachineInstr &MI) const {
  switch (MI.getOpcode()) {
  case X86::TCRETURNdi:
  case X86::TCRETURNri:
  case X86::TCRETURNmi:
  case X86::TCRETURNdi64:
  case X86::TCRETURNri64:
  case X86::TCRETURNmi64:
    return true;
  default:
    return false;
  }
}
2278
2279
bool X86InstrInfo::canMakeTailCallConditional(
2280
    SmallVectorImpl<MachineOperand> &BranchCond,
2281
30
    const MachineInstr &TailCall) const {
2282
30
  if (TailCall.getOpcode() != X86::TCRETURNdi &&
2283
30
      
TailCall.getOpcode() != X86::TCRETURNdi6427
) {
2284
0
    // Only direct calls can be done with a conditional branch.
2285
0
    return false;
2286
0
  }
2287
30
2288
30
  const MachineFunction *MF = TailCall.getParent()->getParent();
2289
30
  if (Subtarget.isTargetWin64() && 
MF->hasWinCFI()3
) {
2290
0
    // Conditional tail calls confuse the Win64 unwinder.
2291
0
    return false;
2292
0
  }
2293
30
2294
30
  assert(BranchCond.size() == 1);
2295
30
  if (BranchCond[0].getImm() > X86::LAST_VALID_COND) {
2296
0
    // Can't make a conditional tail call with this condition.
2297
0
    return false;
2298
0
  }
2299
30
2300
30
  const X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
2301
30
  if (X86FI->getTCReturnAddrDelta() != 0 ||
2302
30
      TailCall.getOperand(1).getImm() != 0) {
2303
0
    // A conditional tail call cannot do any stack adjustment.
2304
0
    return false;
2305
0
  }
2306
30
2307
30
  return true;
2308
30
}
2309
2310
void X86InstrInfo::replaceBranchWithTailCall(
2311
    MachineBasicBlock &MBB, SmallVectorImpl<MachineOperand> &BranchCond,
2312
30
    const MachineInstr &TailCall) const {
2313
30
  assert(canMakeTailCallConditional(BranchCond, TailCall));
2314
30
2315
30
  MachineBasicBlock::iterator I = MBB.end();
2316
30
  while (I != MBB.begin()) {
2317
30
    --I;
2318
30
    if (I->isDebugInstr())
2319
0
      continue;
2320
30
    if (!I->isBranch())
2321
30
      assert(0 && "Can't find the branch to replace!");
2322
30
2323
30
    X86::CondCode CC = X86::getCondFromBranch(*I);
2324
30
    assert(BranchCond.size() == 1);
2325
30
    if (CC != BranchCond[0].getImm())
2326
0
      continue;
2327
30
2328
30
    break;
2329
30
  }
2330
30
2331
30
  unsigned Opc = TailCall.getOpcode() == X86::TCRETURNdi ? 
X86::TCRETURNdicc3
2332
30
                                                         : 
X86::TCRETURNdi64cc27
;
2333
30
2334
30
  auto MIB = BuildMI(MBB, I, MBB.findDebugLoc(I), get(Opc));
2335
30
  MIB->addOperand(TailCall.getOperand(0)); // Destination.
2336
30
  MIB.addImm(0); // Stack offset (not used).
2337
30
  MIB->addOperand(BranchCond[0]); // Condition.
2338
30
  MIB.copyImplicitOps(TailCall); // Regmask and (imp-used) parameters.
2339
30
2340
30
  // Add implicit uses and defs of all live regs potentially clobbered by the
2341
30
  // call. This way they still appear live across the call.
2342
30
  LivePhysRegs LiveRegs(getRegisterInfo());
2343
30
  LiveRegs.addLiveOuts(MBB);
2344
30
  SmallVector<std::pair<MCPhysReg, const MachineOperand *>, 8> Clobbers;
2345
30
  LiveRegs.stepForward(*MIB, Clobbers);
2346
115
  for (const auto &C : Clobbers) {
2347
115
    MIB.addReg(C.first, RegState::Implicit);
2348
115
    MIB.addReg(C.first, RegState::Implicit | RegState::Define);
2349
115
  }
2350
30
2351
30
  I->eraseFromParent();
2352
30
}
// Given a MBB and its TBB, find the FBB which was a fallthrough MBB (it may
// not be a fallthrough MBB now due to layout changes). Return nullptr if the
// fallthrough MBB cannot be identified.
static MachineBasicBlock *getFallThroughMBB(MachineBasicBlock *MBB,
                                            MachineBasicBlock *TBB) {
  // Look for non-EHPad successors other than TBB. If we find exactly one, it
  // is the fallthrough MBB. If we find zero, then TBB is both the target MBB
  // and fallthrough MBB. If we find more than one, we cannot identify the
  // fallthrough MBB and should return nullptr.
  MachineBasicBlock *FallthroughBB = nullptr;
  for (auto SI = MBB->succ_begin(), SE = MBB->succ_end(); SI != SE; ++SI) {
    if ((*SI)->isEHPad() || (*SI == TBB && FallthroughBB))
      continue;
    // Return a nullptr if we found more than one fallthrough successor.
    if (FallthroughBB && FallthroughBB != TBB)
      return nullptr;
    FallthroughBB = *SI;
  }
  return FallthroughBB;
}
2374
2375
bool X86InstrInfo::AnalyzeBranchImpl(
2376
    MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB,
2377
    SmallVectorImpl<MachineOperand> &Cond,
2378
7.22M
    SmallVectorImpl<MachineInstr *> &CondBranches, bool AllowModify) const {
2379
7.22M
2380
7.22M
  // Start from the bottom of the block and work up, examining the
2381
7.22M
  // terminator instructions.
2382
7.22M
  MachineBasicBlock::iterator I = MBB.end();
2383
7.22M
  MachineBasicBlock::iterator UnCondBrIter = MBB.end();
2384
12.6M
  while (I != MBB.begin()) {
2385
12.5M
    --I;
2386
12.5M
    if (I->isDebugInstr())
2387
714
      continue;
2388
12.5M
2389
12.5M
    // Working from the bottom, when we see a non-terminator instruction, we're
2390
12.5M
    // done.
2391
12.5M
    if (!isUnpredicatedTerminator(*I))
2392
6.29M
      break;
2393
6.28M
2394
6.28M
    // A terminator that isn't a branch can't easily be handled by this
2395
6.28M
    // analysis.
2396
6.28M
    if (!I->isBranch())
2397
785k
      return true;
2398
5.49M
2399
5.49M
    // Handle unconditional branches.
2400
5.49M
    if (I->getOpcode() == X86::JMP_1) {
2401
1.27M
      UnCondBrIter = I;
2402
1.27M
2403
1.27M
      if (!AllowModify) {
2404
471k
        TBB = I->getOperand(0).getMBB();
2405
471k
        continue;
2406
471k
      }
2407
805k
2408
805k
      // If the block has any instructions after a JMP, delete them.
2409
805k
      while (std::next(I) != MBB.end())
2410
0
        std::next(I)->eraseFromParent();
2411
805k
2412
805k
      Cond.clear();
2413
805k
      FBB = nullptr;
2414
805k
2415
805k
      // Delete the JMP if it's equivalent to a fall-through.
2416
805k
      if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
2417
147k
        TBB = nullptr;
2418
147k
        I->eraseFromParent();
2419
147k
        I = MBB.end();
2420
147k
        UnCondBrIter = MBB.end();
2421
147k
        continue;
2422
147k
      }
2423
658k
2424
658k
      // TBB is used to indicate the unconditional destination.
2425
658k
      TBB = I->getOperand(0).getMBB();
2426
658k
      continue;
2427
658k
    }
2428
4.22M
2429
4.22M
    // Handle conditional branches.
2430
4.22M
    X86::CondCode BranchCode = X86::getCondFromBranch(*I);
2431
4.22M
    if (BranchCode == X86::COND_INVALID)
2432
30.2k
      return true;  // Can't handle indirect branch.
2433
4.19M
2434
4.19M
    // In practice we should never have an undef eflags operand, if we do
2435
4.19M
    // abort here as we are not prepared to preserve the flag.
2436
4.19M
    if (I->findRegisterUseOperand(X86::EFLAGS)->isUndef())
2437
48
      return true;
2438
4.19M
2439
4.19M
    // Working from the bottom, handle the first conditional branch.
2440
4.19M
    if (Cond.empty()) {
2441
4.18M
      MachineBasicBlock *TargetBB = I->getOperand(0).getMBB();
2442
4.18M
      if (AllowModify && UnCondBrIter != MBB.end() &&
          MBB.isLayoutSuccessor(TargetBB)) {
2444
18.0k
        // If we can modify the code and it ends in something like:
2445
18.0k
        //
2446
18.0k
        //     jCC L1
2447
18.0k
        //     jmp L2
2448
18.0k
        //   L1:
2449
18.0k
        //     ...
2450
18.0k
        //   L2:
2451
18.0k
        //
2452
18.0k
        // Then we can change this to:
2453
18.0k
        //
2454
18.0k
        //     jnCC L2
2455
18.0k
        //   L1:
2456
18.0k
        //     ...
2457
18.0k
        //   L2:
2458
18.0k
        //
2459
18.0k
        // Which is a bit more efficient.
2460
18.0k
        // We conditionally jump to the fall-through block.
2461
18.0k
        BranchCode = GetOppositeBranchCondition(BranchCode);
2462
18.0k
        MachineBasicBlock::iterator OldInst = I;
2463
18.0k
2464
18.0k
        BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(X86::JCC_1))
2465
18.0k
          .addMBB(UnCondBrIter->getOperand(0).getMBB())
2466
18.0k
          .addImm(BranchCode);
2467
18.0k
        BuildMI(MBB, UnCondBrIter, MBB.findDebugLoc(I), get(X86::JMP_1))
2468
18.0k
          .addMBB(TargetBB);
2469
18.0k
2470
18.0k
        OldInst->eraseFromParent();
2471
18.0k
        UnCondBrIter->eraseFromParent();
2472
18.0k
2473
18.0k
        // Restart the analysis.
2474
18.0k
        UnCondBrIter = MBB.end();
2475
18.0k
        I = MBB.end();
2476
18.0k
        continue;
2477
18.0k
      }
2478
4.16M
2479
4.16M
      FBB = TBB;
2480
4.16M
      TBB = I->getOperand(0).getMBB();
2481
4.16M
      Cond.push_back(MachineOperand::CreateImm(BranchCode));
2482
4.16M
      CondBranches.push_back(&*I);
2483
4.16M
      continue;
2484
4.16M
    }
2485
7.57k
2486
7.57k
    // Handle subsequent conditional branches. Only handle the case where all
2487
7.57k
    // conditional branches branch to the same destination and their condition
2488
7.57k
    // opcodes fit one of the special multi-branch idioms.
2489
7.57k
    assert(Cond.size() == 1);
2490
7.57k
    assert(TBB);
2491
7.57k
2492
7.57k
    // If the conditions are the same, we can leave them alone.
2493
7.57k
    X86::CondCode OldBranchCode = (X86::CondCode)Cond[0].getImm();
2494
7.57k
    auto NewTBB = I->getOperand(0).getMBB();
2495
7.57k
    if (OldBranchCode == BranchCode && TBB == NewTBB)
2496
0
      continue;
2497
7.57k
2498
7.57k
    // If they differ, see if they fit one of the known patterns. Theoretically,
2499
7.57k
    // we could handle more patterns here, but we shouldn't expect to see them
2500
7.57k
    // if instruction selection has done a reasonable job.
2501
7.57k
    if (TBB == NewTBB &&
2502
7.57k
               
               ((OldBranchCode == X86::COND_P && BranchCode == X86::COND_NE) ||
                (OldBranchCode == X86::COND_NE && BranchCode == X86::COND_P))) {
2504
1.47k
      BranchCode = X86::COND_NE_OR_P;
2505
6.10k
    } else if ((OldBranchCode == X86::COND_NP && BranchCode == X86::COND_NE) ||
               (OldBranchCode == X86::COND_E && BranchCode == X86::COND_P)) {
2507
1.55k
      if (NewTBB != (FBB ? FBB : getFallThroughMBB(&MBB, TBB)))
2508
0
        return true;
2509
1.55k
2510
1.55k
      // X86::COND_E_AND_NP usually has two different branch destinations.
2511
1.55k
      //
2512
1.55k
      // JP B1
2513
1.55k
      // JE B2
2514
1.55k
      // JMP B1
2515
1.55k
      // B1:
2516
1.55k
      // B2:
2517
1.55k
      //
2518
1.55k
      // Here this condition branches to B2 only if NP && E. It has another
2519
1.55k
      // equivalent form:
2520
1.55k
      //
2521
1.55k
      // JNE B1
2522
1.55k
      // JNP B2
2523
1.55k
      // JMP B1
2524
1.55k
      // B1:
2525
1.55k
      // B2:
2526
1.55k
      //
2527
1.55k
      // Similarly it branches to B2 only if E && NP. That is why this condition
2528
1.55k
      // is named with COND_E_AND_NP.
2529
1.55k
      BranchCode = X86::COND_E_AND_NP;
2530
1.55k
    } else
2531
4.55k
      return true;
2532
3.02k
2533
3.02k
    // Update the MachineOperand.
2534
3.02k
    Cond[0].setImm(BranchCode);
2535
3.02k
    CondBranches.push_back(&*I);
2536
3.02k
  }
2537
7.22M
2538
7.22M
  
  return false;
2539
7.22M
}
2540
2541
bool X86InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
2542
                                 MachineBasicBlock *&TBB,
2543
                                 MachineBasicBlock *&FBB,
2544
                                 SmallVectorImpl<MachineOperand> &Cond,
2545
7.22M
                                 bool AllowModify) const {
2546
7.22M
  SmallVector<MachineInstr *, 4> CondBranches;
2547
7.22M
  return AnalyzeBranchImpl(MBB, TBB, FBB, Cond, CondBranches, AllowModify);
2548
7.22M
}
2549
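The wrapper above simply forwards to AnalyzeBranchImpl and discards the CondBranches list. A minimal, hedged sketch of the usual caller contract, assuming a TargetInstrInfo pointer TII and a block MBB are in scope:

  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
  SmallVector<MachineOperand, 4> Cond;
  if (!TII->analyzeBranch(MBB, TBB, FBB, Cond, /*AllowModify=*/false)) {
    // Cond.empty() && !TBB : the block falls through to its layout successor.
    // Cond.empty() && TBB  : the block ends in an unconditional JMP_1 to TBB.
    // !Cond.empty() && !FBB: conditional jump to TBB, otherwise fall through.
    // !Cond.empty() && FBB : conditional jump to TBB, then unconditional jump to FBB.
    // Cond[0] carries an X86::CondCode immediate, as pushed by AnalyzeBranchImpl.
  }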
2550
bool X86InstrInfo::analyzeBranchPredicate(MachineBasicBlock &MBB,
2551
                                          MachineBranchPredicate &MBP,
2552
60
                                          bool AllowModify) const {
2553
60
  using namespace std::placeholders;
2554
60
2555
60
  SmallVector<MachineOperand, 4> Cond;
2556
60
  SmallVector<MachineInstr *, 4> CondBranches;
2557
60
  if (AnalyzeBranchImpl(MBB, MBP.TrueDest, MBP.FalseDest, Cond, CondBranches,
2558
60
                        AllowModify))
2559
0
    return true;
2560
60
2561
60
  if (Cond.size() != 1)
2562
0
    return true;
2563
60
2564
60
  assert(MBP.TrueDest && "expected!");
2565
60
2566
60
  if (!MBP.FalseDest)
2567
58
    MBP.FalseDest = MBB.getNextNode();
2568
60
2569
60
  const TargetRegisterInfo *TRI = &getRegisterInfo();
2570
60
2571
60
  MachineInstr *ConditionDef = nullptr;
2572
60
  bool SingleUseCondition = true;
2573
60
2574
63
  for (auto I = std::next(MBB.rbegin()), E = MBB.rend(); I != E; ++I) {
2575
63
    if (I->modifiesRegister(X86::EFLAGS, TRI)) {
2576
60
      ConditionDef = &*I;
2577
60
      break;
2578
60
    }
2579
3
2580
3
    if (I->readsRegister(X86::EFLAGS, TRI))
2581
2
      SingleUseCondition = false;
2582
3
  }
2583
60
2584
60
  if (!ConditionDef)
2585
0
    return true;
2586
60
2587
60
  if (SingleUseCondition) {
2588
58
    for (auto *Succ : MBB.successors())
2589
116
      if (Succ->isLiveIn(X86::EFLAGS))
2590
0
        SingleUseCondition = false;
2591
58
  }
2592
60
2593
60
  MBP.ConditionDef = ConditionDef;
2594
60
  MBP.SingleUseCondition = SingleUseCondition;
2595
60
2596
60
  // Currently we only recognize the simple pattern:
2597
60
  //
2598
60
  //   test %reg, %reg
2599
60
  //   je %label
2600
60
  //
2601
60
  const unsigned TestOpcode =
2602
60
      Subtarget.is64Bit() ? X86::TEST64rr : X86::TEST32rr;
2603
60
2604
60
  if (ConditionDef->getOpcode() == TestOpcode &&
2605
60
      ConditionDef->getNumOperands() == 3 &&
2606
60
      ConditionDef->getOperand(0).isIdenticalTo(ConditionDef->getOperand(1)) &&
2607
60
      (Cond[0].getImm() == X86::COND_NE || Cond[0].getImm() == X86::COND_E)) {
2608
60
    MBP.LHS = ConditionDef->getOperand(0);
2609
60
    MBP.RHS = MachineOperand::CreateImm(0);
2610
60
    MBP.Predicate = Cond[0].getImm() == X86::COND_NE
2611
60
                        ? MachineBranchPredicate::PRED_NE
                        : MachineBranchPredicate::PRED_EQ;
2613
60
    return false;
2614
60
  }
2615
0
2616
0
  return true;
2617
0
}
2618
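For the single recognized pattern (a TEST of a register against itself feeding a JE/JNE), the fields filled in above look roughly like this hedged sketch, assuming MBB ends in TEST64rr %reg, %reg followed by JCC_1 %bb.true, COND_NE:

  TargetInstrInfo::MachineBranchPredicate MBP;
  if (!TII->analyzeBranchPredicate(MBB, MBP, /*AllowModify=*/false)) {
    // MBP.LHS          == %reg (operand 0 of the TEST)
    // MBP.RHS          == immediate 0
    // MBP.Predicate    == MachineBranchPredicate::PRED_NE
    // MBP.TrueDest     == %bb.true
    // MBP.FalseDest    == MBB.getNextNode() when no explicit false branch exists
    // MBP.ConditionDef == the TEST64rr; MBP.SingleUseCondition as computed above
  }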
2619
unsigned X86InstrInfo::removeBranch(MachineBasicBlock &MBB,
2620
917k
                                    int *BytesRemoved) const {
2621
917k
  assert(!BytesRemoved && "code size not handled");
2622
917k
2623
917k
  MachineBasicBlock::iterator I = MBB.end();
2624
917k
  unsigned Count = 0;
2625
917k
2626
1.90M
  while (I != MBB.begin()) {
2627
1.86M
    --I;
2628
1.86M
    if (I->isDebugInstr())
2629
45
      continue;
2630
1.86M
    if (I->getOpcode() != X86::JMP_1 &&
2631
1.86M
        
X86::getCondFromBranch(*I) == X86::COND_INVALID1.62M
)
2632
878k
      break;
2633
984k
    // Remove the branch.
2634
984k
    I->eraseFromParent();
2635
984k
    I = MBB.end();
2636
984k
    ++Count;
2637
984k
  }
2638
917k
2639
917k
  return Count;
2640
917k
}
2641
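removeBranch only erases the trailing JMP_1/JCC_1 terminators and reports how many it removed; callers typically pair it with insertBranch when retargeting a block. A hedged sketch, assuming TBB, FBB and Cond came from a successful analyzeBranch and FBB is non-null:

  unsigned Removed = TII->removeBranch(MBB);             // number of branches erased
  if (!TII->reverseBranchCondition(Cond))                // flips the X86::CondCode immediate
    TII->insertBranch(MBB, FBB, TBB, Cond, DebugLoc());  // destinations swapped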
2642
unsigned X86InstrInfo::insertBranch(MachineBasicBlock &MBB,
2643
                                    MachineBasicBlock *TBB,
2644
                                    MachineBasicBlock *FBB,
2645
                                    ArrayRef<MachineOperand> Cond,
2646
                                    const DebugLoc &DL,
2647
961k
                                    int *BytesAdded) const {
2648
961k
  // Shouldn't be a fall through.
2649
961k
  assert(TBB && "insertBranch must not be told to insert a fallthrough");
2650
961k
  assert((Cond.size() == 1 || Cond.size() == 0) &&
2651
961k
         "X86 branch conditions have one component!");
2652
961k
  assert(!BytesAdded && "code size not handled");
2653
961k
2654
961k
  if (Cond.empty()) {
2655
215k
    // Unconditional branch?
2656
215k
    assert(!FBB && "Unconditional branch with multiple successors!");
2657
215k
    BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(TBB);
2658
215k
    return 1;
2659
215k
  }
2660
745k
2661
745k
  // If FBB is null, it is implied to be a fall-through block.
2662
745k
  bool FallThru = FBB == nullptr;
2663
745k
2664
745k
  // Conditional branch.
2665
745k
  unsigned Count = 0;
2666
745k
  X86::CondCode CC = (X86::CondCode)Cond[0].getImm();
2667
745k
  switch (CC) {
2668
745k
  case X86::COND_NE_OR_P:
2669
194
    // Synthesize NE_OR_P with two branches.
2670
194
    BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_NE);
2671
194
    ++Count;
2672
194
    BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_P);
2673
194
    ++Count;
2674
194
    break;
2675
745k
  case X86::COND_E_AND_NP:
2676
170
    // Use the next block of MBB as FBB if it is null.
2677
170
    if (FBB == nullptr) {
2678
168
      FBB = getFallThroughMBB(&MBB, TBB);
2679
168
      assert(FBB && "MBB cannot be the last block in function when the false "
2680
168
                    "body is a fall-through.");
2681
168
    }
2682
170
    // Synthesize COND_E_AND_NP with two branches.
2683
170
    BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(FBB).addImm(X86::COND_NE);
2684
170
    ++Count;
2685
170
    BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(X86::COND_NP);
2686
170
    ++Count;
2687
170
    break;
2688
745k
  default: {
2689
745k
    BuildMI(&MBB, DL, get(X86::JCC_1)).addMBB(TBB).addImm(CC);
2690
745k
    ++Count;
2691
745k
  }
2692
745k
  }
2693
745k
  if (!FallThru) {
2694
33.8k
    // Two-way Conditional branch. Insert the second branch.
2695
33.8k
    BuildMI(&MBB, DL, get(X86::JMP_1)).addMBB(FBB);
2696
33.8k
    ++Count;
2697
33.8k
  }
2698
745k
  return Count;
2699
745k
}
2700
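The two composite conditions above are the only cases where a single Cond entry expands into more than one branch. A hedged illustration of what the BuildMI calls emit, assuming Cond[0] is X86::COND_E_AND_NP and the false block resolves via getFallThroughMBB:

  JCC_1 %fbb, COND_NE    (leave for the false block unless ZF is set)
  JCC_1 %tbb, COND_NP    (reach TBB only when ZF is set and PF is clear)

and, for X86::COND_NE_OR_P, two conditional jumps to the same target:

  JCC_1 %tbb, COND_NE
  JCC_1 %tbb, COND_P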
2701
bool X86InstrInfo::
2702
canInsertSelect(const MachineBasicBlock &MBB,
2703
                ArrayRef<MachineOperand> Cond,
2704
                unsigned TrueReg, unsigned FalseReg,
2705
7
                int &CondCycles, int &TrueCycles, int &FalseCycles) const {
2706
7
  // Not all subtargets have cmov instructions.
2707
7
  if (!Subtarget.hasCMov())
2708
0
    return false;
2709
7
  if (Cond.size() != 1)
2710
0
    return false;
2711
7
  // We cannot do the composite conditions, at least not in SSA form.
2712
7
  if ((X86::CondCode)Cond[0].getImm() > X86::LAST_VALID_COND)
2713
0
    return false;
2714
7
2715
7
  // Check register classes.
2716
7
  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
2717
7
  const TargetRegisterClass *RC =
2718
7
    RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
2719
7
  if (!RC)
2720
0
    return false;
2721
7
2722
7
  // We have cmov instructions for 16, 32, and 64 bit general purpose registers.
2723
7
  if (X86::GR16RegClass.hasSubClassEq(RC) ||
2724
7
      X86::GR32RegClass.hasSubClassEq(RC) ||
2725
7
      
      X86::GR64RegClass.hasSubClassEq(RC)) {
2726
7
    // This latency applies to Pentium M, Merom, Wolfdale, Nehalem, and Sandy
2727
7
    // Bridge. Probably Ivy Bridge as well.
2728
7
    CondCycles = 2;
2729
7
    TrueCycles = 2;
2730
7
    FalseCycles = 2;
2731
7
    return true;
2732
7
  }
2733
0
2734
0
  // Can't do vectors.
2735
0
  return false;
2736
0
}
2737
2738
void X86InstrInfo::insertSelect(MachineBasicBlock &MBB,
2739
                                MachineBasicBlock::iterator I,
2740
                                const DebugLoc &DL, unsigned DstReg,
2741
                                ArrayRef<MachineOperand> Cond, unsigned TrueReg,
2742
4
                                unsigned FalseReg) const {
2743
4
  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
2744
4
  const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
2745
4
  const TargetRegisterClass &RC = *MRI.getRegClass(DstReg);
2746
4
  assert(Cond.size() == 1 && "Invalid Cond array");
2747
4
  unsigned Opc = X86::getCMovOpcode(TRI.getRegSizeInBits(RC) / 8,
2748
4
                                    false /*HasMemoryOperand*/);
2749
4
  BuildMI(MBB, I, DL, get(Opc), DstReg)
2750
4
      .addReg(FalseReg)
2751
4
      .addReg(TrueReg)
2752
4
      .addImm(Cond[0].getImm());
2753
4
}
2754
2755
/// Test if the given register is a physical h register.
2756
7.20k
static bool isHReg(unsigned Reg) {
2757
7.20k
  return X86::GR8_ABCD_HRegClass.contains(Reg);
2758
7.20k
}
2759
2760
// Try and copy between VR128/VR64 and GR64 registers.
2761
static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
2762
13.8k
                                        const X86Subtarget &Subtarget) {
2763
13.8k
  bool HasAVX = Subtarget.hasAVX();
2764
13.8k
  bool HasAVX512 = Subtarget.hasAVX512();
2765
13.8k
2766
13.8k
  // SrcReg(MaskReg) -> DestReg(GR64)
2767
13.8k
  // SrcReg(MaskReg) -> DestReg(GR32)
2768
13.8k
2769
13.8k
  // All KMASK RegClasses hold the same k registers, can be tested against anyone.
2770
13.8k
  if (X86::VK16RegClass.contains(SrcReg)) {
2771
4.69k
    if (X86::GR64RegClass.contains(DestReg)) {
2772
330
      assert(Subtarget.hasBWI());
2773
330
      return X86::KMOVQrk;
2774
330
    }
2775
4.36k
    if (X86::GR32RegClass.contains(DestReg))
2776
4.36k
      return Subtarget.hasBWI() ? X86::KMOVDrk : X86::KMOVWrk;
2777
9.11k
  }
2778
9.11k
2779
9.11k
  // SrcReg(GR64) -> DestReg(MaskReg)
2780
9.11k
  // SrcReg(GR32) -> DestReg(MaskReg)
2781
9.11k
2782
9.11k
  // All KMASK RegClasses hold the same k registers, can be tested against anyone.
2783
9.11k
  if (X86::VK16RegClass.contains(DestReg)) {
2784
8.26k
    if (X86::GR64RegClass.contains(SrcReg)) {
2785
170
      assert(Subtarget.hasBWI());
2786
170
      return X86::KMOVQkr;
2787
170
    }
2788
8.09k
    if (X86::GR32RegClass.contains(SrcReg))
2789
8.09k
      return Subtarget.hasBWI() ? X86::KMOVDkr : X86::KMOVWkr;
2790
852
  }
2791
852
2792
852
2793
852
  // SrcReg(VR128) -> DestReg(GR64)
2794
852
  // SrcReg(VR64)  -> DestReg(GR64)
2795
852
  // SrcReg(GR64)  -> DestReg(VR128)
2796
852
  // SrcReg(GR64)  -> DestReg(VR64)
2797
852
2798
852
  if (X86::GR64RegClass.contains(DestReg)) {
2799
6
    if (X86::VR128XRegClass.contains(SrcReg))
2800
5
      // Copy from a VR128 register to a GR64 register.
2801
5
      return HasAVX512 ? X86::VMOVPQIto64Zrr :
             HasAVX    ? X86::VMOVPQIto64rr :
                         X86::MOVPQIto64rr;
2804
1
    if (X86::VR64RegClass.contains(SrcReg))
2805
1
      // Copy from a VR64 register to a GR64 register.
2806
1
      return X86::MMX_MOVD64from64rr;
2807
846
  } else if (X86::GR64RegClass.contains(SrcReg)) {
2808
119
    // Copy from a GR64 register to a VR128 register.
2809
119
    if (X86::VR128XRegClass.contains(DestReg))
2810
118
      return HasAVX512 ? X86::VMOV64toPQIZrr :
             HasAVX    ? X86::VMOV64toPQIrr :
                         X86::MOV64toPQIrr;
2813
1
    // Copy from a GR64 register to a VR64 register.
2814
1
    if (X86::VR64RegClass.contains(DestReg))
2815
1
      return X86::MMX_MOVD64to64rr;
2816
727
  }
2817
727
2818
727
  // SrcReg(VR128) -> DestReg(GR32)
2819
727
  // SrcReg(GR32)  -> DestReg(VR128)
2820
727
2821
727
  if (X86::GR32RegClass.contains(DestReg) &&
2822
727
      
      X86::VR128XRegClass.contains(SrcReg))
    // Copy from a VR128 register to a GR32 register.
    return HasAVX512 ? X86::VMOVPDI2DIZrr :
           HasAVX    ? X86::VMOVPDI2DIrr :
                       X86::MOVPDI2DIrr;
2827
717
2828
717
  if (X86::VR128XRegClass.contains(DestReg) &&
2829
717
      X86::GR32RegClass.contains(SrcReg))
2830
717
    // Copy from a GR32 register to a VR128 register.
    return HasAVX512 ? X86::VMOVDI2PDIZrr :
           HasAVX    ? X86::VMOVDI2PDIrr :
                       X86::MOVDI2PDIrr;
2834
0
  return 0;
2835
0
}
2836
2837
void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
2838
                               MachineBasicBlock::iterator MI,
2839
                               const DebugLoc &DL, unsigned DestReg,
2840
254k
                               unsigned SrcReg, bool KillSrc) const {
2841
254k
  // First deal with the normal symmetric copies.
2842
254k
  bool HasAVX = Subtarget.hasAVX();
2843
254k
  bool HasVLX = Subtarget.hasVLX();
2844
254k
  unsigned Opc = 0;
2845
254k
  if (X86::GR64RegClass.contains(DestReg, SrcReg))
2846
177k
    Opc = X86::MOV64rr;
2847
76.3k
  else if (X86::GR32RegClass.contains(DestReg, SrcReg))
2848
36.4k
    Opc = X86::MOV32rr;
2849
39.9k
  else if (X86::GR16RegClass.contains(DestReg, SrcReg))
2850
402
    Opc = X86::MOV16rr;
2851
39.5k
  else if (X86::GR8RegClass.contains(DestReg, SrcReg)) {
2852
3.24k
    // Copying to or from a physical H register on x86-64 requires a NOREX
2853
3.24k
    // move.  Otherwise use a normal move.
2854
3.24k
    if ((isHReg(DestReg) || isHReg(SrcReg)) &&
        Subtarget.is64Bit()) {
2856
0
      Opc = X86::MOV8rr_NOREX;
2857
0
      // Both operands must be encodable without an REX prefix.
2858
0
      assert(X86::GR8_NOREXRegClass.contains(SrcReg, DestReg) &&
2859
0
             "8-bit H register can not be copied outside GR8_NOREX");
2860
0
    } else
2861
3.24k
      Opc = X86::MOV8rr;
2862
3.24k
  }
2863
36.2k
  else if (X86::VR64RegClass.contains(DestReg, SrcReg))
2864
12
    Opc = X86::MMX_MOVQ64rr;
2865
36.2k
  else if (X86::VR128XRegClass.contains(DestReg, SrcReg)) {
2866
19.3k
    if (HasVLX)
2867
957
      Opc = X86::VMOVAPSZ128rr;
2868
18.4k
    else if (X86::VR128RegClass.contains(DestReg, SrcReg))
2869
18.4k
      Opc = HasAVX ? X86::VMOVAPSrr : X86::MOVAPSrr;
2870
8
    else {
2871
8
      // If this an extended register and we don't have VLX we need to use a
2872
8
      // 512-bit move.
2873
8
      Opc = X86::VMOVAPSZrr;
2874
8
      const TargetRegisterInfo *TRI = &getRegisterInfo();
2875
8
      DestReg = TRI->getMatchingSuperReg(DestReg, X86::sub_xmm,
2876
8
                                         &X86::VR512RegClass);
2877
8
      SrcReg = TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm,
2878
8
                                        &X86::VR512RegClass);
2879
8
    }
2880
19.3k
  } else if (X86::VR256XRegClass.contains(DestReg, SrcReg)) {
2881
1.44k
    if (HasVLX)
2882
1.04k
      Opc = X86::VMOVAPSZ256rr;
2883
404
    else if (X86::VR256RegClass.contains(DestReg, SrcReg))
2884
403
      Opc = X86::VMOVAPSYrr;
2885
1
    else {
2886
1
      // If this an extended register and we don't have VLX we need to use a
2887
1
      // 512-bit move.
2888
1
      Opc = X86::VMOVAPSZrr;
2889
1
      const TargetRegisterInfo *TRI = &getRegisterInfo();
2890
1
      DestReg = TRI->getMatchingSuperReg(DestReg, X86::sub_ymm,
2891
1
                                         &X86::VR512RegClass);
2892
1
      SrcReg = TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm,
2893
1
                                        &X86::VR512RegClass);
2894
1
    }
2895
15.4k
  } else if (X86::VR512RegClass.contains(DestReg, SrcReg))
2896
1.51k
    Opc = X86::VMOVAPSZrr;
2897
13.9k
  // All KMASK RegClasses hold the same k registers, can be tested against anyone.
2898
13.9k
  else if (X86::VK16RegClass.contains(DestReg, SrcReg))
2899
101
    Opc = Subtarget.hasBWI() ? X86::KMOVQkk : X86::KMOVWkk;
2900
254k
  if (!Opc)
2901
13.8k
    Opc = CopyToFromAsymmetricReg(DestReg, SrcReg, Subtarget);
2902
254k
2903
254k
  if (Opc) {
2904
254k
    BuildMI(MBB, MI, DL, get(Opc), DestReg)
2905
254k
      .addReg(SrcReg, getKillRegState(KillSrc));
2906
254k
    return;
2907
254k
  }
2908
0
2909
0
  if (SrcReg == X86::EFLAGS || DestReg == X86::EFLAGS) {
2910
0
    // FIXME: We use a fatal error here because historically LLVM has tried
2911
0
    // lower some of these physreg copies and we want to ensure we get
2912
0
    // reasonable bug reports if someone encounters a case no other testing
2913
0
    // found. This path should be removed after the LLVM 7 release.
2914
0
    report_fatal_error("Unable to copy EFLAGS physical register!");
2915
0
  }
2916
0
2917
0
  LLVM_DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg) << " to "
2918
0
                    << RI.getName(DestReg) << '\n');
2919
0
  report_fatal_error("Cannot emit physreg copy instruction");
2920
0
}
2921
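A hedged usage sketch: for two general-purpose physical registers the dispatch above resolves to a plain register move, here MOV32rr (MBB, MI and the DebugLoc are assumed to be in scope):

  TII->copyPhysReg(MBB, MI, DebugLoc(), X86::EAX, X86::ECX, /*KillSrc=*/true);
  // emits roughly: $eax = MOV32rr killed $ecx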
2922
bool X86InstrInfo::isCopyInstrImpl(const MachineInstr &MI,
2923
                                   const MachineOperand *&Src,
2924
2.25k
                                   const MachineOperand *&Dest) const {
2925
2.25k
  if (MI.isMoveReg()) {
2926
153
    Dest = &MI.getOperand(0);
2927
153
    Src = &MI.getOperand(1);
2928
153
    return true;
2929
153
  }
2930
2.09k
  return false;
2931
2.09k
}
2932
2933
static unsigned getLoadStoreRegOpcode(unsigned Reg,
2934
                                      const TargetRegisterClass *RC,
2935
                                      bool isStackAligned,
2936
                                      const X86Subtarget &STI,
2937
64.6k
                                      bool load) {
2938
64.6k
  bool HasAVX = STI.hasAVX();
2939
64.6k
  bool HasAVX512 = STI.hasAVX512();
2940
64.6k
  bool HasVLX = STI.hasVLX();
2941
64.6k
2942
64.6k
  switch (STI.getRegisterInfo()->getSpillSize(*RC)) {
2943
64.6k
  default:
2944
0
    llvm_unreachable("Unknown spill size");
2945
64.6k
  case 1:
2946
1.49k
    assert(X86::GR8RegClass.hasSubClassEq(RC) && "Unknown 1-byte regclass");
2947
1.49k
    if (STI.is64Bit())
2948
768
      // Copying to or from a physical H register on x86-64 requires a NOREX
2949
768
      // move.  Otherwise use a normal move.
2950
768
      if (isHReg(Reg) || X86::GR8_ABCD_HRegClass.hasSubClassEq(RC))
2951
0
        return load ? X86::MOV8rm_NOREX : X86::MOV8mr_NOREX;
2952
1.49k
    return load ? X86::MOV8rm : X86::MOV8mr;
2953
1.49k
  case 2:
2954
313
    if (X86::VK16RegClass.hasSubClassEq(RC))
2955
78
      return load ? X86::KMOVWkm : X86::KMOVWmk;
2956
235
    assert(X86::GR16RegClass.hasSubClassEq(RC) && "Unknown 2-byte regclass");
2957
235
    return load ? X86::MOV16rm : X86::MOV16mr;
2958
22.0k
  case 4:
2959
22.0k
    if (X86::GR32RegClass.hasSubClassEq(RC))
2960
21.0k
      return load ? X86::MOV32rm : X86::MOV32mr;
2961
1.06k
    if (X86::FR32XRegClass.hasSubClassEq(RC))
2962
516
      return load ?
        (HasAVX512 ? X86::VMOVSSZrm_alt :
         HasAVX    ? X86::VMOVSSrm_alt :
                     X86::MOVSSrm_alt) :
        (HasAVX512 ? X86::VMOVSSZmr :
         HasAVX    ? X86::VMOVSSmr :
                     X86::MOVSSmr);
2969
551
    if (X86::RFP32RegClass.hasSubClassEq(RC))
2970
529
      return load ? X86::LD_Fp32m : X86::ST_Fp32m;
2971
22
    if (X86::VK32RegClass.hasSubClassEq(RC)) {
2972
2
      assert(STI.hasBWI() && "KMOVD requires BWI");
2973
2
      return load ? X86::KMOVDkm : X86::KMOVDmk;
2974
2
    }
2975
20
    // All of these mask pair classes have the same spill size, the same kind
2976
20
    // of kmov instructions can be used with all of them.
2977
20
    if (X86::VK1PAIRRegClass.hasSubClassEq(RC) ||
2978
20
        
        X86::VK2PAIRRegClass.hasSubClassEq(RC) ||
        X86::VK4PAIRRegClass.hasSubClassEq(RC) ||
        X86::VK8PAIRRegClass.hasSubClassEq(RC) ||
        X86::VK16PAIRRegClass.hasSubClassEq(RC))
      return load ? X86::MASKPAIR16LOAD : X86::MASKPAIR16STORE;
2983
0
    llvm_unreachable("Unknown 4-byte regclass");
2984
30.7k
  case 8:
2985
30.7k
    if (X86::GR64RegClass.hasSubClassEq(RC))
2986
29.6k
      return load ? X86::MOV64rm : X86::MOV64mr;
2987
1.19k
    if (X86::FR64XRegClass.hasSubClassEq(RC))
2988
741
      return load ?
        (HasAVX512 ? X86::VMOVSDZrm_alt :
         HasAVX    ? X86::VMOVSDrm_alt :
                     X86::MOVSDrm_alt) :
        (HasAVX512 ? X86::VMOVSDZmr :
         HasAVX    ? X86::VMOVSDmr :
                     X86::MOVSDmr);
2995
452
    if (X86::VR64RegClass.hasSubClassEq(RC))
2996
102
      return load ? X86::MMX_MOVQ64rm : X86::MMX_MOVQ64mr;
2997
350
    if (X86::RFP64RegClass.hasSubClassEq(RC))
2998
300
      return load ? X86::LD_Fp64m : X86::ST_Fp64m;
2999
50
    if (X86::VK64RegClass.hasSubClassEq(RC)) {
3000
50
      assert(STI.hasBWI() && "KMOVQ requires BWI");
3001
50
      return load ? X86::KMOVQkm : X86::KMOVQmk;
3002
50
    }
3003
0
    llvm_unreachable("Unknown 8-byte regclass");
3004
1.62k
  case 10:
3005
1.62k
    assert(X86::RFP80RegClass.hasSubClassEq(RC) && "Unknown 10-byte regclass");
3006
1.62k
    return load ? X86::LD_Fp80m : X86::ST_FpP80m;
3007
6.35k
  case 16: {
3008
6.35k
    if (X86::VR128XRegClass.hasSubClassEq(RC)) {
3009
6.35k
      // If stack is realigned we can use aligned stores.
3010
6.35k
      if (isStackAligned)
3011
6.24k
        return load ?
          (HasVLX    ? X86::VMOVAPSZ128rm :
           HasAVX512 ? X86::VMOVAPSZ128rm_NOVLX :
           HasAVX    ? X86::VMOVAPSrm :
                       X86::MOVAPSrm):
          (HasVLX    ? X86::VMOVAPSZ128mr :
           HasAVX512 ? X86::VMOVAPSZ128mr_NOVLX :
           HasAVX    ? X86::VMOVAPSmr :
                       X86::MOVAPSmr);
3020
116
      else
3021
116
        return load ?
          (HasVLX    ? X86::VMOVUPSZ128rm :
           HasAVX512 ? X86::VMOVUPSZ128rm_NOVLX :
           HasAVX    ? X86::VMOVUPSrm :
                       X86::MOVUPSrm):
          (HasVLX    ? X86::VMOVUPSZ128mr :
           HasAVX512 ? X86::VMOVUPSZ128mr_NOVLX :
           HasAVX    ? X86::VMOVUPSmr :
                       X86::MOVUPSmr);
3030
0
    }
3031
0
    if (X86::BNDRRegClass.hasSubClassEq(RC)) {
3032
0
      if (STI.is64Bit())
3033
0
        return load ? X86::BNDMOV64rm : X86::BNDMOV64mr;
3034
0
      else
3035
0
        return load ? X86::BNDMOV32rm : X86::BNDMOV32mr;
3036
0
    }
3037
0
    llvm_unreachable("Unknown 16-byte regclass");
3038
0
  }
3039
1.06k
  case 32:
3040
1.06k
    assert(X86::VR256XRegClass.hasSubClassEq(RC) && "Unknown 32-byte regclass");
3041
1.06k
    // If stack is realigned we can use aligned stores.
3042
1.06k
    if (isStackAligned)
3043
275
      return load ?
        (HasVLX    ? X86::VMOVAPSZ256rm :
         HasAVX512 ? X86::VMOVAPSZ256rm_NOVLX :
                     X86::VMOVAPSYrm) :
        (HasVLX    ? X86::VMOVAPSZ256mr :
         HasAVX512 ? X86::VMOVAPSZ256mr_NOVLX :
                     X86::VMOVAPSYmr);
3050
787
    else
3051
787
      return load ?
        (HasVLX    ? X86::VMOVUPSZ256rm :
         HasAVX512 ? X86::VMOVUPSZ256rm_NOVLX :
                     X86::VMOVUPSYrm) :
        (HasVLX    ? X86::VMOVUPSZ256mr :
         HasAVX512 ? X86::VMOVUPSZ256mr_NOVLX :
                     X86::VMOVUPSYmr);
3058
908
  case 64:
3059
908
    assert(X86::VR512RegClass.hasSubClassEq(RC) && "Unknown 64-byte regclass");
3060
908
    assert(STI.hasAVX512() && "Using 512-bit register requires AVX512");
3061
908
    if (isStackAligned)
3062
116
      return load ? X86::VMOVAPSZrm : X86::VMOVAPSZmr;
3063
792
    else
3064
792
      return load ? X86::VMOVUPSZrm : X86::VMOVUPSZmr;
3065
64.6k
  }
3066
64.6k
}
3067
3068
bool X86InstrInfo::getMemOperandWithOffset(
3069
    const MachineInstr &MemOp, const MachineOperand *&BaseOp, int64_t &Offset,
3070
105
    const TargetRegisterInfo *TRI) const {
3071
105
  const MCInstrDesc &Desc = MemOp.getDesc();
3072
105
  int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
3073
105
  if (MemRefBegin < 0)
3074
39
    return false;
3075
66
3076
66
  MemRefBegin += X86II::getOperandBias(Desc);
3077
66
3078
66
  BaseOp = &MemOp.getOperand(MemRefBegin + X86::AddrBaseReg);
3079
66
  if (!BaseOp->isReg()) // Can be an MO_FrameIndex
3080
1
    return false;
3081
65
3082
65
  if (MemOp.getOperand(MemRefBegin + X86::AddrScaleAmt).getImm() != 1)
3083
0
    return false;
3084
65
3085
65
  if (MemOp.getOperand(MemRefBegin + X86::AddrIndexReg).getReg() !=
3086
65
      X86::NoRegister)
3087
0
    return false;
3088
65
3089
65
  const MachineOperand &DispMO = MemOp.getOperand(MemRefBegin + X86::AddrDisp);
3090
65
3091
65
  // Displacement can be symbolic
3092
65
  if (!DispMO.isImm())
3093
0
    return false;
3094
65
3095
65
  Offset = DispMO.getImm();
3096
65
3097
65
  assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
3098
65
                            "operands of type register.");
3099
65
  return true;
3100
65
}
3101
3102
static unsigned getStoreRegOpcode(unsigned SrcReg,
3103
                                  const TargetRegisterClass *RC,
3104
                                  bool isStackAligned,
3105
28.2k
                                  const X86Subtarget &STI) {
3106
28.2k
  return getLoadStoreRegOpcode(SrcReg, RC, isStackAligned, STI, false);
3107
28.2k
}
3108
3109
3110
static unsigned getLoadRegOpcode(unsigned DestReg,
3111
                                 const TargetRegisterClass *RC,
3112
                                 bool isStackAligned,
3113
36.4k
                                 const X86Subtarget &STI) {
3114
36.4k
  return getLoadStoreRegOpcode(DestReg, RC, isStackAligned, STI, true);
3115
36.4k
}
3116
3117
void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
3118
                                       MachineBasicBlock::iterator MI,
3119
                                       unsigned SrcReg, bool isKill, int FrameIdx,
3120
                                       const TargetRegisterClass *RC,
3121
28.2k
                                       const TargetRegisterInfo *TRI) const {
3122
28.2k
  const MachineFunction &MF = *MBB.getParent();
3123
28.2k
  assert(MF.getFrameInfo().getObjectSize(FrameIdx) >= TRI->getSpillSize(*RC) &&
3124
28.2k
         "Stack slot too small for store");
3125
28.2k
  unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
3126
28.2k
  bool isAligned =
3127
28.2k
      (Subtarget.getFrameLowering()->getStackAlignment() >= Alignment) ||
3128
28.2k
      
      RI.canRealignStack(MF);
3129
28.2k
  unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget);
3130
28.2k
  addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
3131
28.2k
    .addReg(SrcReg, getKillRegState(isKill));
3132
28.2k
}
3133
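storeRegToStackSlot and loadRegFromStackSlot (below) are the spill and reload hooks the register allocator calls; a hedged sketch of the pairing, assuming FrameIdx names a stack slot large enough for a GR64 value:

  TII->storeRegToStackSlot(MBB, MI, X86::RAX, /*isKill=*/true, FrameIdx,
                           &X86::GR64RegClass, TRI);   // stores via MOV64mr
  TII->loadRegFromStackSlot(MBB, MI, X86::RAX, FrameIdx,
                            &X86::GR64RegClass, TRI);  // reloads via MOV64rm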
3134
void X86InstrInfo::storeRegToAddr(
3135
    MachineFunction &MF, unsigned SrcReg, bool isKill,
3136
    SmallVectorImpl<MachineOperand> &Addr, const TargetRegisterClass *RC,
3137
    ArrayRef<MachineMemOperand *> MMOs,
3138
0
    SmallVectorImpl<MachineInstr *> &NewMIs) const {
3139
0
  const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
3140
0
  unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
3141
0
  bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment;
3142
0
  unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget);
3143
0
  DebugLoc DL;
3144
0
  MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc));
3145
0
  for (unsigned i = 0, e = Addr.size(); i != e; ++i)
3146
0
    MIB.add(Addr[i]);
3147
0
  MIB.addReg(SrcReg, getKillRegState(isKill));
3148
0
  MIB.setMemRefs(MMOs);
3149
0
  NewMIs.push_back(MIB);
3150
0
}
3151
3152
3153
void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
3154
                                        MachineBasicBlock::iterator MI,
3155
                                        unsigned DestReg, int FrameIdx,
3156
                                        const TargetRegisterClass *RC,
3157
31.8k
                                        const TargetRegisterInfo *TRI) const {
3158
31.8k
  const MachineFunction &MF = *MBB.getParent();
3159
31.8k
  unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
3160
31.8k
  bool isAligned =
3161
31.8k
      (Subtarget.getFrameLowering()->getStackAlignment() >= Alignment) ||
3162
31.8k
      
      RI.canRealignStack(MF);
3163
31.8k
  unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget);
3164
31.8k
  addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg), FrameIdx);
3165
31.8k
}
3166
3167
void X86InstrInfo::loadRegFromAddr(
3168
    MachineFunction &MF, unsigned DestReg,
3169
    SmallVectorImpl<MachineOperand> &Addr, const TargetRegisterClass *RC,
3170
    ArrayRef<MachineMemOperand *> MMOs,
3171
3.92k
    SmallVectorImpl<MachineInstr *> &NewMIs) const {
3172
3.92k
  const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
3173
3.92k
  unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
3174
3.92k
  bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment;
3175
3.92k
  unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget);
3176
3.92k
  DebugLoc DL;
3177
3.92k
  MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc), DestReg);
3178
23.5k
  for (unsigned i = 0, e = Addr.size(); i != e; ++i)
3179
19.6k
    MIB.add(Addr[i]);
3180
3.92k
  MIB.setMemRefs(MMOs);
3181
3.92k
  NewMIs.push_back(MIB);
3182
3.92k
}
3183
3184
bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
3185
                                  unsigned &SrcReg2, int &CmpMask,
3186
163k
                                  int &CmpValue) const {
3187
163k
  switch (MI.getOpcode()) {
3188
163k
  
  default: break;
3189
163k
  case X86::CMP64ri32:
3190
144
  case X86::CMP64ri8:
3191
144
  case X86::CMP32ri:
3192
144
  case X86::CMP32ri8:
3193
144
  case X86::CMP16ri:
3194
144
  case X86::CMP16ri8:
3195
144
  case X86::CMP8ri:
3196
144
    SrcReg = MI.getOperand(0).getReg();
3197
144
    SrcReg2 = 0;
3198
144
    if (MI.getOperand(1).isImm()) {
3199
144
      CmpMask = ~0;
3200
144
      CmpValue = MI.getOperand(1).getImm();
3201
144
    } else {
3202
0
      CmpMask = CmpValue = 0;
3203
0
    }
3204
144
    return true;
3205
144
  // A SUB can be used to perform comparison.
3206
3.59k
  case X86::SUB64rm:
3207
3.59k
  case X86::SUB32rm:
3208
3.59k
  case X86::SUB16rm:
3209
3.59k
  case X86::SUB8rm:
3210
3.59k
    SrcReg = MI.getOperand(1).getReg();
3211
3.59k
    SrcReg2 = 0;
3212
3.59k
    CmpMask = 0;
3213
3.59k
    CmpValue = 0;
3214
3.59k
    return true;
3215
45.3k
  case X86::SUB64rr:
3216
45.3k
  case X86::SUB32rr:
3217
45.3k
  case X86::SUB16rr:
3218
45.3k
  case X86::SUB8rr:
3219
45.3k
    SrcReg = MI.getOperand(1).getReg();
3220
45.3k
    SrcReg2 = MI.getOperand(2).getReg();
3221
45.3k
    CmpMask = 0;
3222
45.3k
    CmpValue = 0;
3223
45.3k
    return true;
3224
45.3k
  case X86::SUB64ri32:
3225
21.5k
  case X86::SUB64ri8:
3226
21.5k
  case X86::SUB32ri:
3227
21.5k
  case X86::SUB32ri8:
3228
21.5k
  case X86::SUB16ri:
3229
21.5k
  case X86::SUB16ri8:
3230
21.5k
  case X86::SUB8ri:
3231
21.5k
    SrcReg = MI.getOperand(1).getReg();
3232
21.5k
    SrcReg2 = 0;
3233
21.5k
    if (MI.getOperand(2).isImm()) {
3234
21.5k
      CmpMask = ~0;
3235
21.5k
      CmpValue = MI.getOperand(2).getImm();
3236
21.5k
    } else {
3237
8
      CmpMask = CmpValue = 0;
3238
8
    }
3239
21.5k
    return true;
3240
21.5k
  case X86::CMP64rr:
3241
136
  case X86::CMP32rr:
3242
136
  case X86::CMP16rr:
3243
136
  case X86::CMP8rr:
3244
136
    SrcReg = MI.getOperand(0).getReg();
3245
136
    SrcReg2 = MI.getOperand(1).getReg();
3246
136
    CmpMask = 0;
3247
136
    CmpValue = 0;
3248
136
    return true;
3249
54.7k
  case X86::TEST8rr:
3250
54.7k
  case X86::TEST16rr:
3251
54.7k
  case X86::TEST32rr:
3252
54.7k
  case X86::TEST64rr:
3253
54.7k
    SrcReg = MI.getOperand(0).getReg();
3254
54.7k
    if (MI.getOperand(1).getReg() != SrcReg)
3255
3.77k
      return false;
3256
51.0k
    // Compare against zero.
3257
51.0k
    SrcReg2 = 0;
3258
51.0k
    CmpMask = ~0;
3259
51.0k
    CmpValue = 0;
3260
51.0k
    return true;
3261
37.7k
  }
3262
37.7k
  return false;
3263
37.7k
}
3264
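A hedged illustration of what the hook above reports for two common shapes (SrcReg, SrcReg2, CmpMask and CmpValue as declared in the signature):

  // CMP32ri %reg, 42    ->  SrcReg = %reg, SrcReg2 = 0,  CmpMask = ~0, CmpValue = 42
  // SUB64rr %d, %a, %b  ->  SrcReg = %a,   SrcReg2 = %b, CmpMask = 0,  CmpValue = 0
  unsigned SrcReg, SrcReg2;
  int CmpMask, CmpValue;
  bool Recognized = TII->analyzeCompare(MI, SrcReg, SrcReg2, CmpMask, CmpValue);

optimizeCompareInstr below keys on exactly these values when deciding whether an earlier flag-setting instruction can stand in for the compare.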
3265
/// Check whether the first instruction, whose only
3266
/// purpose is to update flags, can be made redundant.
3267
/// CMPrr can be made redundant by SUBrr if the operands are the same.
3268
/// This function can be extended later on.
3269
/// SrcReg, SrcRegs: register operands for FlagI.
3270
/// ImmValue: immediate for FlagI if it takes an immediate.
3271
inline static bool isRedundantFlagInstr(const MachineInstr &FlagI,
3272
                                        unsigned SrcReg, unsigned SrcReg2,
3273
                                        int ImmMask, int ImmValue,
3274
29.4k
                                        const MachineInstr &OI) {
3275
29.4k
  if (((FlagI.getOpcode() == X86::CMP64rr && OI.getOpcode() == X86::SUB64rr) ||
       (FlagI.getOpcode() == X86::CMP32rr && OI.getOpcode() == X86::SUB32rr) ||
       (FlagI.getOpcode() == X86::CMP16rr && OI.getOpcode() == X86::SUB16rr) ||
       (FlagI.getOpcode() == X86::CMP8rr && OI.getOpcode() == X86::SUB8rr)) &&
      ((OI.getOperand(1).getReg() == SrcReg &&
        OI.getOperand(2).getReg() == SrcReg2) ||
       (OI.getOperand(1).getReg() == SrcReg2 &&
        OI.getOperand(2).getReg() == SrcReg)))
    return true;

  if (ImmMask != 0 &&
      ((FlagI.getOpcode() == X86::CMP64ri32 &&
        OI.getOpcode() == X86::SUB64ri32) ||
       (FlagI.getOpcode() == X86::CMP64ri8 &&
        OI.getOpcode() == X86::SUB64ri8) ||
       (FlagI.getOpcode() == X86::CMP32ri && OI.getOpcode() == X86::SUB32ri) ||
       (FlagI.getOpcode() == X86::CMP32ri8 &&
        OI.getOpcode() == X86::SUB32ri8) ||
       (FlagI.getOpcode() == X86::CMP16ri && OI.getOpcode() == X86::SUB16ri) ||
       (FlagI.getOpcode() == X86::CMP16ri8 &&
        OI.getOpcode() == X86::SUB16ri8) ||
       (FlagI.getOpcode() == X86::CMP8ri && OI.getOpcode() == X86::SUB8ri)) &&
      OI.getOperand(1).getReg() == SrcReg &&
      OI.getOperand(2).getImm() == ImmValue)
3299
4
    return true;
3300
29.4k
  return false;
3301
29.4k
}
3302
3303
/// Check whether the definition can be converted
3304
/// to remove a comparison against zero.
3305
39.4k
inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag) {
3306
39.4k
  NoSignFlag = false;
3307
39.4k
3308
39.4k
  switch (MI.getOpcode()) {
3309
39.4k
  
  default: return false;
3310
39.4k
3311
39.4k
  // The shift instructions only modify ZF if their shift count is non-zero.
3312
39.4k
  // N.B.: The processor truncates the shift count depending on the encoding.
3313
39.4k
  
  case X86::SAR8ri:    case X86::SAR16ri:  case X86::SAR32ri:case X86::SAR64ri:
3314
468
  case X86::SHR8ri:    case X86::SHR16ri:  case X86::SHR32ri:case X86::SHR64ri:
3315
468
     return getTruncatedShiftCount(MI, 2) != 0;
3316
468
3317
468
  // Some left shift instructions can be turned into LEA instructions but only
3318
468
  // if their flags aren't used. Avoid transforming such instructions.
3319
468
  
  case X86::SHL8ri:    case X86::SHL16ri:  case X86::SHL32ri:case X86::SHL64ri:{
3320
7
    unsigned ShAmt = getTruncatedShiftCount(MI, 2);
3321
7
    if (isTruncatedShiftCountForLEA(ShAmt)) return false;
3322
4
    return ShAmt != 0;
3323
4
  }
3324
4
3325
4
  
  case X86::SHRD16rri8:case X86::SHRD32rri8:case X86::SHRD64rri8:
3326
0
  case X86::SHLD16rri8:case X86::SHLD32rri8:case X86::SHLD64rri8:
3327
0
     return getTruncatedShiftCount(MI, 3) != 0;
3328
0
3329
1.66k
  case X86::SUB64ri32: case X86::SUB64ri8: case X86::SUB32ri:
3330
1.66k
  case X86::SUB32ri8:  case X86::SUB16ri:  case X86::SUB16ri8:
3331
1.66k
  case X86::SUB8ri:    case X86::SUB64rr:  case X86::SUB32rr:
3332
1.66k
  case X86::SUB16rr:   case X86::SUB8rr:   case X86::SUB64rm:
3333
1.66k
  case X86::SUB32rm:   case X86::SUB16rm:  case X86::SUB8rm:
3334
1.66k
  case X86::DEC64r:    case X86::DEC32r:   case X86::DEC16r: case X86::DEC8r:
3335
1.66k
  case X86::ADD64ri32: case X86::ADD64ri8: case X86::ADD32ri:
3336
1.66k
  case X86::ADD32ri8:  case X86::ADD16ri:  case X86::ADD16ri8:
3337
1.66k
  case X86::ADD8ri:    case X86::ADD64rr:  case X86::ADD32rr:
3338
1.66k
  case X86::ADD16rr:   case X86::ADD8rr:   case X86::ADD64rm:
3339
1.66k
  case X86::ADD32rm:   case X86::ADD16rm:  case X86::ADD8rm:
3340
1.66k
  case X86::INC64r:    case X86::INC32r:   case X86::INC16r: case X86::INC8r:
3341
1.66k
  case X86::AND64ri32: case X86::AND64ri8: case X86::AND32ri:
3342
1.66k
  case X86::AND32ri8:  case X86::AND16ri:  case X86::AND16ri8:
3343
1.66k
  case X86::AND8ri:    case X86::AND64rr:  case X86::AND32rr:
3344
1.66k
  case X86::AND16rr:   case X86::AND8rr:   case X86::AND64rm:
3345
1.66k
  case X86::AND32rm:   case X86::AND16rm:  case X86::AND8rm:
3346
1.66k
  case X86::XOR64ri32: case X86::XOR64ri8: case X86::XOR32ri:
3347
1.66k
  case X86::XOR32ri8:  case X86::XOR16ri:  case X86::XOR16ri8:
3348
1.66k
  case X86::XOR8ri:    case X86::XOR64rr:  case X86::XOR32rr:
3349
1.66k
  case X86::XOR16rr:   case X86::XOR8rr:   case X86::XOR64rm:
3350
1.66k
  case X86::XOR32rm:   case X86::XOR16rm:  case X86::XOR8rm:
3351
1.66k
  case X86::OR64ri32:  case X86::OR64ri8:  case X86::OR32ri:
3352
1.66k
  case X86::OR32ri8:   case X86::OR16ri:   case X86::OR16ri8:
3353
1.66k
  case X86::OR8ri:     case X86::OR64rr:   case X86::OR32rr:
3354
1.66k
  case X86::OR16rr:    case X86::OR8rr:    case X86::OR64rm:
3355
1.66k
  case X86::OR32rm:    case X86::OR16rm:   case X86::OR8rm:
3356
1.66k
  case X86::ADC64ri32: case X86::ADC64ri8: case X86::ADC32ri:
3357
1.66k
  case X86::ADC32ri8:  case X86::ADC16ri:  case X86::ADC16ri8:
3358
1.66k
  case X86::ADC8ri:    case X86::ADC64rr:  case X86::ADC32rr:
3359
1.66k
  case X86::ADC16rr:   case X86::ADC8rr:   case X86::ADC64rm:
3360
1.66k
  case X86::ADC32rm:   case X86::ADC16rm:  case X86::ADC8rm:
3361
1.66k
  case X86::SBB64ri32: case X86::SBB64ri8: case X86::SBB32ri:
3362
1.66k
  case X86::SBB32ri8:  case X86::SBB16ri:  case X86::SBB16ri8:
3363
1.66k
  case X86::SBB8ri:    case X86::SBB64rr:  case X86::SBB32rr:
3364
1.66k
  case X86::SBB16rr:   case X86::SBB8rr:   case X86::SBB64rm:
3365
1.66k
  case X86::SBB32rm:   case X86::SBB16rm:  case X86::SBB8rm:
3366
1.66k
  case X86::NEG8r:     case X86::NEG16r:   case X86::NEG32r: case X86::NEG64r:
3367
1.66k
  case X86::SAR8r1:    case X86::SAR16r1:  case X86::SAR32r1:case X86::SAR64r1:
3368
1.66k
  case X86::SHR8r1:    case X86::SHR16r1:  case X86::SHR32r1:case X86::SHR64r1:
3369
1.66k
  case X86::SHL8r1:    case X86::SHL16r1:  case X86::SHL32r1:case X86::SHL64r1:
3370
1.66k
  case X86::ANDN32rr:  case X86::ANDN32rm:
3371
1.66k
  case X86::ANDN64rr:  case X86::ANDN64rm:
3372
1.66k
  case X86::BLSI32rr:  case X86::BLSI32rm:
3373
1.66k
  case X86::BLSI64rr:  case X86::BLSI64rm:
3374
1.66k
  case X86::BLSMSK32rr:case X86::BLSMSK32rm:
3375
1.66k
  case X86::BLSMSK64rr:case X86::BLSMSK64rm:
3376
1.66k
  case X86::BLSR32rr:  case X86::BLSR32rm:
3377
1.66k
  case X86::BLSR64rr:  case X86::BLSR64rm:
3378
1.66k
  case X86::BZHI32rr:  case X86::BZHI32rm:
3379
1.66k
  case X86::BZHI64rr:  case X86::BZHI64rm:
3380
1.66k
  case X86::LZCNT16rr: case X86::LZCNT16rm:
3381
1.66k
  case X86::LZCNT32rr: case X86::LZCNT32rm:
3382
1.66k
  case X86::LZCNT64rr: case X86::LZCNT64rm:
3383
1.66k
  case X86::POPCNT16rr:case X86::POPCNT16rm:
3384
1.66k
  case X86::POPCNT32rr:case X86::POPCNT32rm:
3385
1.66k
  case X86::POPCNT64rr:case X86::POPCNT64rm:
3386
1.66k
  case X86::TZCNT16rr: case X86::TZCNT16rm:
3387
1.66k
  case X86::TZCNT32rr: case X86::TZCNT32rm:
3388
1.66k
  case X86::TZCNT64rr: case X86::TZCNT64rm:
3389
1.66k
  case X86::BLCFILL32rr: case X86::BLCFILL32rm:
3390
1.66k
  case X86::BLCFILL64rr: case X86::BLCFILL64rm:
3391
1.66k
  case X86::BLCI32rr:    case X86::BLCI32rm:
3392
1.66k
  case X86::BLCI64rr:    case X86::BLCI64rm:
3393
1.66k
  case X86::BLCIC32rr:   case X86::BLCIC32rm:
3394
1.66k
  case X86::BLCIC64rr:   case X86::BLCIC64rm:
3395
1.66k
  case X86::BLCMSK32rr:  case X86::BLCMSK32rm:
3396
1.66k
  case X86::BLCMSK64rr:  case X86::BLCMSK64rm:
3397
1.66k
  case X86::BLCS32rr:    case X86::BLCS32rm:
3398
1.66k
  case X86::BLCS64rr:    case X86::BLCS64rm:
3399
1.66k
  case X86::BLSFILL32rr: case X86::BLSFILL32rm:
3400
1.66k
  case X86::BLSFILL64rr: case X86::BLSFILL64rm:
3401
1.66k
  case X86::BLSIC32rr:   case X86::BLSIC32rm:
3402
1.66k
  case X86::BLSIC64rr:   case X86::BLSIC64rm:
3403
1.66k
  case X86::T1MSKC32rr:  case X86::T1MSKC32rm:
3404
1.66k
  case X86::T1MSKC64rr:  case X86::T1MSKC64rm:
3405
1.66k
  case X86::TZMSK32rr:   case X86::TZMSK32rm:
3406
1.66k
  case X86::TZMSK64rr:   case X86::TZMSK64rm:
3407
1.66k
    return true;
3408
1.66k
  
  case X86::BEXTR32rr:   case X86::BEXTR64rr:
3409
14
  case X86::BEXTR32rm:   case X86::BEXTR64rm:
3410
14
  case X86::BEXTRI32ri:  case X86::BEXTRI32mi:
3411
14
  case X86::BEXTRI64ri:  case X86::BEXTRI64mi:
3412
14
    // BEXTR doesn't update the sign flag so we can't use it.
3413
14
    NoSignFlag = true;
3414
14
    return true;
3415
39.4k
  }
3416
39.4k
}
3417
3418
/// Check whether the use can be converted to remove a comparison against zero.
3419
102k
static X86::CondCode isUseDefConvertible(const MachineInstr &MI) {
3420
102k
  switch (MI.getOpcode()) {
3421
102k
  
  default: return X86::COND_INVALID;
3422
102k
  case X86::NEG8r:
3423
11
  case X86::NEG16r:
3424
11
  case X86::NEG32r:
3425
11
  case X86::NEG64r:
3426
11
    return X86::COND_AE;
3427
11
  case X86::LZCNT16rr:
3428
3
  case X86::LZCNT32rr:
3429
3
  case X86::LZCNT64rr:
3430
3
    return X86::COND_B;
3431
10
  case X86::POPCNT16rr:
3432
10
  case X86::POPCNT32rr:
3433
10
  case X86::POPCNT64rr:
3434
10
    return X86::COND_E;
3435
11
  case X86::TZCNT16rr:
3436
11
  case X86::TZCNT32rr:
3437
11
  case X86::TZCNT64rr:
3438
11
    return X86::COND_B;
3439
11
  case X86::BSF16rr:
3440
8
  case X86::BSF32rr:
3441
8
  case X86::BSF64rr:
3442
8
  case X86::BSR16rr:
3443
8
  case X86::BSR32rr:
3444
8
  case X86::BSR64rr:
3445
8
    return X86::COND_E;
3446
8
  case X86::BLSI32rr:
3447
6
  case X86::BLSI64rr:
3448
6
    return X86::COND_AE;
3449
6
  case X86::BLSR32rr:
3450
0
  case X86::BLSR64rr:
3451
0
  case X86::BLSMSK32rr:
3452
0
  case X86::BLSMSK64rr:
3453
0
    return X86::COND_B;
3454
102k
  // TODO: TBM instructions.
3455
102k
  }
3456
102k
}
3457
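A hedged before/after sketch of the rewrite these two tables enable in optimizeCompareInstr below, using the POPCNT entry (which reports COND_E, i.e. ZF is set exactly when the counted source was zero):

  %r = POPCNT64rr %x            %r = POPCNT64rr %x
  TEST64rr %x, %x         =>    JCC_1 %bb, COND_E
  JCC_1 %bb, COND_E

The TEST against zero becomes removable because the earlier POPCNT already left EFLAGS in an equivalent state for the tested register; when the branch used COND_NE instead, the code below flips it via GetOppositeBranchCondition.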
3458
/// Check if there exists an earlier instruction that
3459
/// operates on the same source operands and sets flags in the same way as
3460
/// Compare; remove Compare if possible.
3461
bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
3462
                                        unsigned SrcReg2, int CmpMask,
3463
                                        int CmpValue,
3464
121k
                                        const MachineRegisterInfo *MRI) const {
3465
121k
  // Check whether we can replace SUB with CMP.
3466
121k
  switch (CmpInstr.getOpcode()) {
3467
121k
  
  default: break;
3468
121k
  case X86::SUB64ri32:
3469
70.4k
  case X86::SUB64ri8:
3470
70.4k
  case X86::SUB32ri:
3471
70.4k
  case X86::SUB32ri8:
3472
70.4k
  case X86::SUB16ri:
3473
70.4k
  case X86::SUB16ri8:
3474
70.4k
  case X86::SUB8ri:
3475
70.4k
  case X86::SUB64rm:
3476
70.4k
  case X86::SUB32rm:
3477
70.4k
  case X86::SUB16rm:
3478
70.4k
  case X86::SUB8rm:
3479
70.4k
  case X86::SUB64rr:
3480
70.4k
  case X86::SUB32rr:
3481
70.4k
  case X86::SUB16rr:
3482
70.4k
  case X86::SUB8rr: {
3483
70.4k
    if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg()))
3484
13.8k
      return false;
3485
56.6k
    // There is no use of the destination register, we can replace SUB with CMP.
3486
56.6k
    unsigned NewOpcode = 0;
3487
56.6k
    switch (CmpInstr.getOpcode()) {
3488
56.6k
    
    default: llvm_unreachable("Unreachable!");
    case X86::SUB64rm: NewOpcode = X86::CMP64rm; break;
    case X86::SUB32rm: NewOpcode = X86::CMP32rm; break;
    case X86::SUB16rm: NewOpcode = X86::CMP16rm; break;
    case X86::SUB8rm: NewOpcode = X86::CMP8rm; break;
    case X86::SUB64rr: NewOpcode = X86::CMP64rr; break;
    case X86::SUB32rr: NewOpcode = X86::CMP32rr; break;
    case X86::SUB16rr: NewOpcode = X86::CMP16rr; break;
    case X86::SUB8rr: NewOpcode = X86::CMP8rr; break;
    case X86::SUB64ri32: NewOpcode = X86::CMP64ri32; break;
    case X86::SUB64ri8: NewOpcode = X86::CMP64ri8; break;
    case X86::SUB32ri: NewOpcode = X86::CMP32ri; break;
    case X86::SUB32ri8: NewOpcode = X86::CMP32ri8; break;
    case X86::SUB16ri: NewOpcode = X86::CMP16ri; break;
    case X86::SUB16ri8: NewOpcode = X86::CMP16ri8; break;
    case X86::SUB8ri: NewOpcode = X86::CMP8ri; break;
3504
56.6k
    }
3505
56.6k
    CmpInstr.setDesc(get(NewOpcode));
3506
56.6k
    CmpInstr.RemoveOperand(0);
3507
56.6k
    // Fall through to optimize Cmp if Cmp is CMPrr or CMPri.
3508
56.6k
    if (NewOpcode == X86::CMP64rm || NewOpcode == X86::CMP32rm ||
        NewOpcode == X86::CMP16rm || NewOpcode == X86::CMP8rm)
3510
2.27k
      return false;
3511
105k
  }
3512
105k
  }
3513
105k
3514
105k
  // Get the unique definition of SrcReg.
3515
105k
  MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
3516
105k
  if (!MI) return false;
3517
105k
3518
105k
  // CmpInstr is the first instruction of the BB.
3519
105k
  MachineBasicBlock::iterator I = CmpInstr, Def = MI;
3520
105k
3521
105k
  // If we are comparing against zero, check whether we can use MI to update
3522
105k
  // EFLAGS. If MI is not in the same BB as CmpInstr, do not optimize.
3523
105k
  bool IsCmpZero = (CmpMask != 0 && CmpValue == 0);
3524
105k
  if (IsCmpZero && MI->getParent() != CmpInstr.getParent())
3525
11.5k
    return false;
3526
94.1k
3527
94.1k
  // If we have a use of the source register between the def and our compare
3528
94.1k
  // instruction we can eliminate the compare iff the use sets EFLAGS in the
3529
94.1k
  // right way.
3530
94.1k
  bool ShouldUpdateCC = false;
3531
94.1k
  bool NoSignFlag = false;
3532
94.1k
  X86::CondCode NewCC = X86::COND_INVALID;
3533
94.1k
  if (IsCmpZero && !isDefConvertible(*MI, NoSignFlag)) {
3534
37.3k
    // Scan forward from the use until we hit the use we're looking for or the
3535
37.3k
    // compare instruction.
3536
102k
    for (MachineBasicBlock::iterator J = MI;; 
++J65.3k
) {
3537
102k
      // Do we have a convertible instruction?
3538
102k
      NewCC = isUseDefConvertible(*J);
3539
102k
      if (NewCC != X86::COND_INVALID && J->getOperand(1).isReg() &&
          J->getOperand(1).getReg() == SrcReg) {
3541
37
        assert(J->definesRegister(X86::EFLAGS) && "Must be an EFLAGS def!");
3542
37
        ShouldUpdateCC = true; // Update CC later on.
3543
37
        // This is not a def of SrcReg, but still a def of EFLAGS. Keep going
3544
37
        // with the new def.
3545
37
        Def = J;
3546
37
        MI = &*Def;
3547
37
        break;
3548
37
      }
3549
102k
3550
102k
      if (J == I)
3551
37.2k
        return false;
3552
102k
    }
3553
37.3k
  }
3554
94.1k
3555
94.1k
  // We are searching for an earlier instruction that can make CmpInstr
3556
94.1k
  // redundant and that instruction will be saved in Sub.
3557
94.1k
  MachineInstr *Sub = nullptr;
3558
56.8k
  const TargetRegisterInfo *TRI = &getRegisterInfo();
3559
56.8k
3560
56.8k
  // We iterate backward, starting from the instruction before CmpInstr and
3561
56.8k
  // stop when reaching the definition of a source register or done with the BB.
3562
56.8k
  // RI points to the instruction before CmpInstr.
3563
56.8k
  // If the definition is in this basic block, RE points to the definition;
3564
56.8k
  // otherwise, RE is the rend of the basic block.
3565
56.8k
  MachineBasicBlock::reverse_iterator
3566
56.8k
      RI = ++I.getReverse(),
3567
56.8k
      RE = CmpInstr.getParent() == MI->getParent()
3568
56.8k
               ? Def.getReverse() /* points to MI */
               : CmpInstr.getParent()->rend();
3570
56.8k
  MachineInstr *Movr0Inst = nullptr;
3571
75.0k
  for (; RI != RE; 
++RI18.2k
) {
3572
30.2k
    MachineInstr &Instr = *RI;
3573
30.2k
    // Check whether CmpInstr can be made redundant by the current instruction.
3574
30.2k
    if (!IsCmpZero && isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpMask,
3575
29.4k
                                           CmpValue, Instr)) {
3576
7
      Sub = &Instr;
3577
7
      break;
3578
7
    }
3579
30.1k
3580
30.1k
    if (Instr.modifiesRegister(X86::EFLAGS, TRI) ||
3581
30.1k
        
        Instr.readsRegister(X86::EFLAGS, TRI)) {
3582
12.8k
      // This instruction modifies or uses EFLAGS.
3583
12.8k
3584
12.8k
      // MOV32r0 etc. are implemented with xor which clobbers condition code.
3585
12.8k
      // They are safe to move up, if the definition to EFLAGS is dead and
3586
12.8k
      // earlier instructions do not read or write EFLAGS.
3587
12.8k
      if (!Movr0Inst && Instr.getOpcode() == X86::MOV32r0 &&
          Instr.registerDefIsDead(X86::EFLAGS, TRI)) {
3589
880
        Movr0Inst = &Instr;
3590
880
        continue;
3591
880
      }
3592
11.9k
3593
11.9k
      // We can't remove CmpInstr.
3594
11.9k
      return false;
3595
11.9k
    }
3596
30.1k
  }
3597
56.8k
3598
56.8k
  // Return false if no candidates exist.
3599
56.8k
  
  if (!IsCmpZero && !Sub)
3600
43.1k
    return false;
3601
1.70k
3602
1.70k
  bool IsSwapped = (SrcReg2 != 0 && Sub->getOperand(1).getReg() == SrcReg2 &&
                    Sub->getOperand(2).getReg() == SrcReg);
3604
1.70k
3605
1.70k
  // Scan forward from the instruction after CmpInstr for uses of EFLAGS.
3606
1.70k
  // It is safe to remove CmpInstr if EFLAGS is redefined or killed.
3607
1.70k
  // If we are done with the basic block, we need to check whether EFLAGS is
3608
1.70k
  // live-out.
3609
1.70k
  bool IsSafe = false;
3610
1.70k
  SmallVector<std::pair<MachineInstr*, X86::CondCode>, 4> OpsToUpdate;
3611
1.70k
  MachineBasicBlock::iterator E = CmpInstr.getParent()->end();
3612
5.11k
  for (++I; I != E; ++I) {
3613
3.80k
    const MachineInstr &Instr = *I;
3614
3.80k
    bool ModifyEFLAGS = Instr.modifiesRegister(X86::EFLAGS, TRI);
3615
3.80k
    bool UseEFLAGS = Instr.readsRegister(X86::EFLAGS, TRI);
3616
3.80k
    // We should check the usage if this instruction uses and updates EFLAGS.
3617
3.80k
    if (!UseEFLAGS && ModifyEFLAGS) {
3618
293
      // It is safe to remove CmpInstr if EFLAGS is updated again.
3619
293
      IsSafe = true;
3620
293
      break;
3621
293
    }
3622
3.51k
    if (!UseEFLAGS && !ModifyEFLAGS)
3623
1.80k
      continue;
3624
1.70k
3625
1.70k
    // EFLAGS is used by this instruction.
3626
1.70k
    X86::CondCode OldCC = X86::COND_INVALID;
3627
1.70k
    if (IsCmpZero || IsSwapped) {
3628
1.70k
      // We decode the condition code from opcode.
3629
1.70k
      if (Instr.isBranch())
3630
1.24k
        OldCC = X86::getCondFromBranch(Instr);
3631
458
      else {
3632
458
        OldCC = X86::getCondFromSETCC(Instr);
3633
458
        if (OldCC == X86::COND_INVALID)
3634
295
          OldCC = X86::getCondFromCMov(Instr);
3635
458
      }
3636
1.70k
      if (OldCC == X86::COND_INVALID) return false;
3637
1.70k
    }
3638
1.70k
    X86::CondCode ReplacementCC = X86::COND_INVALID;
3639
1.70k
    if (IsCmpZero) {
3640
1.70k
      switch (OldCC) {
3641
1.70k
      default: break;
3642
1.70k
      case X86::COND_A: case X86::COND_AE:
3643
91
      case X86::COND_B: case X86::COND_BE:
3644
91
      case X86::COND_G: case X86::COND_GE:
3645
91
      case X86::COND_L: case X86::COND_LE:
3646
91
      case X86::COND_O: case X86::COND_NO:
3647
91
        // CF and OF are used, we can't perform this optimization.
3648
91
        return false;
3649
175
      case X86::COND_S: case X86::COND_NS:
3650
175
        // If SF is used, but the instruction doesn't update the SF, then we
3651
175
        // can't do the optimization.
3652
175
        if (NoSignFlag)
3653
6
          return false;
3654
169
        break;
3655
1.60k
      }
3656
1.60k
3657
1.60k
      // If we're updating the condition code check if we have to reverse the
3658
1.60k
      // condition.
3659
1.60k
      if (ShouldUpdateCC)
3660
32
        switch (OldCC) {
3661
32
        default:
3662
0
          return false;
3663
32
        case X86::COND_E:
3664
10
          ReplacementCC = NewCC;
3665
10
          break;
3666
32
        case X86::COND_NE:
3667
22
          ReplacementCC = GetOppositeBranchCondition(NewCC);
3668
22
          break;
3669
7
        }
3670
7
    } else if (IsSwapped) {
3671
0
      // If we have SUB(r1, r2) and CMP(r2, r1), the condition code needs
3672
0
      // to be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc.
3673
0
      // We swap the condition code and synthesize the new opcode.
3674
0
      ReplacementCC = getSwappedCondition(OldCC);
3675
0
      if (ReplacementCC == X86::COND_INVALID) return false;
3676
1.61k
    }
3677
1.61k
3678
1.61k
    if ((ShouldUpdateCC || IsSwapped) && ReplacementCC != OldCC) {
3679
20
      // Push the MachineInstr to OpsToUpdate.
3680
20
      // If it is safe to remove CmpInstr, the condition code of these
3681
20
      // instructions will be modified.
3682
20
      OpsToUpdate.push_back(std::make_pair(&*I, ReplacementCC));
3683
20
    }
3684
1.61k
    if (ModifyEFLAGS || Instr.killsRegister(X86::EFLAGS, TRI)) {
3685
0
      // It is safe to remove CmpInstr if EFLAGS is updated again or killed.
3686
0
      IsSafe = true;
3687
0
      break;
3688
0
    }
3689
1.61k
  }
3690
1.70k
3691
1.70k
  // If EFLAGS is not killed nor re-defined, we should check whether it is
3692
1.70k
  // live-out. If it is live-out, do not optimize.
3693
1.70k
  if ((IsCmpZero || IsSwapped) && !IsSafe) {
3694
1.30k
    MachineBasicBlock *MBB = CmpInstr.getParent();
3695
1.30k
    for (MachineBasicBlock *Successor : MBB->successors())
3696
2.33k
      if (Successor->isLiveIn(X86::EFLAGS))
3697
6
        return false;
3698
1.30k
  }
3699
1.60k
3700
1.60k
  // The instruction to be updated is either Sub or MI.
3701
1.60k
  Sub = IsCmpZero ? MI : Sub;
3702
1.60k
  // Move Movr0Inst to the appropriate place before Sub.
3703
1.60k
  if (Movr0Inst) {
3704
16
    // Look backwards until we find a def that doesn't use the current EFLAGS.
3705
16
    Def = Sub;
3706
16
    MachineBasicBlock::reverse_iterator InsertI = Def.getReverse(),
3707
16
                                        InsertE = Sub->getParent()->rend();
3708
17
    for (; InsertI != InsertE; ++InsertI) {
3709
17
      MachineInstr *Instr = &*InsertI;
3710
17
      if (!Instr->readsRegister(X86::EFLAGS, TRI) &&
3711
17
          Instr->modifiesRegister(X86::EFLAGS, TRI)) {
3712
16
        Sub->getParent()->remove(Movr0Inst);
3713
16
        Instr->getParent()->insert(MachineBasicBlock::iterator(Instr),
3714
16
                                   Movr0Inst);
3715
16
        break;
3716
16
      }
3717
17
    }
3718
16
    if (InsertI == InsertE)
3719
0
      return false;
3720
1.60k
  }
3721
1.60k
3722
1.60k
  // Make sure Sub instruction defines EFLAGS and mark the def live.
3723
1.60k
  MachineOperand *FlagDef = Sub->findRegisterDefOperand(X86::EFLAGS);
3724
1.60k
  assert(FlagDef && "Unable to locate a def EFLAGS operand");
3725
1.60k
  FlagDef->setIsDead(false);
3726
1.60k
3727
1.60k
  CmpInstr.eraseFromParent();
3728
1.60k
3729
1.60k
  // Modify the condition code of instructions in OpsToUpdate.
3730
1.60k
  for (auto &Op : OpsToUpdate) {
3731
20
    Op.first->getOperand(Op.first->getDesc().getNumOperands() - 1)
3732
20
        .setImm(Op.second);
3733
20
  }
3734
1.60k
  return true;
3735
1.60k
}
3736
3737
/// Try to remove the load by folding it to a register
3738
/// operand at the use. We fold the load instructions if load defines a virtual
3739
/// register, the virtual register is used once in the same BB, and the
3740
/// instructions in-between do not load or store, and have no side effects.
3741
MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI,
3742
                                              const MachineRegisterInfo *MRI,
3743
                                              unsigned &FoldAsLoadDefReg,
3744
91.0k
                                              MachineInstr *&DefMI) const {
3745
91.0k
  // Check whether we can move DefMI here.
3746
91.0k
  DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
3747
91.0k
  assert(DefMI);
3748
91.0k
  bool SawStore = false;
3749
91.0k
  if (!DefMI->isSafeToMove(nullptr, SawStore))
3750
2.04k
    return nullptr;
3751
88.9k
3752
88.9k
  // Collect information about virtual register operands of MI.
3753
88.9k
  SmallVector<unsigned, 1> SrcOperandIds;
3754
508k
  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
3755
419k
    MachineOperand &MO = MI.getOperand(i);
3756
419k
    if (!MO.isReg())
3757
118k
      continue;
3758
301k
    unsigned Reg = MO.getReg();
3759
301k
    if (Reg != FoldAsLoadDefReg)
3760
211k
      continue;
3761
90.5k
    // Do not fold if we have a subreg use or a def.
3762
90.5k
    if (MO.getSubReg() || MO.isDef())
3763
148
      return nullptr;
3764
90.3k
    SrcOperandIds.push_back(i);
3765
90.3k
  }
3766
88.9k
  if (SrcOperandIds.empty())
3767
0
    return nullptr;
3768
88.8k
3769
88.8k
  // Check whether we can fold the def into SrcOperandId.
3770
88.8k
  if (MachineInstr *FoldMI = foldMemoryOperand(MI, SrcOperandIds, *DefMI)) {
3771
7.58k
    FoldAsLoadDefReg = 0;
3772
7.58k
    return FoldMI;
3773
7.58k
  }
3774
81.2k
3775
81.2k
  return nullptr;
3776
81.2k
}
3777
3778
/// Expand a single-def pseudo instruction to a two-addr
3779
/// instruction with two undef reads of the register being defined.
3780
/// This is used for mapping:
3781
///   %xmm4 = V_SET0
3782
/// to:
3783
///   %xmm4 = PXORrr undef %xmm4, undef %xmm4
3784
///
3785
static bool Expand2AddrUndef(MachineInstrBuilder &MIB,
3786
65.8k
                             const MCInstrDesc &Desc) {
3787
65.8k
  assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
3788
65.8k
  unsigned Reg = MIB->getOperand(0).getReg();
3789
65.8k
  MIB->setDesc(Desc);
3790
65.8k
3791
65.8k
  // MachineInstr::addOperand() will insert explicit operands before any
3792
65.8k
  // implicit operands.
3793
65.8k
  MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef);
3794
65.8k
  // But we don't trust that.
3795
65.8k
  assert(MIB->getOperand(1).getReg() == Reg &&
3796
65.8k
         MIB->getOperand(2).getReg() == Reg && "Misplaced operand");
3797
65.8k
  return true;
3798
65.8k
}
3799
3800
/// Expand a single-def pseudo instruction to a two-addr
3801
/// instruction with two %k0 reads.
3802
/// This is used for mapping:
3803
///   %k4 = K_SET1
3804
/// to:
3805
///   %k4 = KXNORrr %k0, %k0
3806
static bool Expand2AddrKreg(MachineInstrBuilder &MIB,
3807
310
                            const MCInstrDesc &Desc, unsigned Reg) {
3808
310
  assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
3809
310
  MIB->setDesc(Desc);
3810
310
  MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef);
3811
310
  return true;
3812
310
}
3813
3814
static bool expandMOV32r1(MachineInstrBuilder &MIB, const TargetInstrInfo &TII,
3815
96
                          bool MinusOne) {
3816
96
  MachineBasicBlock &MBB = *MIB->getParent();
3817
96
  DebugLoc DL = MIB->getDebugLoc();
3818
96
  unsigned Reg = MIB->getOperand(0).getReg();
3819
96
3820
96
  // Insert the XOR.
3821
96
  BuildMI(MBB, MIB.getInstr(), DL, TII.get(X86::XOR32rr), Reg)
3822
96
      .addReg(Reg, RegState::Undef)
3823
96
      .addReg(Reg, RegState::Undef);
3824
96
3825
96
  // Turn the pseudo into an INC or DEC.
3826
96
  MIB->setDesc(TII.get(MinusOne ? X86::DEC32r : X86::INC32r));
3827
96
  MIB.addReg(Reg);
3828
96
3829
96
  return true;
3830
96
}
3831
3832
static bool ExpandMOVImmSExti8(MachineInstrBuilder &MIB,
3833
                               const TargetInstrInfo &TII,
3834
281
                               const X86Subtarget &Subtarget) {
3835
281
  MachineBasicBlock &MBB = *MIB->getParent();
3836
281
  DebugLoc DL = MIB->getDebugLoc();
3837
281
  int64_t Imm = MIB->getOperand(1).getImm();
3838
281
  assert(Imm != 0 && "Using push/pop for 0 is not efficient.");
3839
281
  MachineBasicBlock::iterator I = MIB.getInstr();
3840
281
3841
281
  int StackAdjustment;
3842
281
3843
281
  if (Subtarget.is64Bit()) {
3844
217
    assert(MIB->getOpcode() == X86::MOV64ImmSExti8 ||
3845
217
           MIB->getOpcode() == X86::MOV32ImmSExti8);
3846
217
3847
217
    // Can't use push/pop lowering if the function might write to the red zone.
3848
217
    X86MachineFunctionInfo *X86FI =
3849
217
        MBB.getParent()->getInfo<X86MachineFunctionInfo>();
3850
217
    if (X86FI->getUsesRedZone()) {
3851
45
      MIB->setDesc(TII.get(MIB->getOpcode() ==
3852
45
                           X86::MOV32ImmSExti8 ? X86::MOV32ri : X86::MOV64ri));
3853
45
      return true;
3854
45
    }
3855
172
3856
172
    // 64-bit mode doesn't have 32-bit push/pop, so use 64-bit operations and
3857
172
    // widen the register if necessary.
3858
172
    StackAdjustment = 8;
3859
172
    BuildMI(MBB, I, DL, TII.get(X86::PUSH64i8)).addImm(Imm);
3860
172
    MIB->setDesc(TII.get(X86::POP64r));
3861
172
    MIB->getOperand(0)
3862
172
        .setReg(getX86SubSuperRegister(MIB->getOperand(0).getReg(), 64));
3863
172
  } else {
3864
64
    assert(MIB->getOpcode() == X86::MOV32ImmSExti8);
3865
64
    StackAdjustment = 4;
3866
64
    BuildMI(MBB, I, DL, TII.get(X86::PUSH32i8)).addImm(Imm);
3867
64
    MIB->setDesc(TII.get(X86::POP32r));
3868
64
  }
3869
281
3870
281
  // Build CFI if necessary.
3871
281
  MachineFunction &MF = *MBB.getParent();
3872
236
  const X86FrameLowering *TFL = Subtarget.getFrameLowering();
3873
236
  bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
3874
236
  bool NeedsDwarfCFI =
3875
236
      !IsWin64Prologue &&
3876
236
      (MF.getMMI().hasDebugInfo() || MF.getFunction().needsUnwindTableEntry());
3877
236
  bool EmitCFI = !TFL->hasFP(MF) && NeedsDwarfCFI;
3878
236
  if (EmitCFI) {
3879
31
    TFL->BuildCFI(MBB, I, DL,
3880
31
        MCCFIInstruction::createAdjustCfaOffset(nullptr, StackAdjustment));
3881
31
    TFL->BuildCFI(MBB, std::next(I), DL,
3882
31
        MCCFIInstruction::createAdjustCfaOffset(nullptr, -StackAdjustment));
3883
31
  }
3884
236
3885
236
  return true;
3886
281
}
3887
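Aside (not part of the coverage listing): the push/pop lowering above works because PUSH64i8/PUSH32i8 sign-extend the 8-bit immediate to the full slot width, so the popped register ends up with the same value a plain MOV of the sign-extended constant would produce. A minimal standalone C++ sketch of that widening; the variable names here are illustrative only:

// Illustration only: the sign-extension that makes the push/pop lowering
// equivalent to MOV32ri/MOV64ri for immediates in [-128, 127].
#include <cstdint>
#include <cstdio>

int main() {
  int8_t Imm8 = -7;          // the 8-bit immediate encoded in PUSH64i8/PUSH32i8
  int64_t OnStack64 = Imm8;  // value left in the 64-bit stack slot, then popped
  int32_t OnStack32 = Imm8;  // value left in the 32-bit slot (PUSH32i8 case)
  std::printf("%lld %d\n", (long long)OnStack64, OnStack32); // prints: -7 -7
  return 0;
}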
3888
// LoadStackGuard has so far only been implemented for 64-bit MachO. Different
3889
// code sequence is needed for other targets.
3890
static void expandLoadStackGuard(MachineInstrBuilder &MIB,
3891
318
                                 const TargetInstrInfo &TII) {
3892
318
  MachineBasicBlock &MBB = *MIB->getParent();
3893
318
  DebugLoc DL = MIB->getDebugLoc();
3894
318
  unsigned Reg = MIB->getOperand(0).getReg();
3895
318
  const GlobalValue *GV =
3896
318
      cast<GlobalValue>((*MIB->memoperands_begin())->getValue());
3897
318
  auto Flags = MachineMemOperand::MOLoad |
3898
318
               MachineMemOperand::MODereferenceable |
3899
318
               MachineMemOperand::MOInvariant;
3900
318
  MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand(
3901
318
      MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 8, 8);
3902
318
  MachineBasicBlock::iterator I = MIB.getInstr();
3903
318
3904
318
  BuildMI(MBB, I, DL, TII.get(X86::MOV64rm), Reg).addReg(X86::RIP).addImm(1)
3905
318
      .addReg(0).addGlobalAddress(GV, 0, X86II::MO_GOTPCREL).addReg(0)
3906
318
      .addMemOperand(MMO);
3907
318
  MIB->setDebugLoc(DL);
3908
318
  MIB->setDesc(TII.get(X86::MOV64rm));
3909
318
  MIB.addReg(Reg, RegState::Kill).addImm(1).addReg(0).addImm(0).addReg(0);
3910
318
}
3911
3912
274
static bool expandXorFP(MachineInstrBuilder &MIB, const TargetInstrInfo &TII) {
3913
274
  MachineBasicBlock &MBB = *MIB->getParent();
3914
274
  MachineFunction &MF = *MBB.getParent();
3915
274
  const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>();
3916
274
  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
3917
274
  unsigned XorOp =
3918
274
      MIB->getOpcode() == X86::XOR64_FP ? X86::XOR64rr : X86::XOR32rr;
3919
274
  MIB->setDesc(TII.get(XorOp));
3920
274
  MIB.addReg(TRI->getFrameRegister(MF), RegState::Undef);
3921
274
  return true;
3922
274
}
3923
3924
// This is used to handle spills for 128/256-bit registers when we have AVX512,
3925
// but not VLX. If it uses an extended register we need to use an instruction
3926
// that loads the lower 128/256-bit, but is available with only AVX512F.
3927
static bool expandNOVLXLoad(MachineInstrBuilder &MIB,
3928
                            const TargetRegisterInfo *TRI,
3929
                            const MCInstrDesc &LoadDesc,
3930
                            const MCInstrDesc &BroadcastDesc,
3931
128
                            unsigned SubIdx) {
3932
128
  unsigned DestReg = MIB->getOperand(0).getReg();
3933
128
  // Check if DestReg is XMM16-31 or YMM16-31.
3934
128
  if (TRI->getEncodingValue(DestReg) < 16) {
3935
128
    // We can use a normal VEX encoded load.
3936
128
    MIB->setDesc(LoadDesc);
3937
128
  } else {
3938
0
    // Use a 128/256-bit VBROADCAST instruction.
3939
0
    MIB->setDesc(BroadcastDesc);
3940
0
    // Change the destination to a 512-bit register.
3941
0
    DestReg = TRI->getMatchingSuperReg(DestReg, SubIdx, &X86::VR512RegClass);
3942
0
    MIB->getOperand(0).setReg(DestReg);
3943
0
  }
3944
128
  return true;
3945
128
}
3946
3947
// This is used to handle spills for 128/256-bit registers when we have AVX512,
3948
// but not VLX. If it uses an extended register we need to use an instruction
3949
// that stores the lower 128/256-bit, but is available with only AVX512F.
3950
static bool expandNOVLXStore(MachineInstrBuilder &MIB,
3951
                             const TargetRegisterInfo *TRI,
3952
                             const MCInstrDesc &StoreDesc,
3953
                             const MCInstrDesc &ExtractDesc,
3954
159
                             unsigned SubIdx) {
3955
159
  unsigned SrcReg = MIB->getOperand(X86::AddrNumOperands).getReg();
3956
159
  // Check if DestReg is XMM16-31 or YMM16-31.
3957
159
  if (TRI->getEncodingValue(SrcReg) < 16) {
3958
159
    // We can use a normal VEX encoded store.
3959
159
    MIB->setDesc(StoreDesc);
3960
159
  } else {
3961
0
    // Use a VEXTRACTF instruction.
3962
0
    MIB->setDesc(ExtractDesc);
3963
0
    // Change the destination to a 512-bit register.
3964
0
    SrcReg = TRI->getMatchingSuperReg(SrcReg, SubIdx, &X86::VR512RegClass);
3965
0
    MIB->getOperand(X86::AddrNumOperands).setReg(SrcReg);
3966
0
    MIB.addImm(0x0); // Append immediate to extract from the lower bits.
3967
0
  }
3968
159
3969
159
  return true;
3970
159
}
3971
3972
36
static bool expandSHXDROT(MachineInstrBuilder &MIB, const MCInstrDesc &Desc) {
3973
36
  MIB->setDesc(Desc);
3974
36
  int64_t ShiftAmt = MIB->getOperand(2).getImm();
3975
36
  // Temporarily remove the immediate so we can add another source register.
3976
36
  MIB->RemoveOperand(2);
3977
36
  // Add the register. Don't copy the kill flag if there is one.
3978
36
  MIB.addReg(MIB->getOperand(1).getReg(),
3979
36
             getUndefRegState(MIB->getOperand(1).isUndef()));
3980
36
  // Add back the immediate.
3981
36
  MIB.addImm(ShiftAmt);
3982
36
  return true;
3983
36
}
3984
3985
513k
bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
3986
513k
  bool HasAVX = Subtarget.hasAVX();
3987
513k
  MachineInstrBuilder MIB(*MI.getParent()->getParent(), MI);
3988
513k
  switch (MI.getOpcode()) {
3989
513k
  case X86::MOV32r0:
3990
49.0k
    return Expand2AddrUndef(MIB, get(X86::XOR32rr));
3991
513k
  case X86::MOV32r1:
3992
46
    return expandMOV32r1(MIB, *this, /*MinusOne=*/ false);
3993
513k
  case X86::MOV32r_1:
3994
50
    return expandMOV32r1(MIB, *this, /*MinusOne=*/ true);
3995
513k
  case X86::MOV32ImmSExti8:
3996
281
  case X86::MOV64ImmSExti8:
3997
281
    return ExpandMOVImmSExti8(MIB, *this, Subtarget);
3998
281
  case X86::SETB_C8r:
3999
15
    return Expand2AddrUndef(MIB, get(X86::SBB8rr));
4000
281
  case X86::SETB_C16r:
4001
6
    return Expand2AddrUndef(MIB, get(X86::SBB16rr));
4002
281
  case X86::SETB_C32r:
4003
70
    return Expand2AddrUndef(MIB, get(X86::SBB32rr));
4004
281
  case X86::SETB_C64r:
4005
33
    return Expand2AddrUndef(MIB, get(X86::SBB64rr));
4006
281
  case X86::MMX_SET0:
4007
63
    return Expand2AddrUndef(MIB, get(X86::MMX_PXORirr));
4008
9.80k
  case X86::V_SET0:
4009
9.80k
  case X86::FsFLD0SS:
4010
9.80k
  case X86::FsFLD0SD:
4011
9.80k
    return Expand2AddrUndef(MIB, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr));
4012
9.80k
  case X86::AVX_SET0: {
4013
876
    assert(HasAVX && "AVX not supported");
4014
876
    const TargetRegisterInfo *TRI = &getRegisterInfo();
4015
876
    unsigned SrcReg = MIB->getOperand(0).getReg();
4016
876
    unsigned XReg = TRI->getSubReg(SrcReg, X86::sub_xmm);
4017
876
    MIB->getOperand(0).setReg(XReg);
4018
876
    Expand2AddrUndef(MIB, get(X86::VXORPSrr));
4019
876
    MIB.addReg(SrcReg, RegState::ImplicitDefine);
4020
876
    return true;
4021
9.80k
  }
4022
9.80k
  case X86::AVX512_128_SET0:
4023
1.45k
  case X86::AVX512_FsFLD0SS:
4024
1.45k
  case X86::AVX512_FsFLD0SD: {
4025
1.45k
    bool HasVLX = Subtarget.hasVLX();
4026
1.45k
    unsigned SrcReg = MIB->getOperand(0).getReg();
4027
1.45k
    const TargetRegisterInfo *TRI = &getRegisterInfo();
4028
1.45k
    if (HasVLX || TRI->getEncodingValue(SrcReg) < 16)
4029
1.45k
      return Expand2AddrUndef(MIB,
4030
1.45k
                              get(HasVLX ? X86::VPXORDZ128rr : X86::VXORPSrr));
4031
0
    // Extended register without VLX. Use a larger XOR.
4032
0
    SrcReg =
4033
0
        TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm, &X86::VR512RegClass);
4034
0
    MIB->getOperand(0).setReg(SrcReg);
4035
0
    return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
4036
0
  }
4037
1.62k
  case X86::AVX512_256_SET0:
4038
1.62k
  case X86::AVX512_512_SET0: {
4039
1.62k
    bool HasVLX = Subtarget.hasVLX();
4040
1.62k
    unsigned SrcReg = MIB->getOperand(0).getReg();
4041
1.62k
    const TargetRegisterInfo *TRI = &getRegisterInfo();
4042
1.62k
    if (HasVLX || TRI->getEncodingValue(SrcReg) < 16) {
4043
1.62k
      unsigned XReg = TRI->getSubReg(SrcReg, X86::sub_xmm);
4044
1.62k
      MIB->getOperand(0).setReg(XReg);
4045
1.62k
      Expand2AddrUndef(MIB,
4046
1.62k
                       get(HasVLX ? X86::VPXORDZ128rr : X86::VXORPSrr));
4047
1.62k
      MIB.addReg(SrcReg, RegState::ImplicitDefine);
4048
1.62k
      return true;
4049
1.62k
    }
4050
1
    if (MI.getOpcode() == X86::AVX512_256_SET0) {
4051
1
      // No VLX so we must reference a zmm.
4052
1
      unsigned ZReg =
4053
1
        TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm, &X86::VR512RegClass);
4054
1
      MIB->getOperand(0).setReg(ZReg);
4055
1
    }
4056
1
    return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
4057
1
  }
4058
2.42k
  case X86::V_SETALLONES:
4059
2.42k
    return Expand2AddrUndef(MIB, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
4060
399
  case X86::AVX2_SETALLONES:
4061
399
    return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr));
4062
1
  case X86::AVX1_SETALLONES: {
4063
0
    unsigned Reg = MIB->getOperand(0).getReg();
4064
0
    // VCMPPSYrri with an immediate 0xf should produce VCMPTRUEPS.
4065
0
    MIB->setDesc(get(X86::VCMPPSYrri));
4066
0
    MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xf);
4067
0
    return true;
4068
1
  }
4069
118
  case X86::AVX512_512_SETALLONES: {
4070
118
    unsigned Reg = MIB->getOperand(0).getReg();
4071
118
    MIB->setDesc(get(X86::VPTERNLOGDZrri));
4072
118
    // VPTERNLOGD needs 3 register inputs and an immediate.
4073
118
    // 0xff will return 1s for any input.
4074
118
    MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef)
4075
118
       .addReg(Reg, RegState::Undef).addImm(0xff);
4076
118
    return true;
4077
1
  }
4078
298
  case X86::AVX512_512_SEXT_MASK_32:
4079
298
  case X86::AVX512_512_SEXT_MASK_64: {
4080
298
    unsigned Reg = MIB->getOperand(0).getReg();
4081
298
    unsigned MaskReg = MIB->getOperand(1).getReg();
4082
298
    unsigned MaskState = getRegState(MIB->getOperand(1));
4083
298
    unsigned Opc = (MI.getOpcode() == X86::AVX512_512_SEXT_MASK_64) ?
4084
257
                   X86::VPTERNLOGQZrrikz : X86::VPTERNLOGDZrrikz;
4085
298
    MI.RemoveOperand(1);
4086
298
    MIB->setDesc(get(Opc));
4087
298
    // VPTERNLOG needs 3 register inputs and an immediate.
4088
298
    // 0xff will return 1s for any input.
4089
298
    MIB.addReg(Reg, RegState::Undef).addReg(MaskReg, MaskState)
4090
298
       .addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xff);
4091
298
    return true;
4092
298
  }
4093
298
  case X86::VMOVAPSZ128rm_NOVLX:
4094
79
    return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSrm),
4095
79
                           get(X86::VBROADCASTF32X4rm), X86::sub_xmm);
4096
298
  case X86::VMOVUPSZ128rm_NOVLX:
4097
20
    return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVUPSrm),
4098
20
                           get(X86::VBROADCASTF32X4rm), X86::sub_xmm);
4099
298
  case X86::VMOVAPSZ256rm_NOVLX:
4100
0
    return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVAPSYrm),
4101
0
                           get(X86::VBROADCASTF64X4rm), X86::sub_ymm);
4102
298
  case X86::VMOVUPSZ256rm_NOVLX:
4103
29
    return expandNOVLXLoad(MIB, &getRegisterInfo(), get(X86::VMOVUPSYrm),
4104
29
                           get(X86::VBROADCASTF64X4rm), X86::sub_ymm);
4105
298
  case X86::VMOVAPSZ128mr_NOVLX:
4106
114
    return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSmr),
4107
114
                            get(X86::VEXTRACTF32x4Zmr), X86::sub_xmm);
4108
298
  case X86::VMOVUPSZ128mr_NOVLX:
4109
18
    return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSmr),
4110
18
                            get(X86::VEXTRACTF32x4Zmr), X86::sub_xmm);
4111
298
  case X86::VMOVAPSZ256mr_NOVLX:
4112
0
    return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVAPSYmr),
4113
0
                            get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm);
4114
298
  case X86::VMOVUPSZ256mr_NOVLX:
4115
27
    return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSYmr),
4116
27
                            get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm);
4117
15.1k
  case X86::MOV32ri64: {
4118
15.1k
    unsigned Reg = MIB->getOperand(0).getReg();
4119
15.1k
    unsigned Reg32 = RI.getSubReg(Reg, X86::sub_32bit);
4120
15.1k
    MI.setDesc(get(X86::MOV32ri));
4121
15.1k
    MIB->getOperand(0).setReg(Reg32);
4122
15.1k
    MIB.addReg(Reg, RegState::ImplicitDefine);
4123
15.1k
    return true;
4124
298
  }
4125
298
4126
298
  // KNL does not recognize dependency-breaking idioms for mask registers,
4127
298
  // so kxnor %k1, %k1, %k2 has a RAW dependence on %k1.
4128
298
  // Using %k0 as the undef input register is a performance heuristic based
4129
298
  // on the assumption that %k0 is used less frequently than the other mask
4130
298
  // registers, since it is not usable as a write mask.
4131
298
  // FIXME: A more advanced approach would be to choose the best input mask
4132
298
  // register based on context.
4133
298
  case X86::KSET0W: return Expand2AddrKreg(MIB, get(X86::KXORWrr), X86::K0);
4134
298
  case X86::KSET0D: return Expand2AddrKreg(MIB, get(X86::KXORDrr), X86::K0);
4135
298
  case X86::KSET0Q: return Expand2AddrKreg(MIB, get(X86::KXORQrr), X86::K0);
4136
298
  case X86::KSET1W: return Expand2AddrKreg(MIB, get(X86::KXNORWrr), X86::K0);
4137
298
  case X86::KSET1D: return Expand2AddrKreg(MIB, get(X86::KXNORDrr), X86::K0);
4138
298
  case X86::KSET1Q: return Expand2AddrKreg(MIB, get(X86::KXNORQrr), X86::K0);
4139
318
  case TargetOpcode::LOAD_STACK_GUARD:
4140
318
    expandLoadStackGuard(MIB, *this);
4141
318
    return true;
4142
298
  case X86::XOR64_FP:
4143
274
  case X86::XOR32_FP:
4144
274
    return expandXorFP(MIB, *this);
4145
274
  case X86::SHLDROT32ri: return expandSHXDROT(MIB, get(X86::SHLD32rri8));
4146
274
  case X86::SHLDROT64ri: return expandSHXDROT(MIB, get(X86::SHLD64rri8));
4147
274
  case X86::SHRDROT32ri: return expandSHXDROT(MIB, get(X86::SHRD32rri8));
4148
274
  case X86::SHRDROT64ri: return expandSHXDROT(MIB, get(X86::SHRD64rri8));
4149
274
  case X86::ADD8rr_DB:    MIB->setDesc(get(X86::OR8rr));    break;
4150
274
  case X86::ADD16rr_DB:   MIB->setDesc(get(X86::OR16rr));   break;
4151
751
  case X86::ADD32rr_DB:   MIB->setDesc(get(X86::OR32rr));   break;
4152
1.49k
  case X86::ADD64rr_DB:   MIB->setDesc(get(X86::OR64rr));   break;
4153
274
  case X86::ADD8ri_DB:    MIB->setDesc(get(X86::OR8ri));    break;
4154
274
  case X86::ADD16ri_DB:   MIB->setDesc(get(X86::OR16ri));   break;
4155
274
  case X86::ADD32ri_DB:   MIB->setDesc(get(X86::OR32ri));   break;
4156
274
  case X86::ADD64ri32_DB: MIB->setDesc(get(X86::OR64ri32)); break;
4157
274
  case X86::ADD16ri8_DB:  MIB->setDesc(get(X86::OR16ri8));  break;
4158
274
  case X86::ADD32ri8_DB:  MIB->setDesc(get(X86::OR32ri8));  break;
4159
300
  case X86::ADD64ri8_DB:  MIB->setDesc(get(X86::OR64ri8));  break;
4160
430k
  }
4161
430k
  return false;
4162
430k
}
4163
4164
/// Return true for all instructions that only update
4165
/// the first 32 or 64-bits of the destination register and leave the rest
4166
/// unmodified. This can be used to avoid folding loads if the instructions
4167
/// only update part of the destination register, and the non-updated part is
4168
/// not needed. e.g. cvtss2sd, sqrtss. Unfolding the load from these
4169
/// instructions breaks the partial register dependency and it can improve
4170
/// performance. e.g.:
4171
///
4172
///   movss (%rdi), %xmm0
4173
///   cvtss2sd %xmm0, %xmm0
4174
///
4175
/// Instead of
4176
///   cvtss2sd (%rdi), %xmm0
4177
///
4178
/// FIXME: This should be turned into a TSFlags.
4179
///
4180
static bool hasPartialRegUpdate(unsigned Opcode,
4181
                                const X86Subtarget &Subtarget,
4182
1.73M
                                bool ForLoadFold = false) {
4183
1.73M
  switch (Opcode) {
4184
1.73M
  case X86::CVTSI2SSrr:
4185
1.04k
  case X86::CVTSI2SSrm:
4186
1.04k
  case X86::CVTSI642SSrr:
4187
1.04k
  case X86::CVTSI642SSrm:
4188
1.04k
  case X86::CVTSI2SDrr:
4189
1.04k
  case X86::CVTSI2SDrm:
4190
1.04k
  case X86::CVTSI642SDrr:
4191
1.04k
  case X86::CVTSI642SDrm:
4192
1.04k
    // Load folding won't affect the undef register update since the input is
4193
1.04k
    // a GPR.
4194
1.04k
    return !ForLoadFold;
4195
1.04k
  case X86::CVTSD2SSrr:
4196
684
  case X86::CVTSD2SSrm:
4197
684
  case X86::CVTSS2SDrr:
4198
684
  case X86::CVTSS2SDrm:
4199
684
  case X86::MOVHPDrm:
4200
684
  case X86::MOVHPSrm:
4201
684
  case X86::MOVLPDrm:
4202
684
  case X86::MOVLPSrm:
4203
684
  case X86::RCPSSr:
4204
684
  case X86::RCPSSm:
4205
684
  case X86::RCPSSr_Int:
4206
684
  case X86::RCPSSm_Int:
4207
684
  case X86::ROUNDSDr:
4208
684
  case X86::ROUNDSDm:
4209
684
  case X86::ROUNDSSr:
4210
684
  case X86::ROUNDSSm:
4211
684
  case X86::RSQRTSSr:
4212
684
  case X86::RSQRTSSm:
4213
684
  case X86::RSQRTSSr_Int:
4214
684
  case X86::RSQRTSSm_Int:
4215
684
  case X86::SQRTSSr:
4216
684
  case X86::SQRTSSm:
4217
684
  case X86::SQRTSSr_Int:
4218
684
  case X86::SQRTSSm_Int:
4219
684
  case X86::SQRTSDr:
4220
684
  case X86::SQRTSDm:
4221
684
  case X86::SQRTSDr_Int:
4222
684
  case X86::SQRTSDm_Int:
4223
684
    return true;
4224
684
  // GPR
4225
684
  case X86::POPCNT32rm:
4226
141
  case X86::POPCNT32rr:
4227
141
  case X86::POPCNT64rm:
4228
141
  case X86::POPCNT64rr:
4229
141
    return Subtarget.hasPOPCNTFalseDeps();
4230
462
  case X86::LZCNT32rm:
4231
462
  case X86::LZCNT32rr:
4232
462
  case X86::LZCNT64rm:
4233
462
  case X86::LZCNT64rr:
4234
462
  case X86::TZCNT32rm:
4235
462
  case X86::TZCNT32rr:
4236
462
  case X86::TZCNT64rm:
4237
462
  case X86::TZCNT64rr:
4238
462
    return Subtarget.hasLZCNTFalseDeps();
4239
1.72M
  }
4240
1.72M
4241
1.72M
  return false;
4242
1.72M
}
4243
4244
/// Inform the BreakFalseDeps pass how many idle
4245
/// instructions we would like before a partial register update.
4246
unsigned X86InstrInfo::getPartialRegUpdateClearance(
4247
    const MachineInstr &MI, unsigned OpNum,
4248
1.50M
    const TargetRegisterInfo *TRI) const {
4249
1.50M
  if (OpNum != 0 || !hasPartialRegUpdate(MI.getOpcode(), Subtarget))
4250
1.50M
    return 0;
4251
1.92k
4252
1.92k
  // If MI is marked as reading Reg, the partial register update is wanted.
4253
1.92k
  const MachineOperand &MO = MI.getOperand(0);
4254
1.92k
  unsigned Reg = MO.getReg();
4255
1.92k
  if (TargetRegisterInfo::isVirtualRegister(Reg)) {
4256
0
    if (MO.readsReg() || MI.readsVirtualRegister(Reg))
4257
0
      return 0;
4258
1.92k
  } else {
4259
1.92k
    if (MI.readsRegister(Reg, TRI))
4260
490
      return 0;
4261
1.43k
  }
4262
1.43k
4263
1.43k
  // If any instructions in the clearance range are reading Reg, insert a
4264
1.43k
  // dependency breaking instruction, which is inexpensive and is likely to
4265
1.43k
  // be hidden in other instruction's cycles.
4266
1.43k
  return PartialRegUpdateClearance;
4267
1.43k
}
4268
4269
// Return true for any instruction that copies the high bits of the first source
4270
// operand into the unused high bits of the destination operand.
4271
2.84M
static bool hasUndefRegUpdate(unsigned Opcode, bool ForLoadFold = false) {
4272
2.84M
  switch (Opcode) {
4273
2.84M
  case X86::VCVTSI2SSrr:
4274
1.86k
  case X86::VCVTSI2SSrm:
4275
1.86k
  case X86::VCVTSI2SSrr_Int:
4276
1.86k
  case X86::VCVTSI2SSrm_Int:
4277
1.86k
  case X86::VCVTSI642SSrr:
4278
1.86k
  case X86::VCVTSI642SSrm:
4279
1.86k
  case X86::VCVTSI642SSrr_Int:
4280
1.86k
  case X86::VCVTSI642SSrm_Int:
4281
1.86k
  case X86::VCVTSI2SDrr:
4282
1.86k
  case X86::VCVTSI2SDrm:
4283
1.86k
  case X86::VCVTSI2SDrr_Int:
4284
1.86k
  case X86::VCVTSI2SDrm_Int:
4285
1.86k
  case X86::VCVTSI642SDrr:
4286
1.86k
  case X86::VCVTSI642SDrm:
4287
1.86k
  case X86::VCVTSI642SDrr_Int:
4288
1.86k
  case X86::VCVTSI642SDrm_Int:
4289
1.86k
  // AVX-512
4290
1.86k
  case X86::VCVTSI2SSZrr:
4291
1.86k
  case X86::VCVTSI2SSZrm:
4292
1.86k
  case X86::VCVTSI2SSZrr_Int:
4293
1.86k
  case X86::VCVTSI2SSZrrb_Int:
4294
1.86k
  case X86::VCVTSI2SSZrm_Int:
4295
1.86k
  case X86::VCVTSI642SSZrr:
4296
1.86k
  case X86::VCVTSI642SSZrm:
4297
1.86k
  case X86::VCVTSI642SSZrr_Int:
4298
1.86k
  case X86::VCVTSI642SSZrrb_Int:
4299
1.86k
  case X86::VCVTSI642SSZrm_Int:
4300
1.86k
  case X86::VCVTSI2SDZrr:
4301
1.86k
  case X86::VCVTSI2SDZrm:
4302
1.86k
  case X86::VCVTSI2SDZrr_Int:
4303
1.86k
  case X86::VCVTSI2SDZrm_Int:
4304
1.86k
  case X86::VCVTSI642SDZrr:
4305
1.86k
  case X86::VCVTSI642SDZrm:
4306
1.86k
  case X86::VCVTSI642SDZrr_Int:
4307
1.86k
  case X86::VCVTSI642SDZrrb_Int:
4308
1.86k
  case X86::VCVTSI642SDZrm_Int:
4309
1.86k
  case X86::VCVTUSI2SSZrr:
4310
1.86k
  case X86::VCVTUSI2SSZrm:
4311
1.86k
  case X86::VCVTUSI2SSZrr_Int:
4312
1.86k
  case X86::VCVTUSI2SSZrrb_Int:
4313
1.86k
  case X86::VCVTUSI2SSZrm_Int:
4314
1.86k
  case X86::VCVTUSI642SSZrr:
4315
1.86k
  case X86::VCVTUSI642SSZrm:
4316
1.86k
  case X86::VCVTUSI642SSZrr_Int:
4317
1.86k
  case X86::VCVTUSI642SSZrrb_Int:
4318
1.86k
  case X86::VCVTUSI642SSZrm_Int:
4319
1.86k
  case X86::VCVTUSI2SDZrr:
4320
1.86k
  case X86::VCVTUSI2SDZrm:
4321
1.86k
  case X86::VCVTUSI2SDZrr_Int:
4322
1.86k
  case X86::VCVTUSI2SDZrm_Int:
4323
1.86k
  case X86::VCVTUSI642SDZrr:
4324
1.86k
  case X86::VCVTUSI642SDZrm:
4325
1.86k
  case X86::VCVTUSI642SDZrr_Int:
4326
1.86k
  case X86::VCVTUSI642SDZrrb_Int:
4327
1.86k
  case X86::VCVTUSI642SDZrm_Int:
4328
1.86k
    // Load folding won't affect the undef register update since the input is
4329
1.86k
    // a GPR.
4330
1.86k
    return !ForLoadFold;
4331
1.86k
  case X86::VCVTSD2SSrr:
4332
966
  case X86::VCVTSD2SSrm:
4333
966
  case X86::VCVTSD2SSrr_Int:
4334
966
  case X86::VCVTSD2SSrm_Int:
4335
966
  case X86::VCVTSS2SDrr:
4336
966
  case X86::VCVTSS2SDrm:
4337
966
  case X86::VCVTSS2SDrr_Int:
4338
966
  case X86::VCVTSS2SDrm_Int:
4339
966
  case X86::VRCPSSr:
4340
966
  case X86::VRCPSSr_Int:
4341
966
  case X86::VRCPSSm:
4342
966
  case X86::VRCPSSm_Int:
4343
966
  case X86::VROUNDSDr:
4344
966
  case X86::VROUNDSDm:
4345
966
  case X86::VROUNDSDr_Int:
4346
966
  case X86::VROUNDSDm_Int:
4347
966
  case X86::VROUNDSSr:
4348
966
  case X86::VROUNDSSm:
4349
966
  case X86::VROUNDSSr_Int:
4350
966
  case X86::VROUNDSSm_Int:
4351
966
  case X86::VRSQRTSSr:
4352
966
  case X86::VRSQRTSSr_Int:
4353
966
  case X86::VRSQRTSSm:
4354
966
  case X86::VRSQRTSSm_Int:
4355
966
  case X86::VSQRTSSr:
4356
966
  case X86::VSQRTSSr_Int:
4357
966
  case X86::VSQRTSSm:
4358
966
  case X86::VSQRTSSm_Int:
4359
966
  case X86::VSQRTSDr:
4360
966
  case X86::VSQRTSDr_Int:
4361
966
  case X86::VSQRTSDm:
4362
966
  case X86::VSQRTSDm_Int:
4363
966
  // AVX-512
4364
966
  case X86::VCVTSD2SSZrr:
4365
966
  case X86::VCVTSD2SSZrr_Int:
4366
966
  case X86::VCVTSD2SSZrrb_Int:
4367
966
  case X86::VCVTSD2SSZrm:
4368
966
  case X86::VCVTSD2SSZrm_Int:
4369
966
  case X86::VCVTSS2SDZrr:
4370
966
  case X86::VCVTSS2SDZrr_Int:
4371
966
  case X86::VCVTSS2SDZrrb_Int:
4372
966
  case X86::VCVTSS2SDZrm:
4373
966
  case X86::VCVTSS2SDZrm_Int:
4374
966
  case X86::VGETEXPSDZr:
4375
966
  case X86::VGETEXPSDZrb:
4376
966
  case X86::VGETEXPSDZm:
4377
966
  case X86::VGETEXPSSZr:
4378
966
  case X86::VGETEXPSSZrb:
4379
966
  case X86::VGETEXPSSZm:
4380
966
  case X86::VGETMANTSDZrri:
4381
966
  case X86::VGETMANTSDZrrib:
4382
966
  case X86::VGETMANTSDZrmi:
4383
966
  case X86::VGETMANTSSZrri:
4384
966
  case X86::VGETMANTSSZrrib:
4385
966
  case X86::VGETMANTSSZrmi:
4386
966
  case X86::VRNDSCALESDZr:
4387
966
  case X86::VRNDSCALESDZr_Int:
4388
966
  case X86::VRNDSCALESDZrb_Int:
4389
966
  case X86::VRNDSCALESDZm:
4390
966
  case X86::VRNDSCALESDZm_Int:
4391
966
  case X86::VRNDSCALESSZr:
4392
966
  case X86::VRNDSCALESSZr_Int:
4393
966
  case X86::VRNDSCALESSZrb_Int:
4394
966
  case X86::VRNDSCALESSZm:
4395
966
  case X86::VRNDSCALESSZm_Int:
4396
966
  case X86::VRCP14SDZrr:
4397
966
  case X86::VRCP14SDZrm:
4398
966
  case X86::VRCP14SSZrr:
4399
966
  case X86::VRCP14SSZrm:
4400
966
  case X86::VRCP28SDZr:
4401
966
  case X86::VRCP28SDZrb:
4402
966
  case X86::VRCP28SDZm:
4403
966
  case X86::VRCP28SSZr:
4404
966
  case X86::VRCP28SSZrb:
4405
966
  case X86::VRCP28SSZm:
4406
966
  case X86::VREDUCESSZrmi:
4407
966
  case X86::VREDUCESSZrri:
4408
966
  case X86::VREDUCESSZrrib:
4409
966
  case X86::VRSQRT14SDZrr:
4410
966
  case X86::VRSQRT14SDZrm:
4411
966
  case X86::VRSQRT14SSZrr:
4412
966
  case X86::VRSQRT14SSZrm:
4413
966
  case X86::VRSQRT28SDZr:
4414
966
  case X86::VRSQRT28SDZrb:
4415
966
  case X86::VRSQRT28SDZm:
4416
966
  case X86::VRSQRT28SSZr:
4417
966
  case X86::VRSQRT28SSZrb:
4418
966
  case X86::VRSQRT28SSZm:
4419
966
  case X86::VSQRTSSZr:
4420
966
  case X86::VSQRTSSZr_Int:
4421
966
  case X86::VSQRTSSZrb_Int:
4422
966
  case X86::VSQRTSSZm:
4423
966
  case X86::VSQRTSSZm_Int:
4424
966
  case X86::VSQRTSDZr:
4425
966
  case X86::VSQRTSDZr_Int:
4426
966
  case X86::VSQRTSDZrb_Int:
4427
966
  case X86::VSQRTSDZm:
4428
966
  case X86::VSQRTSDZm_Int:
4429
966
    return true;
4430
2.84M
  }
4431
2.84M
4432
2.84M
  return false;
4433
2.84M
}
4434
4435
/// Inform the BreakFalseDeps pass how many idle instructions we would like
4436
/// before certain undef register reads.
4437
///
4438
/// This catches the VCVTSI2SD family of instructions:
4439
///
4440
/// vcvtsi2sdq %rax, undef %xmm0, %xmm14
4441
///
4442
/// We should be careful *not* to catch VXOR idioms which are presumably
4443
/// handled specially in the pipeline:
4444
///
4445
/// vxorps undef %xmm1, undef %xmm1, %xmm1
4446
///
4447
/// Like getPartialRegUpdateClearance, this makes a strong assumption that the
4448
/// high bits that are passed-through are not live.
4449
unsigned
4450
X86InstrInfo::getUndefRegClearance(const MachineInstr &MI, unsigned &OpNum,
4451
2.54M
                                   const TargetRegisterInfo *TRI) const {
4452
2.54M
  if (!hasUndefRegUpdate(MI.getOpcode()))
4453
2.53M
    return 0;
4454
2.62k
4455
2.62k
  // Set the OpNum parameter to the first source operand.
4456
2.62k
  OpNum = 1;
4457
2.62k
4458
2.62k
  const MachineOperand &MO = MI.getOperand(OpNum);
4459
2.62k
  if (MO.isUndef() && TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
4460
2.30k
    return UndefRegClearance;
4461
2.30k
  }
4462
324
  return 0;
4463
324
}
4464
4465
void X86InstrInfo::breakPartialRegDependency(
4466
709
    MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const {
4467
709
  unsigned Reg = MI.getOperand(OpNum).getReg();
4468
709
  // If MI kills this register, the false dependence is already broken.
4469
709
  if (MI.killsRegister(Reg, TRI))
4470
0
    return;
4471
709
4472
709
  if (X86::VR128RegClass.contains(Reg)) {
4473
496
    // These instructions are all floating point domain, so xorps is the best
4474
496
    // choice.
4475
496
    unsigned Opc = Subtarget.hasAVX() ? X86::VXORPSrr : X86::XORPSrr;
4476
496
    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(Opc), Reg)
4477
496
        .addReg(Reg, RegState::Undef)
4478
496
        .addReg(Reg, RegState::Undef);
4479
496
    MI.addRegisterKilled(Reg, TRI, true);
4480
496
  } else if (X86::VR256RegClass.contains(Reg)) {
4481
0
    // Use vxorps to clear the full ymm register.
4482
0
    // It wants to read and write the xmm sub-register.
4483
0
    unsigned XReg = TRI->getSubReg(Reg, X86::sub_xmm);
4484
0
    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VXORPSrr), XReg)
4485
0
        .addReg(XReg, RegState::Undef)
4486
0
        .addReg(XReg, RegState::Undef)
4487
0
        .addReg(Reg, RegState::ImplicitDefine);
4488
0
    MI.addRegisterKilled(Reg, TRI, true);
4489
213
  } else if (X86::GR64RegClass.contains(Reg)) {
4490
165
    // Using XOR32rr because it has shorter encoding and zeros up the upper bits
4491
165
    // as well.
4492
165
    unsigned XReg = TRI->getSubReg(Reg, X86::sub_32bit);
4493
165
    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::XOR32rr), XReg)
4494
165
        .addReg(XReg, RegState::Undef)
4495
165
        .addReg(XReg, RegState::Undef)
4496
165
        .addReg(Reg, RegState::ImplicitDefine);
4497
165
    MI.addRegisterKilled(Reg, TRI, true);
4498
165
  } else if (X86::GR32RegClass.contains(Reg)) {
4499
48
    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::XOR32rr), Reg)
4500
48
        .addReg(Reg, RegState::Undef)
4501
48
        .addReg(Reg, RegState::Undef);
4502
48
    MI.addRegisterKilled(Reg, TRI, true);
4503
48
  }
4504
709
}
4505
4506
static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs,
4507
18.8k
                        int PtrOffset = 0) {
4508
18.8k
  unsigned NumAddrOps = MOs.size();
4509
18.8k
4510
18.8k
  if (NumAddrOps < 4) {
4511
11.7k
    // FrameIndex only - add an immediate offset (whether it's zero or not).
4512
23.4k
    for (unsigned i = 0; i != NumAddrOps; ++i)
4513
11.7k
      MIB.add(MOs[i]);
4514
11.7k
    addOffset(MIB, PtrOffset);
4515
11.7k
  } else {
4516
7.16k
    // General Memory Addressing - we need to add any offset to an existing
4517
7.16k
    // offset.
4518
7.16k
    assert(MOs.size() == 5 && "Unexpected memory operand list length");
4519
42.9k
    for (unsigned i = 0; i != NumAddrOps; ++i) {
4520
35.8k
      const MachineOperand &MO = MOs[i];
4521
35.8k
      if (i == 3 && PtrOffset != 0) {
4522
26
        MIB.addDisp(MO, PtrOffset);
4523
35.7k
      } else {
4524
35.7k
        MIB.add(MO);
4525
35.7k
      }
4526
35.8k
    }
4527
7.16k
  }
4528
18.8k
}
4529
4530
static void updateOperandRegConstraints(MachineFunction &MF,
4531
                                        MachineInstr &NewMI,
4532
18.5k
                                        const TargetInstrInfo &TII) {
4533
18.5k
  MachineRegisterInfo &MRI = MF.getRegInfo();
4534
18.5k
  const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
4535
18.5k
4536
133k
  for (int Idx : llvm::seq<int>(0, NewMI.getNumOperands())) {
4537
133k
    MachineOperand &MO = NewMI.getOperand(Idx);
4538
133k
    // We only need to update constraints on virtual register operands.
4539
133k
    if (!MO.isReg())
4540
53.8k
      continue;
4541
79.7k
    unsigned Reg = MO.getReg();
4542
79.7k
    if (!TRI.isVirtualRegister(Reg))
4543
53.1k
      continue;
4544
26.5k
4545
26.5k
    auto *NewRC = MRI.constrainRegClass(
4546
26.5k
        Reg, TII.getRegClass(NewMI.getDesc(), Idx, &TRI, MF));
4547
26.5k
    if (!NewRC) {
4548
489
      LLVM_DEBUG(
4549
489
          dbgs() << "WARNING: Unable to update register constraint for operand "
4550
489
                 << Idx << " of instruction:\n";
4551
489
          NewMI.dump(); dbgs() << "\n");
4552
489
    }
4553
26.5k
  }
4554
18.5k
}
4555
4556
static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
4557
                                     ArrayRef<MachineOperand> MOs,
4558
                                     MachineBasicBlock::iterator InsertPt,
4559
                                     MachineInstr &MI,
4560
523
                                     const TargetInstrInfo &TII) {
4561
523
  // Create the base instruction with the memory operand as the first part.
4562
523
  // Omit the implicit operands, something BuildMI can't do.
4563
523
  MachineInstr *NewMI =
4564
523
      MF.CreateMachineInstr(TII.get(Opcode), MI.getDebugLoc(), true);
4565
523
  MachineInstrBuilder MIB(MF, NewMI);
4566
523
  addOperands(MIB, MOs);
4567
523
4568
523
  // Loop over the rest of the ri operands, converting them over.
4569
523
  unsigned NumOps = MI.getDesc().getNumOperands() - 2;
4570
1.01k
  for (unsigned i = 0; i != NumOps; ++i) {
4571
489
    MachineOperand &MO = MI.getOperand(i + 2);
4572
489
    MIB.add(MO);
4573
489
  }
4574
1.28k
  for (unsigned i = NumOps + 2, e = MI.getNumOperands(); i != e; ++i) {
4575
760
    MachineOperand &MO = MI.getOperand(i);
4576
760
    MIB.add(MO);
4577
760
  }
4578
523
4579
523
  updateOperandRegConstraints(MF, *NewMI, TII);
4580
523
4581
523
  MachineBasicBlock *MBB = InsertPt->getParent();
4582
523
  MBB->insert(InsertPt, NewMI);
4583
523
4584
523
  return MIB;
4585
523
}
4586
4587
static MachineInstr *FuseInst(MachineFunction &MF, unsigned Opcode,
4588
                              unsigned OpNo, ArrayRef<MachineOperand> MOs,
4589
                              MachineBasicBlock::iterator InsertPt,
4590
                              MachineInstr &MI, const TargetInstrInfo &TII,
4591
18.0k
                              int PtrOffset = 0) {
4592
18.0k
  // Omit the implicit operands, something BuildMI can't do.
4593
18.0k
  MachineInstr *NewMI =
4594
18.0k
      MF.CreateMachineInstr(TII.get(Opcode), MI.getDebugLoc(), true);
4595
18.0k
  MachineInstrBuilder MIB(MF, NewMI);
4596
18.0k
4597
75.5k
  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
4598
57.4k
    MachineOperand &MO = MI.getOperand(i);
4599
57.4k
    if (i == OpNo) {
4600
18.0k
      assert(MO.isReg() && "Expected to fold into reg operand!");
4601
18.0k
      addOperands(MIB, MOs, PtrOffset);
4602
39.3k
    } else {
4603
39.3k
      MIB.add(MO);
4604
39.3k
    }
4605
57.4k
  }
4606
18.0k
4607
18.0k
  updateOperandRegConstraints(MF, *NewMI, TII);
4608
18.0k
4609
18.0k
  MachineBasicBlock *MBB = InsertPt->getParent();
4610
18.0k
  MBB->insert(InsertPt, NewMI);
4611
18.0k
4612
18.0k
  return MIB;
4613
18.0k
}
4614
4615
static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode,
4616
                                ArrayRef<MachineOperand> MOs,
4617
                                MachineBasicBlock::iterator InsertPt,
4618
288
                                MachineInstr &MI) {
4619
288
  MachineInstrBuilder MIB = BuildMI(*InsertPt->getParent(), InsertPt,
4620
288
                                    MI.getDebugLoc(), TII.get(Opcode));
4621
288
  addOperands(MIB, MOs);
4622
288
  return MIB.addImm(0);
4623
288
}
4624
4625
MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
4626
    MachineFunction &MF, MachineInstr &MI, unsigned OpNum,
4627
    ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
4628
155k
    unsigned Size, unsigned Align) const {
4629
155k
  switch (MI.getOpcode()) {
4630
155k
  case X86::INSERTPSrr:
4631
86
  case X86::VINSERTPSrr:
4632
86
  case X86::VINSERTPSZrr:
4633
86
    // Attempt to convert the load of inserted vector into a fold load
4634
86
    // of a single float.
4635
86
    if (OpNum == 2) {
4636
35
      unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm();
4637
35
      unsigned ZMask = Imm & 15;
4638
35
      unsigned DstIdx = (Imm >> 4) & 3;
4639
35
      unsigned SrcIdx = (Imm >> 6) & 3;
4640
35
4641
35
      const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
4642
35
      const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF);
4643
35
      unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
4644
35
      if ((Size == 0 || Size >= 16) && RCSize >= 16 && 4 <= Align) {
4645
35
        int PtrOffset = SrcIdx * 4;
4646
35
        unsigned NewImm = (DstIdx << 4) | ZMask;
4647
35
        unsigned NewOpCode =
4648
35
            (MI.getOpcode() == X86::VINSERTPSZrr) ? X86::VINSERTPSZrm :
4649
35
            (MI.getOpcode() == X86::VINSERTPSrr) ? X86::VINSERTPSrm :
4650
33
                                                    X86::INSERTPSrm;
4651
35
        MachineInstr *NewMI =
4652
35
            FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, PtrOffset);
4653
35
        NewMI->getOperand(NewMI->getNumOperands() - 1).setImm(NewImm);
4654
35
        return NewMI;
4655
35
      }
4656
51
    }
4657
51
    break;
4658
51
  case X86::MOVHLPSrr:
4659
16
  case X86::VMOVHLPSrr:
4660
16
  case X86::VMOVHLPSZrr:
4661
16
    // Move the upper 64-bits of the second operand to the lower 64-bits.
4662
16
    // To fold the load, adjust the pointer to the upper and use (V)MOVLPS.
4663
16
    // TODO: In most cases AVX doesn't have a 8-byte alignment requirement.
4664
16
    if (OpNum == 2) {
4665
16
      const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
4666
16
      const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF);
4667
16
      unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
4668
16
      if ((Size == 0 || Size >= 16) && RCSize >= 16 && 8 <= Align) {
4669
16
        unsigned NewOpCode =
4670
16
            (MI.getOpcode() == X86::VMOVHLPSZrr) ? X86::VMOVLPSZ128rm :
4671
16
            (MI.getOpcode() == X86::VMOVHLPSrr) ? X86::VMOVLPSrm :
4672
15
                                                   X86::MOVLPSrm;
4673
16
        MachineInstr *NewMI =
4674
16
            FuseInst(MF, NewOpCode, OpNum, MOs, InsertPt, MI, *this, 8);
4675
16
        return NewMI;
4676
16
      }
4677
0
    }
4678
0
    break;
4679
113
  case X86::UNPCKLPDrr:
4680
113
    // If we won't be able to fold this to the memory form of UNPCKL, use
4681
113
    // MOVHPD instead. Done as custom because we can't have this in the load
4682
113
    // table twice.
4683
113
    if (OpNum == 2) {
4684
91
      const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
4685
91
      const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF);
4686
91
      unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
4687
91
      if ((Size == 0 || Size >= 16) && RCSize >= 16 && Align < 16) {
4688
1
        MachineInstr *NewMI =
4689
1
            FuseInst(MF, X86::MOVHPDrm, OpNum, MOs, InsertPt, MI, *this);
4690
1
        return NewMI;
4691
1
      }
4692
112
    }
4693
112
    break;
4694
155k
  }
4695
155k
4696
155k
  return nullptr;
4697
155k
}
4698
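Aside (not part of the coverage listing): the INSERTPS custom fold above rewrites the instruction's immediate so the memory form only reads a single float, advancing the load address instead of selecting a source element. A minimal standalone sketch of that bit manipulation, mirroring the ZMask/DstIdx/SrcIdx arithmetic in the listing; the names rewriteInsertPSImm and InsertPSFold are hypothetical and do not exist in LLVM:

// Standalone illustration of the immediate rewrite used for the
// INSERTPSrr -> INSERTPSrm fold; illustrative names only.
#include <cstdio>

struct InsertPSFold {
  int PtrOffset;   // byte offset added to the folded load address
  unsigned NewImm; // immediate used by the memory form
};

static InsertPSFold rewriteInsertPSImm(unsigned Imm) {
  unsigned ZMask = Imm & 15;        // bits 0-3: zero mask, kept as-is
  unsigned DstIdx = (Imm >> 4) & 3; // bits 4-5: destination element
  unsigned SrcIdx = (Imm >> 6) & 3; // bits 6-7: element taken from the load
  // The memory form always inserts element 0 of the loaded value, so the
  // source-select bits are dropped and the pointer is advanced instead.
  return {static_cast<int>(SrcIdx * 4), (DstIdx << 4) | ZMask};
}

int main() {
  InsertPSFold F = rewriteInsertPSImm(0xE0); // SrcIdx=3, DstIdx=2, ZMask=0
  std::printf("offset=%d imm=0x%X\n", F.PtrOffset, F.NewImm); // offset=12 imm=0x20
  return 0;
}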
4699
static bool shouldPreventUndefRegUpdateMemFold(MachineFunction &MF,
4700
305k
                                               MachineInstr &MI) {
4701
305k
  if (!hasUndefRegUpdate(MI.getOpcode(), /*ForLoadFold*/true) ||
4702
305k
      !MI.getOperand(1).isReg())
4703
305k
    return false;
4704
123
4705
123
  // There are two cases we need to handle depending on where in the pipeline
4706
123
  // the folding attempt is being made.
4707
123
  // -Register has the undef flag set.
4708
123
  // -Register is produced by the IMPLICIT_DEF instruction.
4709
123
4710
123
  if (MI.getOperand(1).isUndef())
4711
2
    return true;
4712
121
4713
121
  MachineRegisterInfo &RegInfo = MF.getRegInfo();
4714
121
  MachineInstr *VRegDef = RegInfo.getUniqueVRegDef(MI.getOperand(1).getReg());
4715
121
  return VRegDef && VRegDef->isImplicitDef();
4716
121
}
4717
4718
4719
MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
4720
    MachineFunction &MF, MachineInstr &MI, unsigned OpNum,
4721
    ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
4722
155k
    unsigned Size, unsigned Align, bool AllowCommute) const {
4723
155k
  bool isSlowTwoMemOps = Subtarget.slowTwoMemOps();
4724
155k
  bool isTwoAddrFold = false;
4725
155k
4726
155k
  // For CPUs that favor the register form of a call or push,
4727
155k
  // do not fold loads into calls or pushes, unless optimizing for size
4728
155k
  // aggressively.
4729
155k
  if (isSlowTwoMemOps && !MF.getFunction().hasMinSize() &&
4730
155k
      (MI.getOpcode() == X86::CALL32r || MI.getOpcode() == X86::CALL64r ||
4731
176
       MI.getOpcode() == X86::PUSH16r || MI.getOpcode() == X86::PUSH32r ||
4732
176
       MI.getOpcode() == X86::PUSH64r))
4733
10
    return nullptr;
4734
155k
4735
155k
  // Avoid partial and undef register update stalls unless optimizing for size.
4736
155k
  if (!MF.getFunction().hasOptSize() &&
4737
155k
      (hasPartialRegUpdate(MI.getOpcode(), Subtarget, /*ForLoadFold*/true) ||
4738
153k
       shouldPreventUndefRegUpdateMemFold(MF, MI)))
4739
10
    return nullptr;
4740
155k
4741
155k
  unsigned NumOps = MI.getDesc().getNumOperands();
4742
155k
  bool isTwoAddr =
4743
155k
      NumOps > 1 && MI.getDesc().getOperandConstraint(1, MCOI::TIED_TO) != -1;
4744
155k
4745
155k
  // FIXME: AsmPrinter doesn't know how to handle
4746
155k
  // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding.
4747
155k
  if (MI.getOpcode() == X86::ADD32ri &&
4748
155k
      MI.getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS)
4749
0
    return nullptr;
4750
155k
4751
155k
  // GOTTPOFF relocation loads can only be folded into add instructions.
4752
155k
  // FIXME: Need to exclude other relocations that only support specific
4753
155k
  // instructions.
4754
155k
  if (MOs.size() == X86::AddrNumOperands &&
4755
155k
      MOs[X86::AddrDisp].getTargetFlags() == X86II::MO_GOTTPOFF &&
4756
155k
      MI.getOpcode() != X86::ADD64rr)
4757
2
    return nullptr;
4758
155k
4759
155k
  MachineInstr *NewMI = nullptr;
4760
155k
4761
155k
  // Attempt to fold any custom cases we have.
4762
155k
  if (MachineInstr *CustomMI =
4763
52
          foldMemoryOperandCustom(MF, MI, OpNum, MOs, InsertPt, Size, Align))
4764
52
    return CustomMI;
4765
155k
4766
155k
  const X86MemoryFoldTableEntry *I = nullptr;
4767
155k
4768
155k
  // Folding a memory location into the two-address part of a two-address
4769
155k
  // instruction is different than folding it other places.  It requires
4770
155k
  // replacing the *two* registers with the memory location.
4771
155k
  if (isTwoAddr && NumOps >= 2 && OpNum < 2 && MI.getOperand(0).isReg() &&
4772
155k
      MI.getOperand(1).isReg() &&
4773
155k
      MI.getOperand(0).getReg() == MI.getOperand(1).getReg()) {
4774
839
    I = lookupTwoAddrFoldTable(MI.getOpcode());
4775
839
    isTwoAddrFold = true;
4776
154k
  } else {
4777
154k
    if (OpNum == 0) {
4778
42.3k
      if (MI.getOpcode() == X86::MOV32r0) {
4779
288
        NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, InsertPt, MI);
4780
288
        if (NewMI)
4781
288
          return NewMI;
4782
154k
      }
4783
42.3k
    }
4784
154k
4785
154k
    I = lookupFoldTable(MI.getOpcode(), OpNum);
4786
154k
  }
4787
155k
4788
155k
  if (I != nullptr) {
4789
19.2k
    unsigned Opcode = I->DstOp;
4790
19.2k
    unsigned MinAlign = (I->Flags & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT;
4791
19.2k
    if (Align < MinAlign)
4792
747
      return nullptr;
4793
18.5k
    bool NarrowToMOV32rm = false;
4794
18.5k
    if (Size) {
4795
11.7k
      const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
4796
11.7k
      const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum,
4797
11.7k
                                                  &RI, MF);
4798
11.7k
      unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
4799
11.7k
      if (Size < RCSize) {
4800
2
        // FIXME: Allow scalar intrinsic instructions like ADDSSrm_Int.
4801
2
        // Check if it's safe to fold the load. If the size of the object is
4802
2
        // narrower than the load width, then it's not.
4803
2
        if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4)
4804
2
          return nullptr;
4805
0
        // If this is a 64-bit load, but the spill slot is 32, then we can do
4806
0
        // a 32-bit load which is implicitly zero-extended. This likely is
4807
0
        // due to live interval analysis remat'ing a load from stack slot.
4808
0
        if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
4809
0
          return nullptr;
4810
0
        Opcode = X86::MOV32rm;
4811
0
        NarrowToMOV32rm = true;
4812
0
      }
4813
11.7k
    }
4814
18.5k
4815
18.5k
    if (isTwoAddrFold)
4816
523
      NewMI = FuseTwoAddrInst(MF, Opcode, MOs, InsertPt, MI, *this);
4817
18.0k
    else
4818
18.0k
      NewMI = FuseInst(MF, Opcode, OpNum, MOs, InsertPt, MI, *this);
4819
18.5k
4820
18.5k
    if (NarrowToMOV32rm) {
4821
0
      // If this is the special case where we use a MOV32rm to load a 32-bit
4822
0
      // value and zero-extend the top bits. Change the destination register
4823
0
      // to a 32-bit one.
4824
0
      unsigned DstReg = NewMI->getOperand(0).getReg();
4825
0
      if (TargetRegisterInfo::isPhysicalRegister(DstReg))
4826
0
        NewMI->getOperand(0).setReg(RI.getSubReg(DstReg, X86::sub_32bit));
4827
0
      else
4828
0
        NewMI->getOperand(0).setSubReg(X86::sub_32bit);
4829
0
    }
4830
18.5k
    return NewMI;
4831
136k
  }
4832
136k
4833
136k
  // If the instruction and target operand are commutable, commute the
4834
136k
  // instruction and try again.
4835
136k
  if (AllowCommute) {
4836
135k
    unsigned CommuteOpIdx1 = OpNum, CommuteOpIdx2 = CommuteAnyOperandIndex;
4837
135k
    if (findCommutedOpIndices(MI, CommuteOpIdx1, CommuteOpIdx2)) {
4838
1.95k
      bool HasDef = MI.getDesc().getNumDefs();
4839
1.95k
      Register Reg0 = HasDef ? MI.getOperand(0).getReg() : Register();
4840
1.95k
      Register Reg1 = MI.getOperand(CommuteOpIdx1).getReg();
4841
1.95k
      Register Reg2 = MI.getOperand(CommuteOpIdx2).getReg();
4842
1.95k
      bool Tied1 =
4843
1.95k
          0 == MI.getDesc().getOperandConstraint(CommuteOpIdx1, MCOI::TIED_TO);
4844
1.95k
      bool Tied2 =
4845
1.95k
          0 == MI.getDesc().getOperandConstraint(CommuteOpIdx2, MCOI::TIED_TO);
4846
1.95k
4847
1.95k
      // If either of the commutable operands are tied to the destination
4848
1.95k
      // then we can not commute + fold.
4849
1.95k
      if ((HasDef && Reg0 == Reg1 && Tied1) ||
4850
1.95k
          (HasDef && Reg0 == Reg2 && Tied2))
4851
3
        return nullptr;
4852
1.95k
4853
1.95k
      MachineInstr *CommutedMI =
4854
1.95k
          commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2);
4855
1.95k
      if (!CommutedMI) {
4856
39
        // Unable to commute.
4857
39
        return nullptr;
4858
39
      }
4859
1.91k
      if (CommutedMI != &MI) {
4860
0
        // New instruction. We can't fold from this.
4861
0
        CommutedMI->eraseFromParent();
4862
0
        return nullptr;
4863
0
      }
4864
1.91k
4865
1.91k
      // Attempt to fold with the commuted version of the instruction.
4866
1.91k
      NewMI = foldMemoryOperandImpl(MF, MI, CommuteOpIdx2, MOs, InsertPt,
4867
1.91k
                                    Size, Align, /*AllowCommute=*/false);
4868
1.91k
      if (NewMI)
4869
480
        return NewMI;
4870
1.43k
4871
1.43k
      // Folding failed again - undo the commute before returning.
4872
1.43k
      MachineInstr *UncommutedMI =
4873
1.43k
          commuteInstruction(MI, false, CommuteOpIdx1, CommuteOpIdx2);
4874
1.43k
      if (!UncommutedMI) {
4875
0
        // Unable to commute.
4876
0
        return nullptr;
4877
0
      }
4878
1.43k
      if (UncommutedMI != &MI) {
4879
0
        // New instruction. It doesn't need to be kept.
4880
0
        UncommutedMI->eraseFromParent();
4881
0
        return nullptr;
4882
0
      }
4883
1.43k
4884
1.43k
      // Return here to prevent duplicate fuse failure report.
4885
1.43k
      return nullptr;
4886
1.43k
    }
4887
135k
  }
4888
134k
4889
134k
  // No fusion
4890
134k
  if (PrintFailedFusing && !MI.isCopy())
4891
1
    dbgs() << "We failed to fuse operand " << OpNum << " in " << MI;
4892
134k
  return nullptr;
4893
134k
}
4894
4895
MachineInstr *
4896
X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
4897
                                    ArrayRef<unsigned> Ops,
4898
                                    MachineBasicBlock::iterator InsertPt,
4899
                                    int FrameIndex, LiveIntervals *LIS,
4900
79.7k
                                    VirtRegMap *VRM) const {
4901
79.7k
  // Check switch flag
4902
79.7k
  if (NoFusing)
4903
0
    return nullptr;
4904
79.7k
4905
79.7k
  // Avoid partial and undef register update stalls unless optimizing for size.
4906
79.7k
  if (!MF.getFunction().hasOptSize() &&
4907
79.7k
      (hasPartialRegUpdate(MI.getOpcode(), Subtarget, /*ForLoadFold*/true) ||
4908
78.0k
       shouldPreventUndefRegUpdateMemFold(MF, MI)))
4909
52
    return nullptr;
4910
79.6k
4911
79.6k
  // Don't fold subreg spills, or reloads that use a high subreg.
4912
80.5k
  for (auto Op : Ops) {
4913
80.5k
    MachineOperand &MO = MI.getOperand(Op);
4914
80.5k
    auto SubReg = MO.getSubReg();
4915
80.5k
    if (SubReg && (MO.isDef() || SubReg == X86::sub_8bit_hi))
4916
613
      return nullptr;
4917
80.5k
  }
4918
79.6k
4919
79.6k
  const MachineFrameInfo &MFI = MF.getFrameInfo();
4920
79.0k
  unsigned Size = MFI.getObjectSize(FrameIndex);
4921
79.0k
  unsigned Alignment = MFI.getObjectAlignment(FrameIndex);
4922
79.0k
  // If the function stack isn't realigned we don't want to fold instructions
4923
79.0k
  // that need increased alignment.
4924
79.0k
  if (!RI.needsStackRealignment(MF))
4925
76.7k
    Alignment =
4926
76.7k
        std::min(Alignment, Subtarget.getFrameLowering()->getStackAlignment());
4927
79.0k
  if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
4928
702
    unsigned NewOpc = 0;
4929
702
    unsigned RCSize = 0;
4930
702
    switch (MI.getOpcode()) {
4931
702
    default: return nullptr;
4932
702
    case X86::TEST8rr: NewOpc = X86::CMP8ri; RCSize = 1; break;
4933
702
    case X86::TEST16rr: NewOpc = X86::CMP16ri8; RCSize = 2; break;
4934
702
    case X86::TEST32rr: NewOpc = X86::CMP32ri8; RCSize = 4; break;
4935
702
    case X86::TEST64rr: NewOpc = X86::CMP64ri8; RCSize = 8; break;
4936
623
    }
4937
623
    // Check if it's safe to fold the load. If the size of the object is
4938
623
    // narrower than the load width, then it's not.
4939
623
    if (Size < RCSize)
4940
0
      return nullptr;
4941
623
    // Change to CMPXXri r, 0 first.
4942
623
    MI.setDesc(get(NewOpc));
4943
623
    MI.getOperand(1).ChangeToImmediate(0);
4944
78.3k
  } else if (Ops.size() != 1)