Coverage Report

Created: 2019-07-24 05:18

/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
Line
Count
Source (jump to first uncovered line)
1
//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
// This file defines a DAG pattern matching instruction selector for X86,
10
// converting from a legalized dag to a X86 dag.
11
//
12
//===----------------------------------------------------------------------===//
13
14
#include "X86.h"
15
#include "X86MachineFunctionInfo.h"
16
#include "X86RegisterInfo.h"
17
#include "X86Subtarget.h"
18
#include "X86TargetMachine.h"
19
#include "llvm/ADT/Statistic.h"
20
#include "llvm/CodeGen/MachineFrameInfo.h"
21
#include "llvm/CodeGen/MachineFunction.h"
22
#include "llvm/CodeGen/SelectionDAGISel.h"
23
#include "llvm/Config/llvm-config.h"
24
#include "llvm/IR/ConstantRange.h"
25
#include "llvm/IR/Function.h"
26
#include "llvm/IR/Instructions.h"
27
#include "llvm/IR/Intrinsics.h"
28
#include "llvm/IR/Type.h"
29
#include "llvm/Support/Debug.h"
30
#include "llvm/Support/ErrorHandling.h"
31
#include "llvm/Support/KnownBits.h"
32
#include "llvm/Support/MathExtras.h"
33
#include "llvm/Support/raw_ostream.h"
34
#include "llvm/Target/TargetMachine.h"
35
#include "llvm/Target/TargetOptions.h"
36
#include <stdint.h>
37
using namespace llvm;
38
39
#define DEBUG_TYPE "x86-isel"
40
41
STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");
42
43
static cl::opt<bool> AndImmShrink("x86-and-imm-shrink", cl::init(true),
44
    cl::desc("Enable setting constant bits to reduce size of mask immediates"),
45
    cl::Hidden);
46
47
//===----------------------------------------------------------------------===//
48
//                      Pattern Matcher Implementation
49
//===----------------------------------------------------------------------===//
50
51
namespace {
52
  /// This corresponds to X86AddressMode, but uses SDValue's instead of register
53
  /// numbers for the leaves of the matched tree.
54
  struct X86ISelAddressMode {
55
    enum {
56
      RegBase,
57
      FrameIndexBase
58
    } BaseType;
59
60
    // This is really a union, discriminated by BaseType!
61
    SDValue Base_Reg;
62
    int Base_FrameIndex;
63
64
    unsigned Scale;
65
    SDValue IndexReg;
66
    int32_t Disp;
67
    SDValue Segment;
68
    const GlobalValue *GV;
69
    const Constant *CP;
70
    const BlockAddress *BlockAddr;
71
    const char *ES;
72
    MCSymbol *MCSym;
73
    int JT;
74
    unsigned Align;    // CP alignment.
75
    unsigned char SymbolFlags;  // X86II::MO_*
76
    bool NegateIndex = false;
77
78
    X86ISelAddressMode()
79
        : BaseType(RegBase), Base_FrameIndex(0), Scale(1), IndexReg(), Disp(0),
80
          Segment(), GV(nullptr), CP(nullptr), BlockAddr(nullptr), ES(nullptr),
81
677k
          MCSym(nullptr), JT(-1), Align(0), SymbolFlags(X86II::MO_NO_FLAG) {}
82
83
576k
    bool hasSymbolicDisplacement() const {
84
576k
      return GV != nullptr || 
CP != nullptr524k
||
ES != nullptr499k
||
85
576k
             
MCSym != nullptr499k
||
JT != -1499k
||
BlockAddr != nullptr499k
;
86
576k
    }
87
88
74.8k
    bool hasBaseOrIndexReg() const {
89
74.8k
      return BaseType == FrameIndexBase ||
90
74.8k
             
IndexReg.getNode() != nullptr74.8k
||
Base_Reg.getNode() != nullptr74.1k
;
91
74.8k
    }
92
93
    /// Return true if this addressing mode is already RIP-relative.
94
1.54M
    bool isRIPRelative() const {
95
1.54M
      if (BaseType != RegBase) 
return false53.6k
;
96
1.49M
      if (RegisterSDNode *RegNode =
97
5.94k
            dyn_cast_or_null<RegisterSDNode>(Base_Reg.getNode()))
98
5.94k
        return RegNode->getReg() == X86::RIP;
99
1.48M
      return false;
100
1.48M
    }
101
102
72.4k
    void setBaseReg(SDValue Reg) {
103
72.4k
      BaseType = RegBase;
104
72.4k
      Base_Reg = Reg;
105
72.4k
    }
106
107
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
108
    void dump(SelectionDAG *DAG = nullptr) {
109
      dbgs() << "X86ISelAddressMode " << this << '\n';
110
      dbgs() << "Base_Reg ";
111
      if (Base_Reg.getNode())
112
        Base_Reg.getNode()->dump(DAG);
113
      else
114
        dbgs() << "nul\n";
115
      if (BaseType == FrameIndexBase)
116
        dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n';
117
      dbgs() << " Scale " << Scale << '\n'
118
             << "IndexReg ";
119
      if (NegateIndex)
120
        dbgs() << "negate ";
121
      if (IndexReg.getNode())
122
        IndexReg.getNode()->dump(DAG);
123
      else
124
        dbgs() << "nul\n";
125
      dbgs() << " Disp " << Disp << '\n'
126
             << "GV ";
127
      if (GV)
128
        GV->dump();
129
      else
130
        dbgs() << "nul";
131
      dbgs() << " CP ";
132
      if (CP)
133
        CP->dump();
134
      else
135
        dbgs() << "nul";
136
      dbgs() << '\n'
137
             << "ES ";
138
      if (ES)
139
        dbgs() << ES;
140
      else
141
        dbgs() << "nul";
142
      dbgs() << " MCSym ";
143
      if (MCSym)
144
        dbgs() << MCSym;
145
      else
146
        dbgs() << "nul";
147
      dbgs() << " JT" << JT << " Align" << Align << '\n';
148
    }
149
#endif
150
  };
151
}
152
153
namespace {
154
  //===--------------------------------------------------------------------===//
155
  /// ISel - X86-specific code to select X86 machine instructions for
156
  /// SelectionDAG operations.
157
  ///
158
  class X86DAGToDAGISel final : public SelectionDAGISel {
159
    /// Keep a pointer to the X86Subtarget around so that we can
160
    /// make the right decision when generating code for different targets.
161
    const X86Subtarget *Subtarget;
162
163
    /// If true, selector should try to optimize for code size instead of
164
    /// performance.
165
    bool OptForSize;
166
167
    /// If true, selector should try to optimize for minimum code size.
168
    bool OptForMinSize;
169
170
    /// Disable direct TLS access through segment registers.
171
    bool IndirectTlsSegRefs;
172
173
  public:
174
    explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel)
175
        : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr), OptForSize(false),
176
12.1k
          OptForMinSize(false), IndirectTlsSegRefs(false) {}
177
178
137k
    StringRef getPassName() const override {
179
137k
      return "X86 DAG->DAG Instruction Selection";
180
137k
    }
181
182
137k
    bool runOnMachineFunction(MachineFunction &MF) override {
183
137k
      // Reset the subtarget each time through.
184
137k
      Subtarget = &MF.getSubtarget<X86Subtarget>();
185
137k
      IndirectTlsSegRefs = MF.getFunction().hasFnAttribute(
186
137k
                             "indirect-tls-seg-refs");
187
137k
188
137k
      // OptFor[Min]Size are used in pattern predicates that isel is matching.
189
137k
      OptForSize = MF.getFunction().hasOptSize();
190
137k
      OptForMinSize = MF.getFunction().hasMinSize();
191
137k
      assert((!OptForMinSize || OptForSize) &&
192
137k
             "OptForMinSize implies OptForSize");
193
137k
194
137k
      SelectionDAGISel::runOnMachineFunction(MF);
195
137k
      return true;
196
137k
    }
197
198
    void EmitFunctionEntryCode() override;
199
200
    bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;
201
202
    void PreprocessISelDAG() override;
203
    void PostprocessISelDAG() override;
204
205
// Include the pieces autogenerated from the target description.
206
#include "X86GenDAGISel.inc"
207
208
  private:
209
    void Select(SDNode *N) override;
210
211
    bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
212
    bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM);
213
    bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
214
    bool matchAddress(SDValue N, X86ISelAddressMode &AM);
215
    bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM);
216
    bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth);
217
    bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
218
                                 unsigned Depth);
219
    bool matchAddressBase(SDValue N, X86ISelAddressMode &AM);
220
    bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
221
                    SDValue &Scale, SDValue &Index, SDValue &Disp,
222
                    SDValue &Segment);
223
    bool selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
224
                          SDValue &Scale, SDValue &Index, SDValue &Disp,
225
                          SDValue &Segment);
226
    bool selectMOV64Imm32(SDValue N, SDValue &Imm);
227
    bool selectLEAAddr(SDValue N, SDValue &Base,
228
                       SDValue &Scale, SDValue &Index, SDValue &Disp,
229
                       SDValue &Segment);
230
    bool selectLEA64_32Addr(SDValue N, SDValue &Base,
231
                            SDValue &Scale, SDValue &Index, SDValue &Disp,
232
                            SDValue &Segment);
233
    bool selectTLSADDRAddr(SDValue N, SDValue &Base,
234
                           SDValue &Scale, SDValue &Index, SDValue &Disp,
235
                           SDValue &Segment);
236
    bool selectScalarSSELoad(SDNode *Root, SDNode *Parent, SDValue N,
237
                             SDValue &Base, SDValue &Scale,
238
                             SDValue &Index, SDValue &Disp,
239
                             SDValue &Segment,
240
                             SDValue &NodeWithChain);
241
    bool selectRelocImm(SDValue N, SDValue &Op);
242
243
    bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
244
                     SDValue &Base, SDValue &Scale,
245
                     SDValue &Index, SDValue &Disp,
246
                     SDValue &Segment);
247
248
    // Convenience method where P is also root.
249
    bool tryFoldLoad(SDNode *P, SDValue N,
250
                     SDValue &Base, SDValue &Scale,
251
                     SDValue &Index, SDValue &Disp,
252
6.34k
                     SDValue &Segment) {
253
6.34k
      return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
254
6.34k
    }
255
256
    /// Implement addressing mode selection for inline asm expressions.
257
    bool SelectInlineAsmMemoryOperand(const SDValue &Op,
258
                                      unsigned ConstraintID,
259
                                      std::vector<SDValue> &OutOps) override;
260
261
    void emitSpecialCodeForMain();
262
263
    inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL,
264
                                   MVT VT, SDValue &Base, SDValue &Scale,
265
                                   SDValue &Index, SDValue &Disp,
266
579k
                                   SDValue &Segment) {
267
579k
      if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
268
148k
        Base = CurDAG->getTargetFrameIndex(
269
148k
            AM.Base_FrameIndex, TLI->getPointerTy(CurDAG->getDataLayout()));
270
431k
      else if (AM.Base_Reg.getNode())
271
423k
        Base = AM.Base_Reg;
272
8.22k
      else
273
8.22k
        Base = CurDAG->getRegister(0, VT);
274
579k
275
579k
      Scale = getI8Imm(AM.Scale, DL);
276
579k
277
579k
      // Negate the index if needed.
278
579k
      if (AM.NegateIndex) {
279
125
        unsigned NegOpc = VT == MVT::i64 ? 
X86::NEG64r113
:
X86::NEG32r12
;
280
125
        SDValue Neg = SDValue(CurDAG->getMachineNode(NegOpc, DL, VT, MVT::i32,
281
125
                                                     AM.IndexReg), 0);
282
125
        AM.IndexReg = Neg;
283
125
      }
284
579k
285
579k
      if (AM.IndexReg.getNode())
286
54.5k
        Index = AM.IndexReg;
287
524k
      else
288
524k
        Index = CurDAG->getRegister(0, VT);
289
579k
290
579k
      // These are 32-bit even in 64-bit mode since RIP-relative offset
291
579k
      // is 32-bit.
292
579k
      if (AM.GV)
293
85.2k
        Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(),
294
85.2k
                                              MVT::i32, AM.Disp,
295
85.2k
                                              AM.SymbolFlags);
296
494k
      else if (AM.CP)
297
31.5k
        Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32,
298
31.5k
                                             AM.Align, AM.Disp, AM.SymbolFlags);
299
462k
      else if (AM.ES) {
300
96
        assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
301
96
        Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
302
462k
      } else if (AM.MCSym) {
303
12
        assert(!AM.Disp && "Non-zero displacement is ignored with MCSym.");
304
12
        assert(AM.SymbolFlags == 0 && "oo");
305
12
        Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32);
306
462k
      } else if (AM.JT != -1) {
307
327
        assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
308
327
        Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
309
462k
      } else if (AM.BlockAddr)
310
17
        Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
311
17
                                             AM.SymbolFlags);
312
462k
      else
313
462k
        Disp = CurDAG->getTargetConstant(AM.Disp, DL, MVT::i32);
314
579k
315
579k
      if (AM.Segment.getNode())
316
928
        Segment = AM.Segment;
317
578k
      else
318
578k
        Segment = CurDAG->getRegister(0, MVT::i16);
319
579k
    }
320
321
    // Utility function to determine whether we should avoid selecting
322
    // immediate forms of instructions for better code size or not.
323
    // At a high level, we'd like to avoid such instructions when
324
    // we have similar constants used within the same basic block
325
    // that can be kept in a register.
326
    //
327
513k
    bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const {
328
513k
      uint32_t UseCount = 0;
329
513k
330
513k
      // Do not want to hoist if we're not optimizing for size.
331
513k
      // TODO: We'd like to remove this restriction.
332
513k
      // See the comment in X86InstrInfo.td for more info.
333
513k
      if (!OptForSize)
334
503k
        return false;
335
9.66k
336
9.66k
      // Walk all the users of the immediate.
337
9.66k
      for (SDNode::use_iterator UI = N->use_begin(),
338
22.7k
           UE = N->use_end(); (UI != UE) && 
(UseCount < 2)13.9k
;
++UI13.0k
) {
339
13.0k
340
13.0k
        SDNode *User = *UI;
341
13.0k
342
13.0k
        // This user is already selected. Count it as a legitimate use and
343
13.0k
        // move on.
344
13.0k
        if (User->isMachineOpcode()) {
345
982
          UseCount++;
346
982
          continue;
347
982
        }
348
12.1k
349
12.1k
        // We want to count stores of immediates as real uses.
350
12.1k
        if (User->getOpcode() == ISD::STORE &&
351
12.1k
            
User->getOperand(1).getNode() == N2.33k
) {
352
2.30k
          UseCount++;
353
2.30k
          continue;
354
2.30k
        }
355
9.80k
356
9.80k
        // We don't currently match users that have > 2 operands (except
357
9.80k
        // for stores, which are handled above)
358
9.80k
        // Those instruction won't match in ISEL, for now, and would
359
9.80k
        // be counted incorrectly.
360
9.80k
        // This may change in the future as we add additional instruction
361
9.80k
        // types.
362
9.80k
        if (User->getNumOperands() != 2)
363
1.67k
          continue;
364
8.12k
365
8.12k
        // Immediates that are used for offsets as part of stack
366
8.12k
        // manipulation should be left alone. These are typically
367
8.12k
        // used to indicate SP offsets for argument passing and
368
8.12k
        // will get pulled into stores/pushes (implicitly).
369
8.12k
        if (User->getOpcode() == X86ISD::ADD ||
370
8.12k
            
User->getOpcode() == ISD::ADD7.65k
||
371
8.12k
            
User->getOpcode() == X86ISD::SUB6.60k
||
372
8.12k
            
User->getOpcode() == ISD::SUB3.89k
) {
373
4.25k
374
4.25k
          // Find the other operand of the add/sub.
375
4.25k
          SDValue OtherOp = User->getOperand(0);
376
4.25k
          if (OtherOp.getNode() == N)
377
633
            OtherOp = User->getOperand(1);
378
4.25k
379
4.25k
          // Don't count if the other operand is SP.
380
4.25k
          RegisterSDNode *RegNode;
381
4.25k
          if (OtherOp->getOpcode() == ISD::CopyFromReg &&
382
4.25k
              (RegNode = dyn_cast_or_null<RegisterSDNode>(
383
1.26k
                 OtherOp->getOperand(1).getNode())))
384
1.26k
            if ((RegNode->getReg() == X86::ESP) ||
385
1.26k
                
(RegNode->getReg() == X86::RSP)1.21k
)
386
49
              continue;
387
8.07k
        }
388
8.07k
389
8.07k
        // ... otherwise, count this and move on.
390
8.07k
        UseCount++;
391
8.07k
      }
392
9.66k
393
9.66k
      // If we have more than 1 use, then recommend for hoisting.
394
9.66k
      return (UseCount > 1);
395
9.66k
    }
396
397
    /// Return a target constant with the specified value of type i8.
398
598k
    inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) {
399
598k
      return CurDAG->getTargetConstant(Imm, DL, MVT::i8);
400
598k
    }
401
402
    /// Return a target constant with the specified value, of type i32.
403
6.27k
    inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
404
6.27k
      return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
405
6.27k
    }
406
407
    /// Return a target constant with the specified value, of type i64.
408
56
    inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) {
409
56
      return CurDAG->getTargetConstant(Imm, DL, MVT::i64);
410
56
    }
411
412
    SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
413
9.89k
                                        const SDLoc &DL) {
414
9.89k
      assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
415
9.89k
      uint64_t Index = N->getConstantOperandVal(1);
416
9.89k
      MVT VecVT = N->getOperand(0).getSimpleValueType();
417
9.89k
      return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
418
9.89k
    }
419
420
    SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth,
421
4.74k
                                      const SDLoc &DL) {
422
4.74k
      assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
423
4.74k
      uint64_t Index = N->getConstantOperandVal(2);
424
4.74k
      MVT VecVT = N->getSimpleValueType(0);
425
4.74k
      return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
426
4.74k
    }
427
428
    // Helper to detect unneeded and instructions on shift amounts. Called
429
    // from PatFrags in tablegen.
430
11.4k
    bool isUnneededShiftMask(SDNode *N, unsigned Width) const {
431
11.4k
      assert(N->getOpcode() == ISD::AND && "Unexpected opcode");
432
11.4k
      const APInt &Val = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue();
433
11.4k
434
11.4k
      if (Val.countTrailingOnes() >= Width)
435
4.45k
        return true;
436
7.04k
437
7.04k
      APInt Mask = Val | CurDAG->computeKnownBits(N->getOperand(0)).Zero;
438
7.04k
      return Mask.countTrailingOnes() >= Width;
439
7.04k
    }
440
441
    /// Return an SDNode that returns the value of the global base register.
442
    /// Output instructions required to initialize the global base register,
443
    /// if necessary.
444
    SDNode *getGlobalBaseReg();
445
446
    /// Return a reference to the TargetMachine, casted to the target-specific
447
    /// type.
448
594
    const X86TargetMachine &getTargetMachine() const {
449
594
      return static_cast<const X86TargetMachine &>(TM);
450
594
    }
451
452
    /// Return a reference to the TargetInstrInfo, casted to the target-specific
453
    /// type.
454
8.72k
    const X86InstrInfo *getInstrInfo() const {
455
8.72k
      return Subtarget->getInstrInfo();
456
8.72k
    }
457
458
    /// Address-mode matching performs shift-of-and to and-of-shift
459
    /// reassociation in order to expose more scaled addressing
460
    /// opportunities.
461
902k
    bool ComplexPatternFuncMutatesDAG() const override {
462
902k
      return true;
463
902k
    }
464
465
    bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const;
466
467
    /// Returns whether this is a relocatable immediate in the range
468
    /// [-2^Width .. 2^Width-1].
469
197k
    template <unsigned Width> bool isSExtRelocImm(SDNode *N) const {
470
197k
      if (auto *CN = dyn_cast<ConstantSDNode>(N))
471
22.5k
        return isInt<Width>(CN->getSExtValue());
472
174k
      return isSExtAbsoluteSymbolRef(Width, N);
473
174k
    }
X86ISelDAGToDAG.cpp:bool (anonymous namespace)::X86DAGToDAGISel::isSExtRelocImm<32u>(llvm::SDNode*) const
Line
Count
Source
469
161k
    template <unsigned Width> bool isSExtRelocImm(SDNode *N) const {
470
161k
      if (auto *CN = dyn_cast<ConstantSDNode>(N))
471
21.8k
        return isInt<Width>(CN->getSExtValue());
472
139k
      return isSExtAbsoluteSymbolRef(Width, N);
473
139k
    }
X86ISelDAGToDAG.cpp:bool (anonymous namespace)::X86DAGToDAGISel::isSExtRelocImm<8u>(llvm::SDNode*) const
Line
Count
Source
469
36.2k
    template <unsigned Width> bool isSExtRelocImm(SDNode *N) const {
470
36.2k
      if (auto *CN = dyn_cast<ConstantSDNode>(N))
471
759
        return isInt<Width>(CN->getSExtValue());
472
35.5k
      return isSExtAbsoluteSymbolRef(Width, N);
473
35.5k
    }
474
475
    // Indicates we should prefer to use a non-temporal load for this load.
476
124k
    bool useNonTemporalLoad(LoadSDNode *N) const {
477
124k
      if (!N->isNonTemporal())
478
124k
        return false;
479
582
480
582
      unsigned StoreSize = N->getMemoryVT().getStoreSize();
481
582
482
582
      if (N->getAlignment() < StoreSize)
483
0
        return false;
484
582
485
582
      switch (StoreSize) {
486
582
      
default: 0
llvm_unreachable0
("Unsupported store size");
487
582
      case 4:
488
14
      case 8:
489
14
        return false;
490
432
      case 16:
491
432
        return Subtarget->hasSSE41();
492
100
      case 32:
493
100
        return Subtarget->hasAVX2();
494
37
      case 64:
495
37
        return Subtarget->hasAVX512();
496
582
      }
497
582
    }
498
499
    bool foldLoadStoreIntoMemOperand(SDNode *Node);
500
    MachineSDNode *matchBEXTRFromAndImm(SDNode *Node);
501
    bool matchBitExtract(SDNode *Node);
502
    bool shrinkAndImmediate(SDNode *N);
503
    bool isMaskZeroExtended(SDNode *N) const;
504
    bool tryShiftAmountMod(SDNode *N);
505
    bool tryShrinkShlLogicImm(SDNode *N);
506
    bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
507
508
    MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
509
                                const SDLoc &dl, MVT VT, SDNode *Node);
510
    MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad,
511
                                const SDLoc &dl, MVT VT, SDNode *Node,
512
                                SDValue &InFlag);
513
514
    bool tryOptimizeRem8Extend(SDNode *N);
515
516
    bool onlyUsesZeroFlag(SDValue Flags) const;
517
    bool hasNoSignFlagUses(SDValue Flags) const;
518
    bool hasNoCarryFlagUses(SDValue Flags) const;
519
  };
520
}
521
522
523
// Returns true if this masked compare can be implemented legally with this
524
// type.
525
2.35k
static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
526
2.35k
  unsigned Opcode = N->getOpcode();
527
2.35k
  if (Opcode == X86ISD::CMPM || 
Opcode == ISD::SETCC2.10k
||
528
2.35k
      
Opcode == X86ISD::CMPM_SAE508
||
Opcode == X86ISD::VFPCLASS507
) {
529
1.87k
    // We can get 256-bit 8 element types here without VLX being enabled. When
530
1.87k
    // this happens we will use 512-bit operations and the mask will not be
531
1.87k
    // zero extended.
532
1.87k
    EVT OpVT = N->getOperand(0).getValueType();
533
1.87k
    if (OpVT.is256BitVector() || 
OpVT.is128BitVector()1.28k
)
534
1.56k
      return Subtarget->hasVLX();
535
309
536
309
    return true;
537
309
  }
538
483
  // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check.
539
483
  if (Opcode == X86ISD::VFPCLASSS || 
Opcode == X86ISD::FSETCCM459
||
540
483
      
Opcode == X86ISD::FSETCCM_SAE426
)
541
75
    return true;
542
408
543
408
  return false;
544
408
}
545
546
// Returns true if we can assume the writer of the mask has zero extended it
547
// for us.
548
2.01k
bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const {
549
2.01k
  // If this is an AND, check if we have a compare on either side. As long as
550
2.01k
  // one side guarantees the mask is zero extended, the AND will preserve those
551
2.01k
  // zeros.
552
2.01k
  if (N->getOpcode() == ISD::AND)
553
871
    return isLegalMaskCompare(N->getOperand(0).getNode(), Subtarget) ||
554
871
           
isLegalMaskCompare(N->getOperand(1).getNode(), Subtarget)348
;
555
1.13k
556
1.13k
  return isLegalMaskCompare(N, Subtarget);
557
1.13k
}
558
559
bool
560
224k
X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
561
224k
  if (OptLevel == CodeGenOpt::None) 
return false235
;
562
223k
563
223k
  if (!N.hasOneUse())
564
98.7k
    return false;
565
125k
566
125k
  if (N.getOpcode() != ISD::LOAD)
567
465
    return true;
568
124k
569
124k
  // Don't fold non-temporal loads if we have an instruction for them.
570
124k
  if (useNonTemporalLoad(cast<LoadSDNode>(N)))
571
463
    return false;
572
124k
573
124k
  // If N is a load, do additional profitability checks.
574
124k
  if (U == Root) {
575
99.0k
    switch (U->getOpcode()) {
576
99.0k
    
default: break63.3k
;
577
99.0k
    case X86ISD::ADD:
578
33.7k
    case X86ISD::ADC:
579
33.7k
    case X86ISD::SUB:
580
33.7k
    case X86ISD::SBB:
581
33.7k
    case X86ISD::AND:
582
33.7k
    case X86ISD::XOR:
583
33.7k
    case X86ISD::OR:
584
33.7k
    case ISD::ADD:
585
33.7k
    case ISD::ADDCARRY:
586
33.7k
    case ISD::AND:
587
33.7k
    case ISD::OR:
588
33.7k
    case ISD::XOR: {
589
33.7k
      SDValue Op1 = U->getOperand(1);
590
33.7k
591
33.7k
      // If the other operand is a 8-bit immediate we should fold the immediate
592
33.7k
      // instead. This reduces code size.
593
33.7k
      // e.g.
594
33.7k
      // movl 4(%esp), %eax
595
33.7k
      // addl $4, %eax
596
33.7k
      // vs.
597
33.7k
      // movl $4, %eax
598
33.7k
      // addl 4(%esp), %eax
599
33.7k
      // The former is 2 bytes shorter. In case where the increment is 1, then
600
33.7k
      // the saving can be 4 bytes (by using incl %eax).
601
33.7k
      if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Op1)) {
602
6.93k
        if (Imm->getAPIntValue().isSignedIntN(8))
603
5.34k
          return false;
604
1.58k
605
1.58k
        // If this is a 64-bit AND with an immediate that fits in 32-bits,
606
1.58k
        // prefer using the smaller and over folding the load. This is needed to
607
1.58k
        // make sure immediates created by shrinkAndImmediate are always folded.
608
1.58k
        // Ideally we would narrow the load during DAG combine and get the
609
1.58k
        // best of both worlds.
610
1.58k
        if (U->getOpcode() == ISD::AND &&
611
1.58k
            
Imm->getAPIntValue().getBitWidth() == 641.04k
&&
612
1.58k
            
Imm->getAPIntValue().isIntN(32)373
)
613
54
          return false;
614
1.52k
615
1.52k
        // If this really a zext_inreg that can be represented with a movzx
616
1.52k
        // instruction, prefer that.
617
1.52k
        // TODO: We could shrink the load and fold if it is non-volatile.
618
1.52k
        if (U->getOpcode() == ISD::AND &&
619
1.52k
            
(989
Imm->getAPIntValue() == UINT8_MAX989
||
620
989
             
Imm->getAPIntValue() == UINT16_MAX985
||
621
989
             
Imm->getAPIntValue() == UINT32_MAX981
))
622
8
          return false;
623
1.52k
624
1.52k
        // ADD/SUB with can negate the immediate and use the opposite operation
625
1.52k
        // to fit 128 into a sign extended 8 bit immediate.
626
1.52k
        if ((U->getOpcode() == ISD::ADD || 
U->getOpcode() == ISD::SUB1.05k
) &&
627
1.52k
            
(-Imm->getAPIntValue()).isSignedIntN(8)464
)
628
13
          return false;
629
28.2k
      }
630
28.2k
631
28.2k
      // If the other operand is a TLS address, we should fold it instead.
632
28.2k
      // This produces
633
28.2k
      // movl    %gs:0, %eax
634
28.2k
      // leal    i@NTPOFF(%eax), %eax
635
28.2k
      // instead of
636
28.2k
      // movl    $i@NTPOFF, %eax
637
28.2k
      // addl    %gs:0, %eax
638
28.2k
      // if the block also has an access to a second TLS address this will save
639
28.2k
      // a load.
640
28.2k
      // FIXME: This is probably also true for non-TLS addresses.
641
28.2k
      if (Op1.getOpcode() == X86ISD::Wrapper) {
642
99
        SDValue Val = Op1.getOperand(0);
643
99
        if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
644
92
          return false;
645
28.1k
      }
646
28.1k
647
28.1k
      // Don't fold load if this matches the BTS/BTR/BTC patterns.
648
28.1k
      // BTS: (or X, (shl 1, n))
649
28.1k
      // BTR: (and X, (rotl -2, n))
650
28.1k
      // BTC: (xor X, (shl 1, n))
651
28.1k
      if (U->getOpcode() == ISD::OR || 
U->getOpcode() == ISD::XOR26.1k
) {
652
5.38k
        if (U->getOperand(0).getOpcode() == ISD::SHL &&
653
5.38k
            
isOneConstant(U->getOperand(0).getOperand(0))162
)
654
0
          return false;
655
5.38k
656
5.38k
        if (U->getOperand(1).getOpcode() == ISD::SHL &&
657
5.38k
            
isOneConstant(U->getOperand(1).getOperand(0))238
)
658
12
          return false;
659
28.1k
      }
660
28.1k
      if (U->getOpcode() == ISD::AND) {
661
12.0k
        SDValue U0 = U->getOperand(0);
662
12.0k
        SDValue U1 = U->getOperand(1);
663
12.0k
        if (U0.getOpcode() == ISD::ROTL) {
664
2
          auto *C = dyn_cast<ConstantSDNode>(U0.getOperand(0));
665
2
          if (C && 
C->getSExtValue() == -20
)
666
0
            return false;
667
12.0k
        }
668
12.0k
669
12.0k
        if (U1.getOpcode() == ISD::ROTL) {
670
20
          auto *C = dyn_cast<ConstantSDNode>(U1.getOperand(0));
671
20
          if (C && C->getSExtValue() == -2)
672
20
            return false;
673
28.1k
        }
674
12.0k
      }
675
28.1k
676
28.1k
      break;
677
28.1k
    }
678
28.1k
    case ISD::SHL:
679
2.03k
    case ISD::SRA:
680
2.03k
    case ISD::SRL:
681
2.03k
      // Don't fold a load into a shift by immediate. The BMI2 instructions
682
2.03k
      // support folding a load, but not an immediate. The legacy instructions
683
2.03k
      // support folding an immediate, but can't fold a load. Folding an
684
2.03k
      // immediate is preferable to folding a load.
685
2.03k
      if (isa<ConstantSDNode>(U->getOperand(1)))
686
1.47k
        return false;
687
559
688
559
      break;
689
99.0k
    }
690
99.0k
  }
691
117k
692
117k
  // Prevent folding a load if this can implemented with an insert_subreg or
693
117k
  // a move that implicitly zeroes.
694
117k
  if (Root->getOpcode() == ISD::INSERT_SUBVECTOR &&
695
117k
      
isNullConstant(Root->getOperand(2))504
&&
696
117k
      
(327
Root->getOperand(0).isUndef()327
||
697
327
       
ISD::isBuildVectorAllZeros(Root->getOperand(0).getNode())48
))
698
312
    return false;
699
116k
700
116k
  return true;
701
116k
}
702
703
/// Replace the original chain operand of the call with
704
/// load's chain operand and move load below the call's chain operand.
705
static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
706
1.40k
                               SDValue Call, SDValue OrigChain) {
707
1.40k
  SmallVector<SDValue, 8> Ops;
708
1.40k
  SDValue Chain = OrigChain.getOperand(0);
709
1.40k
  if (Chain.getNode() == Load.getNode())
710
281
    Ops.push_back(Load.getOperand(0));
711
1.12k
  else {
712
1.12k
    assert(Chain.getOpcode() == ISD::TokenFactor &&
713
1.12k
           "Unexpected chain operand");
714
4.08k
    for (unsigned i = 0, e = Chain.getNumOperands(); i != e; 
++i2.96k
)
715
2.96k
      if (Chain.getOperand(i).getNode() == Load.getNode())
716
1.12k
        Ops.push_back(Load.getOperand(0));
717
1.83k
      else
718
1.83k
        Ops.push_back(Chain.getOperand(i));
719
1.12k
    SDValue NewChain =
720
1.12k
      CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops);
721
1.12k
    Ops.clear();
722
1.12k
    Ops.push_back(NewChain);
723
1.12k
  }
724
1.40k
  Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end());
725
1.40k
  CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops);
726
1.40k
  CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0),
727
1.40k
                             Load.getOperand(1), Load.getOperand(2));
728
1.40k
729
1.40k
  Ops.clear();
730
1.40k
  Ops.push_back(SDValue(Load.getNode(), 1));
731
1.40k
  Ops.append(Call->op_begin() + 1, Call->op_end());
732
1.40k
  CurDAG->UpdateNodeOperands(Call.getNode(), Ops);
733
1.40k
}
734
735
/// Return true if call address is a load and it can be
736
/// moved below CALLSEQ_START and the chains leading up to the call.
737
/// Return the CALLSEQ_START by reference as a second output.
738
/// In the case of a tail call, there isn't a callseq node between the call
739
/// chain and the load.
740
141k
static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
741
141k
  // The transformation is somewhat dangerous if the call's chain was glued to
742
141k
  // the call. After MoveBelowOrigChain the load is moved between the call and
743
141k
  // the chain, this can create a cycle if the load is not folded. So it is
744
141k
  // *really* important that we are sure the load will be folded.
745
141k
  if (Callee.getNode() == Chain.getNode() || 
!Callee.hasOneUse()141k
)
746
12.2k
    return false;
747
128k
  LoadSDNode *LD = dyn_cast<LoadSDNode>(Callee.getNode());
748
128k
  if (!LD ||
749
128k
      
LD->isVolatile()1.64k
||
750
128k
      
LD->getAddressingMode() != ISD::UNINDEXED1.63k
||
751
128k
      
LD->getExtensionType() != ISD::NON_EXTLOAD1.63k
)
752
127k
    return false;
753
1.63k
754
1.63k
  // Now let's find the callseq_start.
755
4.27k
  
while (1.63k
HasCallSeq &&
Chain.getOpcode() != ISD::CALLSEQ_START4.00k
) {
756
2.64k
    if (!Chain.hasOneUse())
757
0
      return false;
758
2.64k
    Chain = Chain.getOperand(0);
759
2.64k
  }
760
1.63k
761
1.63k
  if (!Chain.getNumOperands())
762
16
    return false;
763
1.61k
  // Since we are not checking for AA here, conservatively abort if the chain
764
1.61k
  // writes to memory. It's not safe to move the callee (a load) across a store.
765
1.61k
  if (isa<MemSDNode>(Chain.getNode()) &&
766
1.61k
      
cast<MemSDNode>(Chain.getNode())->writeMem()2
)
767
2
    return false;
768
1.61k
  if (Chain.getOperand(0).getNode() == Callee.getNode())
769
281
    return true;
770
1.33k
  if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
771
1.33k
      
Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode())1.12k
&&
772
1.33k
      
Callee.getValue(1).hasOneUse()1.12k
)
773
1.12k
    return true;
774
209
  return false;
775
209
}
776
777
384k
void X86DAGToDAGISel::PreprocessISelDAG() {
778
384k
  for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
779
8.34M
       E = CurDAG->allnodes_end(); I != E; ) {
780
7.95M
    SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.
781
7.95M
782
7.95M
    // If this is a target specific AND node with no flag usages, turn it back
783
7.95M
    // into ISD::AND to enable test instruction matching.
784
7.95M
    if (N->getOpcode() == X86ISD::AND && 
!N->hasAnyUseOfValue(1)852
) {
785
24
      SDValue Res = CurDAG->getNode(ISD::AND, SDLoc(N), N->getValueType(0),
786
24
                                    N->getOperand(0), N->getOperand(1));
787
24
      --I;
788
24
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
789
24
      ++I;
790
24
      CurDAG->DeleteNode(N);
791
24
      continue;
792
24
    }
793
7.95M
794
7.95M
    switch (N->getOpcode()) {
795
7.95M
    case ISD::FP_TO_SINT:
796
2.39k
    case ISD::FP_TO_UINT: {
797
2.39k
      // Replace vector fp_to_s/uint with their X86 specific equivalent so we
798
2.39k
      // don't need 2 sets of patterns.
799
2.39k
      if (!N->getSimpleValueType(0).isVector())
800
1.19k
        break;
801
1.20k
802
1.20k
      unsigned NewOpc;
803
1.20k
      switch (N->getOpcode()) {
804
1.20k
      
default: 0
llvm_unreachable0
("Unexpected opcode!");
805
1.20k
      
case ISD::FP_TO_SINT: NewOpc = X86ISD::CVTTP2SI; break1.03k
;
806
1.20k
      
case ISD::FP_TO_UINT: NewOpc = X86ISD::CVTTP2UI; break162
;
807
1.20k
      }
808
1.20k
      SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
809
1.20k
                                    N->getOperand(0));
810
1.20k
      --I;
811
1.20k
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
812
1.20k
      ++I;
813
1.20k
      CurDAG->DeleteNode(N);
814
1.20k
      continue;
815
1.20k
    }
816
60.4k
    case ISD::SHL:
817
60.4k
    case ISD::SRA:
818
60.4k
    case ISD::SRL: {
819
60.4k
      // Replace vector shifts with their X86 specific equivalent so we don't
820
60.4k
      // need 2 sets of patterns.
821
60.4k
      if (!N->getValueType(0).isVector())
822
58.2k
        break;
823
2.21k
824
2.21k
      unsigned NewOpc;
825
2.21k
      switch (N->getOpcode()) {
826
2.21k
      
default: 0
llvm_unreachable0
("Unexpected opcode!");
827
2.21k
      
case ISD::SHL: NewOpc = X86ISD::VSHLV; break860
;
828
2.21k
      
case ISD::SRA: NewOpc = X86ISD::VSRAV; break435
;
829
2.21k
      
case ISD::SRL: NewOpc = X86ISD::VSRLV; break915
;
830
2.21k
      }
831
2.21k
      SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
832
2.21k
                                    N->getOperand(0), N->getOperand(1));
833
2.21k
      --I;
834
2.21k
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
835
2.21k
      ++I;
836
2.21k
      CurDAG->DeleteNode(N);
837
2.21k
      continue;
838
2.21k
    }
839
5.34k
    case ISD::ANY_EXTEND:
840
5.34k
    case ISD::ANY_EXTEND_VECTOR_INREG: {
841
5.34k
      // Replace vector any extend with the zero extend equivalents so we don't
842
5.34k
      // need 2 sets of patterns. Ignore vXi1 extensions.
843
5.34k
      if (!N->getValueType(0).isVector() ||
844
5.34k
          
N->getOperand(0).getScalarValueSizeInBits() == 1630
)
845
4.85k
        break;
846
496
847
496
      unsigned NewOpc = N->getOpcode() == ISD::ANY_EXTEND
848
496
                            ? 
ISD::ZERO_EXTEND382
849
496
                            : 
ISD::ZERO_EXTEND_VECTOR_INREG114
;
850
496
851
496
      SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
852
496
                                    N->getOperand(0));
853
496
      --I;
854
496
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
855
496
      ++I;
856
496
      CurDAG->DeleteNode(N);
857
496
      continue;
858
496
    }
859
800
    case ISD::FCEIL:
860
800
    case ISD::FFLOOR:
861
800
    case ISD::FTRUNC:
862
800
    case ISD::FNEARBYINT:
863
800
    case ISD::FRINT: {
864
800
      // Replace fp rounding with their X86 specific equivalent so we don't
865
800
      // need 2 sets of patterns.
866
800
      unsigned Imm;
867
800
      switch (N->getOpcode()) {
868
800
      
default: 0
llvm_unreachable0
("Unexpected opcode!");
869
800
      
case ISD::FCEIL: Imm = 0xA; break231
;
870
800
      
case ISD::FFLOOR: Imm = 0x9; break234
;
871
800
      
case ISD::FTRUNC: Imm = 0xB; break130
;
872
800
      
case ISD::FNEARBYINT: Imm = 0xC; break103
;
873
800
      
case ISD::FRINT: Imm = 0x4; break102
;
874
800
      }
875
800
      SDLoc dl(N);
876
800
      SDValue Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl,
877
800
                                    N->getValueType(0),
878
800
                                    N->getOperand(0),
879
800
                                    CurDAG->getConstant(Imm, dl, MVT::i8));
880
800
      --I;
881
800
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
882
800
      ++I;
883
800
      CurDAG->DeleteNode(N);
884
800
      continue;
885
800
    }
886
1.09k
    case X86ISD::FANDN:
887
1.09k
    case X86ISD::FAND:
888
1.09k
    case X86ISD::FOR:
889
1.09k
    case X86ISD::FXOR: {
890
1.09k
      // Widen scalar fp logic ops to vector to reduce isel patterns.
891
1.09k
      // FIXME: Can we do this during lowering/combine.
892
1.09k
      MVT VT = N->getSimpleValueType(0);
893
1.09k
      if (VT.isVector() || 
VT == MVT::f128925
)
894
200
        break;
895
890
896
890
      MVT VecVT = VT == MVT::f64 ? 
MVT::v2f64358
:
MVT::v4f32532
;
897
890
      SDLoc dl(N);
898
890
      SDValue Op0 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
899
890
                                    N->getOperand(0));
900
890
      SDValue Op1 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT,
901
890
                                    N->getOperand(1));
902
890
903
890
      SDValue Res;
904
890
      if (Subtarget->hasSSE2()) {
905
806
        EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger();
906
806
        Op0 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op0);
907
806
        Op1 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op1);
908
806
        unsigned Opc;
909
806
        switch (N->getOpcode()) {
910
806
        
default: 0
llvm_unreachable0
("Unexpected opcode!");
911
806
        
case X86ISD::FANDN: Opc = X86ISD::ANDNP; break223
;
912
806
        
case X86ISD::FAND: Opc = ISD::AND; break345
;
913
806
        
case X86ISD::FOR: Opc = ISD::OR; break202
;
914
806
        
case X86ISD::FXOR: Opc = ISD::XOR; break36
;
915
806
        }
916
806
        Res = CurDAG->getNode(Opc, dl, IntVT, Op0, Op1);
917
806
        Res = CurDAG->getNode(ISD::BITCAST, dl, VecVT, Res);
918
806
      } else {
919
84
        Res = CurDAG->getNode(N->getOpcode(), dl, VecVT, Op0, Op1);
920
84
      }
921
890
      Res = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res,
922
890
                            CurDAG->getIntPtrConstant(0, dl));
923
890
      --I;
924
890
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
925
890
      ++I;
926
890
      CurDAG->DeleteNode(N);
927
890
      continue;
928
7.95M
    }
929
7.95M
    }
930
7.95M
931
7.95M
    if (OptLevel != CodeGenOpt::None &&
932
7.95M
        // Only do this when the target can fold the load into the call or
933
7.95M
        // jmp.
934
7.95M
        
!Subtarget->useRetpolineIndirectCalls()7.92M
&&
935
7.95M
        
(7.91M
(7.91M
N->getOpcode() == X86ISD::CALL7.91M
&&
!Subtarget->slowTwoMemOps()133k
) ||
936
7.91M
         
(7.78M
N->getOpcode() == X86ISD::TC_RETURN7.78M
&&
937
7.78M
          
(7.59k
Subtarget->is64Bit()7.59k
||
938
141k
           
!getTargetMachine().isPositionIndependent()594
)))) {
939
141k
      /// Also try moving call address load from outside callseq_start to just
940
141k
      /// before the call to allow it to be folded.
941
141k
      ///
942
141k
      ///     [Load chain]
943
141k
      ///         ^
944
141k
      ///         |
945
141k
      ///       [Load]
946
141k
      ///       ^    ^
947
141k
      ///       |    |
948
141k
      ///      /      \--
949
141k
      ///     /          |
950
141k
      ///[CALLSEQ_START] |
951
141k
      ///     ^          |
952
141k
      ///     |          |
953
141k
      /// [LOAD/C2Reg]   |
954
141k
      ///     |          |
955
141k
      ///      \        /
956
141k
      ///       \      /
957
141k
      ///       [CALL]
958
141k
      bool HasCallSeq = N->getOpcode() == X86ISD::CALL;
959
141k
      SDValue Chain = N->getOperand(0);
960
141k
      SDValue Load  = N->getOperand(1);
961
141k
      if (!isCalleeLoad(Load, Chain, HasCallSeq))
962
139k
        continue;
963
1.40k
      moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);
964
1.40k
      ++NumLoadMoved;
965
1.40k
      continue;
966
1.40k
    }
967
7.81M
968
7.81M
    // Lower fpround and fpextend nodes that target the FP stack to be store and
969
7.81M
    // load to the stack.  This is a gross hack.  We would like to simply mark
970
7.81M
    // these as being illegal, but when we do that, legalize produces these when
971
7.81M
    // it expands calls, then expands these in the same legalize pass.  We would
972
7.81M
    // like dag combine to be able to hack on these between the call expansion
973
7.81M
    // and the node legalization.  As such this pass basically does "really
974
7.81M
    // late" legalization of these inline with the X86 isel pass.
975
7.81M
    // FIXME: This should only happen when not compiled with -O0.
976
7.81M
    switch (N->getOpcode()) {
977
7.81M
    
default: continue7.80M
;
978
7.81M
    case ISD::FP_ROUND:
979
1.22k
    case ISD::FP_EXTEND:
980
1.22k
    {
981
1.22k
      MVT SrcVT = N->getOperand(0).getSimpleValueType();
982
1.22k
      MVT DstVT = N->getSimpleValueType(0);
983
1.22k
984
1.22k
      // If any of the sources are vectors, no fp stack involved.
985
1.22k
      if (SrcVT.isVector() || 
DstVT.isVector()1.06k
)
986
162
        continue;
987
1.06k
988
1.06k
      // If the source and destination are SSE registers, then this is a legal
989
1.06k
      // conversion that should not be lowered.
990
1.06k
      const X86TargetLowering *X86Lowering =
991
1.06k
          static_cast<const X86TargetLowering *>(TLI);
992
1.06k
      bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
993
1.06k
      bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
994
1.06k
      if (SrcIsSSE && 
DstIsSSE746
)
995
394
        continue;
996
670
997
670
      if (!SrcIsSSE && 
!DstIsSSE318
) {
998
45
        // If this is an FPStack extension, it is a noop.
999
45
        if (N->getOpcode() == ISD::FP_EXTEND)
1000
7
          continue;
1001
38
        // If this is a value-preserving FPStack truncation, it is a noop.
1002
38
        if (N->getConstantOperandVal(1))
1003
6
          continue;
1004
657
      }
1005
657
1006
657
      // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1007
657
      // FPStack has extload and truncstore.  SSE can fold direct loads into other
1008
657
      // operations.  Based on this, decide what we want to do.
1009
657
      MVT MemVT;
1010
657
      if (N->getOpcode() == ISD::FP_ROUND)
1011
305
        MemVT = DstVT;  // FP_ROUND must use DstVT, we can't do a 'trunc load'.
1012
352
      else
1013
352
        MemVT = SrcIsSSE ? SrcVT : 
DstVT0
;
1014
657
1015
657
      SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
1016
657
      SDLoc dl(N);
1017
657
1018
657
      // FIXME: optimize the case where the src/dest is a load or store?
1019
657
1020
657
      SDValue Store = CurDAG->getTruncStore(CurDAG->getEntryNode(), dl, N->getOperand(0),
1021
657
                                          MemTmp, MachinePointerInfo(), MemVT);
1022
657
      SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
1023
657
                                          MachinePointerInfo(), MemVT);
1024
657
1025
657
      // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
1026
657
      // extload we created.  This will cause general havok on the dag because
1027
657
      // anything below the conversion could be folded into other existing nodes.
1028
657
      // To avoid invalidating 'I', back it up to the convert node.
1029
657
      --I;
1030
657
      CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
1031
657
      break;
1032
657
    }
1033
657
1034
657
    //The sequence of events for lowering STRICT_FP versions of these nodes requires
1035
657
    //dealing with the chain differently, as there is already a preexisting chain.
1036
657
    case ISD::STRICT_FP_ROUND:
1037
42
    case ISD::STRICT_FP_EXTEND:
1038
42
    {
1039
42
      MVT SrcVT = N->getOperand(1).getSimpleValueType();
1040
42
      MVT DstVT = N->getSimpleValueType(0);
1041
42
1042
42
      // If any of the sources are vectors, no fp stack involved.
1043
42
      if (SrcVT.isVector() || 
DstVT.isVector()40
)
1044
2
        continue;
1045
40
1046
40
      // If the source and destination are SSE registers, then this is a legal
1047
40
      // conversion that should not be lowered.
1048
40
      const X86TargetLowering *X86Lowering =
1049
40
          static_cast<const X86TargetLowering *>(TLI);
1050
40
      bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT);
1051
40
      bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT);
1052
40
      if (SrcIsSSE && 
DstIsSSE38
)
1053
36
        continue;
1054
4
1055
4
      if (!SrcIsSSE && 
!DstIsSSE2
) {
1056
0
        // If this is an FPStack extension, it is a noop.
1057
0
        if (N->getOpcode() == ISD::STRICT_FP_EXTEND)
1058
0
          continue;
1059
0
        // If this is a value-preserving FPStack truncation, it is a noop.
1060
0
        if (N->getConstantOperandVal(2))
1061
0
          continue;
1062
4
      }
1063
4
1064
4
      // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
1065
4
      // FPStack has extload and truncstore.  SSE can fold direct loads into other
1066
4
      // operations.  Based on this, decide what we want to do.
1067
4
      MVT MemVT;
1068
4
      if (N->getOpcode() == ISD::STRICT_FP_ROUND)
1069
2
        MemVT = DstVT;  // FP_ROUND must use DstVT, we can't do a 'trunc load'.
1070
2
      else
1071
2
        MemVT = SrcIsSSE ? SrcVT : 
DstVT0
;
1072
4
1073
4
      SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
1074
4
      SDLoc dl(N);
1075
4
1076
4
      // FIXME: optimize the case where the src/dest is a load or store?
1077
4
1078
4
      //Since the operation is StrictFP, use the preexisting chain.
1079
4
      SDValue Store = CurDAG->getTruncStore(N->getOperand(0), dl, N->getOperand(1),
1080
4
                                MemTmp, MachinePointerInfo(), MemVT);
1081
4
      SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
1082
4
                                          MachinePointerInfo(), MemVT);
1083
4
1084
4
      // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
1085
4
      // extload we created.  This will cause general havok on the dag because
1086
4
      // anything below the conversion could be folded into other existing nodes.
1087
4
      // To avoid invalidating 'I', back it up to the convert node.
1088
4
      --I;
1089
4
      CurDAG->ReplaceAllUsesWith(N, Result.getNode());
1090
4
      break;
1091
4
    }
1092
661
    }
1093
661
1094
661
1095
661
    // Now that we did that, the node is dead.  Increment the iterator to the
1096
661
    // next node to process, then delete N.
1097
661
    ++I;
1098
661
    CurDAG->DeleteNode(N);
1099
661
  }
1100
384k
1101
384k
  // The load+call transform above can leave some dead nodes in the graph. Make
1102
384k
  // sure we remove them. Its possible some of the other transforms do to so
1103
384k
  // just remove dead nodes unconditionally.
1104
384k
  CurDAG->RemoveDeadNodes();
1105
384k
}
1106
1107
// Look for a redundant movzx/movsx that can occur after an 8-bit divrem.
1108
2.05M
bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) {
1109
2.05M
  unsigned Opc = N->getMachineOpcode();
1110
2.05M
  if (Opc != X86::MOVZX32rr8 && 
Opc != X86::MOVSX32rr82.04M
&&
1111
2.05M
      
Opc != X86::MOVSX64rr82.04M
)
1112
2.04M
    return false;
1113
7.07k
1114
7.07k
  SDValue N0 = N->getOperand(0);
1115
7.07k
1116
7.07k
  // We need to be extracting the lower bit of an extend.
1117
7.07k
  if (!N0.isMachineOpcode() ||
1118
7.07k
      
N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG5.28k
||
1119
7.07k
      
N0.getConstantOperandVal(1) != X86::sub_8bit780
)
1120
6.29k
    return false;
1121
782
1122
782
  // We're looking for either a movsx or movzx to match the original opcode.
1123
782
  unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? 
X86::MOVZX32rr8_NOREX560
1124
782
                                                : 
X86::MOVSX32rr8_NOREX222
;
1125
782
  SDValue N00 = N0.getOperand(0);
1126
782
  if (!N00.isMachineOpcode() || 
N00.getMachineOpcode() != ExpectedOpc421
)
1127
692
    return false;
1128
90
1129
90
  if (Opc == X86::MOVSX64rr8) {
1130
1
    // If we had a sign extend from 8 to 64 bits. We still need to go from 32
1131
1
    // to 64.
1132
1
    MachineSDNode *Extend = CurDAG->getMachineNode(X86::MOVSX64rr32, SDLoc(N),
1133
1
                                                   MVT::i64, N00);
1134
1
    ReplaceUses(N, Extend);
1135
89
  } else {
1136
89
    // Ok we can drop this extend and just use the original extend.
1137
89
    ReplaceUses(N, N00.getNode());
1138
89
  }
1139
90
1140
90
  return true;
1141
90
}
1142
1143
384k
void X86DAGToDAGISel::PostprocessISelDAG() {
1144
384k
  // Skip peepholes at -O0.
1145
384k
  if (TM.getOptLevel() == CodeGenOpt::None)
1146
2.68k
    return;
1147
382k
1148
382k
  SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
1149
382k
1150
382k
  bool MadeChange = false;
1151
8.77M
  while (Position != CurDAG->allnodes_begin()) {
1152
8.39M
    SDNode *N = &*--Position;
1153
8.39M
    // Skip dead nodes and any non-machine opcodes.
1154
8.39M
    if (N->use_empty() || 
!N->isMachineOpcode()7.91M
)
1155
6.33M
      continue;
1156
2.05M
1157
2.05M
    if (tryOptimizeRem8Extend(N)) {
1158
88
      MadeChange = true;
1159
88
      continue;
1160
88
    }
1161
2.05M
1162
2.05M
    // Look for a TESTrr+ANDrr pattern where both operands of the test are
1163
2.05M
    // the same. Rewrite to remove the AND.
1164
2.05M
    unsigned Opc = N->getMachineOpcode();
1165
2.05M
    if ((Opc == X86::TEST8rr || 
Opc == X86::TEST16rr2.04M
||
1166
2.05M
         
Opc == X86::TEST32rr2.03M
||
Opc == X86::TEST64rr2.02M
) &&
1167
2.05M
        
N->getOperand(0) == N->getOperand(1)54.6k
&&
1168
2.05M
        
N->isOnlyUserOf(N->getOperand(0).getNode())54.6k
&&
1169
2.05M
        
N->getOperand(0).isMachineOpcode()21.1k
) {
1170
6.46k
      SDValue And = N->getOperand(0);
1171
6.46k
      unsigned N0Opc = And.getMachineOpcode();
1172
6.46k
      if (N0Opc == X86::AND8rr || 
N0Opc == X86::AND16rr6.35k
||
1173
6.46k
          
N0Opc == X86::AND32rr6.33k
||
N0Opc == X86::AND64rr5.98k
) {
1174
3.76k
        MachineSDNode *Test = CurDAG->getMachineNode(Opc, SDLoc(N),
1175
3.76k
                                                     MVT::i32,
1176
3.76k
                                                     And.getOperand(0),
1177
3.76k
                                                     And.getOperand(1));
1178
3.76k
        ReplaceUses(N, Test);
1179
3.76k
        MadeChange = true;
1180
3.76k
        continue;
1181
3.76k
      }
1182
2.69k
      if (N0Opc == X86::AND8rm || 
N0Opc == X86::AND16rm2.66k
||
1183
2.69k
          
N0Opc == X86::AND32rm2.64k
||
N0Opc == X86::AND64rm2.61k
) {
1184
138
        unsigned NewOpc;
1185
138
        switch (N0Opc) {
1186
138
        
case X86::AND8rm: NewOpc = X86::TEST8mr; break31
;
1187
138
        
case X86::AND16rm: NewOpc = X86::TEST16mr; break20
;
1188
138
        
case X86::AND32rm: NewOpc = X86::TEST32mr; break38
;
1189
138
        
case X86::AND64rm: NewOpc = X86::TEST64mr; break49
;
1190
138
        }
1191
138
1192
138
        // Need to swap the memory and register operand.
1193
138
        SDValue Ops[] = { And.getOperand(1),
1194
138
                          And.getOperand(2),
1195
138
                          And.getOperand(3),
1196
138
                          And.getOperand(4),
1197
138
                          And.getOperand(5),
1198
138
                          And.getOperand(0),
1199
138
                          And.getOperand(6)  /* Chain */ };
1200
138
        MachineSDNode *Test = CurDAG->getMachineNode(NewOpc, SDLoc(N),
1201
138
                                                     MVT::i32, MVT::Other, Ops);
1202
138
        ReplaceUses(N, Test);
1203
138
        MadeChange = true;
1204
138
        continue;
1205
138
      }
1206
2.69k
    }
1207
2.04M
1208
2.04M
    // Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is
1209
2.04M
    // used. We're doing this late so we can prefer to fold the AND into masked
1210
2.04M
    // comparisons. Doing that can be better for the live range of the mask
1211
2.04M
    // register.
1212
2.04M
    if ((Opc == X86::KORTESTBrr || 
Opc == X86::KORTESTWrr2.04M
||
1213
2.04M
         
Opc == X86::KORTESTDrr2.04M
||
Opc == X86::KORTESTQrr2.04M
) &&
1214
2.04M
        
N->getOperand(0) == N->getOperand(1)165
&&
1215
2.04M
        
N->isOnlyUserOf(N->getOperand(0).getNode())156
&&
1216
2.04M
        
N->getOperand(0).isMachineOpcode()156
&&
1217
2.04M
        
onlyUsesZeroFlag(SDValue(N, 0))156
) {
1218
87
      SDValue And = N->getOperand(0);
1219
87
      unsigned N0Opc = And.getMachineOpcode();
1220
87
      // KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other
1221
87
      // KAND instructions and KTEST use the same ISA feature.
1222
87
      if (N0Opc == X86::KANDBrr ||
1223
87
          
(81
N0Opc == X86::KANDWrr81
&&
Subtarget->hasDQI()6
) ||
1224
87
          
N0Opc == X86::KANDDrr78
||
N0Opc == X86::KANDQrr75
) {
1225
14
        unsigned NewOpc;
1226
14
        switch (Opc) {
1227
14
        
default: 0
llvm_unreachable0
("Unexpected opcode!");
1228
14
        
case X86::KORTESTBrr: NewOpc = X86::KTESTBrr; break6
;
1229
14
        
case X86::KORTESTWrr: NewOpc = X86::KTESTWrr; break3
;
1230
14
        
case X86::KORTESTDrr: NewOpc = X86::KTESTDrr; break3
;
1231
14
        
case X86::KORTESTQrr: NewOpc = X86::KTESTQrr; break2
;
1232
14
        }
1233
14
        MachineSDNode *KTest = CurDAG->getMachineNode(NewOpc, SDLoc(N),
1234
14
                                                      MVT::i32,
1235
14
                                                      And.getOperand(0),
1236
14
                                                      And.getOperand(1));
1237
14
        ReplaceUses(N, KTest);
1238
14
        MadeChange = true;
1239
14
        continue;
1240
14
      }
1241
87
    }
1242
2.04M
1243
2.04M
    // Attempt to remove vectors moves that were inserted to zero upper bits.
1244
2.04M
    if (Opc != TargetOpcode::SUBREG_TO_REG)
1245
2.01M
      continue;
1246
37.9k
1247
37.9k
    unsigned SubRegIdx = N->getConstantOperandVal(2);
1248
37.9k
    if (SubRegIdx != X86::sub_xmm && 
SubRegIdx != X86::sub_ymm37.6k
)
1249
37.6k
      continue;
1250
287
1251
287
    SDValue Move = N->getOperand(1);
1252
287
    if (!Move.isMachineOpcode())
1253
0
      continue;
1254
287
1255
287
    // Make sure its one of the move opcodes we recognize.
1256
287
    switch (Move.getMachineOpcode()) {
1257
287
    default:
1258
118
      continue;
1259
287
    
case X86::VMOVAPDrr: 166
case X86::VMOVUPDrr:
1260
166
    case X86::VMOVAPSrr:       case X86::VMOVUPSrr:
1261
166
    case X86::VMOVDQArr:       case X86::VMOVDQUrr:
1262
166
    case X86::VMOVAPDYrr:      case X86::VMOVUPDYrr:
1263
166
    case X86::VMOVAPSYrr:      case X86::VMOVUPSYrr:
1264
166
    case X86::VMOVDQAYrr:      case X86::VMOVDQUYrr:
1265
166
    case X86::VMOVAPDZ128rr:   case X86::VMOVUPDZ128rr:
1266
166
    case X86::VMOVAPSZ128rr:   case X86::VMOVUPSZ128rr:
1267
166
    case X86::VMOVDQA32Z128rr: case X86::VMOVDQU32Z128rr:
1268
166
    case X86::VMOVDQA64Z128rr: case X86::VMOVDQU64Z128rr:
1269
166
    case X86::VMOVAPDZ256rr:   case X86::VMOVUPDZ256rr:
1270
166
    case X86::VMOVAPSZ256rr:   case X86::VMOVUPSZ256rr:
1271
166
    case X86::VMOVDQA32Z256rr: case X86::VMOVDQU32Z256rr:
1272
166
    case X86::VMOVDQA64Z256rr: case X86::VMOVDQU64Z256rr:
1273
166
      break;
1274
166
    }
1275
166
1276
166
    SDValue In = Move.getOperand(0);
1277
166
    if (!In.isMachineOpcode() ||
1278
166
        
In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END124
)
1279
59
      continue;
1280
107
1281
107
    // Make sure the instruction has a VEX, XOP, or EVEX prefix. This covers
1282
107
    // the SHA instructions which use a legacy encoding.
1283
107
    uint64_t TSFlags = getInstrInfo()->get(In.getMachineOpcode()).TSFlags;
1284
107
    if ((TSFlags & X86II::EncodingMask) != X86II::VEX &&
1285
107
        
(TSFlags & X86II::EncodingMask) != X86II::EVEX30
&&
1286
107
        
(TSFlags & X86II::EncodingMask) != X86II::XOP1
)
1287
1
      continue;
1288
106
1289
106
    // Producing instruction is another vector instruction. We can drop the
1290
106
    // move.
1291
106
    CurDAG->UpdateNodeOperands(N, N->getOperand(0), In, N->getOperand(2));
1292
106
    MadeChange = true;
1293
106
  }
1294
382k
1295
382k
  
if (382k
MadeChange382k
)
1296
3.98k
    CurDAG->RemoveDeadNodes();
1297
382k
}
1298
1299
1300
/// Emit any code that needs to be executed only in the main function.
1301
571
void X86DAGToDAGISel::emitSpecialCodeForMain() {
1302
571
  if (Subtarget->isTargetCygMing()) {
1303
19
    TargetLowering::ArgListTy Args;
1304
19
    auto &DL = CurDAG->getDataLayout();
1305
19
1306
19
    TargetLowering::CallLoweringInfo CLI(*CurDAG);
1307
19
    CLI.setChain(CurDAG->getRoot())
1308
19
        .setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()),
1309
19
                   CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)),
1310
19
                   std::move(Args));
1311
19
    const TargetLowering &TLI = CurDAG->getTargetLoweringInfo();
1312
19
    std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
1313
19
    CurDAG->setRoot(Result.second);
1314
19
  }
1315
571
}
1316
1317
134k
void X86DAGToDAGISel::EmitFunctionEntryCode() {
1318
134k
  // If this is main, emit special code for main.
1319
134k
  const Function &F = MF->getFunction();
1320
134k
  if (F.hasExternalLinkage() && 
F.getName() == "main"129k
)
1321
571
    emitSpecialCodeForMain();
1322
134k
}
1323
1324
131k
static bool isDispSafeForFrameIndex(int64_t Val) {
1325
131k
  // On 64-bit platforms, we can run into an issue where a frame index
1326
131k
  // includes a displacement that, when added to the explicit displacement,
1327
131k
  // will overflow the displacement field. Assuming that the frame index
1328
131k
  // displacement fits into a 31-bit integer  (which is only slightly more
1329
131k
  // aggressive than the current fundamental assumption that it fits into
1330
131k
  // a 32-bit integer), a 31-bit disp should always be safe.
1331
131k
  return isInt<31>(Val);
1332
131k
}
1333
1334
bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
1335
423k
                                            X86ISelAddressMode &AM) {
1336
423k
  // If there's no offset to fold, we don't need to do any work.
1337
423k
  if (Offset == 0)
1338
121k
    return false;
1339
301k
1340
301k
  // Cannot combine ExternalSymbol displacements with integer offsets.
1341
301k
  if (AM.ES || AM.MCSym)
1342
8
    return true;
1343
301k
1344
301k
  int64_t Val = AM.Disp + Offset;
1345
301k
  CodeModel::Model M = TM.getCodeModel();
1346
301k
  if (Subtarget->is64Bit()) {
1347
230k
    if (!X86::isOffsetSuitableForCodeModel(Val, M,
1348
230k
                                           AM.hasSymbolicDisplacement()))
1349
4.29k
      return true;
1350
225k
    // In addition to the checks required for a register base, check that
1351
225k
    // we do not try to use an unsafe Disp with a frame index.
1352
225k
    if (AM.BaseType == X86ISelAddressMode::FrameIndexBase &&
1353
225k
        
!isDispSafeForFrameIndex(Val)39.6k
)
1354
1
      return true;
1355
296k
  }
1356
296k
  AM.Disp = Val;
1357
296k
  return false;
1358
296k
1359
296k
}
1360
1361
72.2k
bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){
1362
72.2k
  SDValue Address = N->getOperand(1);
1363
72.2k
1364
72.2k
  // load gs:0 -> GS segment register.
1365
72.2k
  // load fs:0 -> FS segment register.
1366
72.2k
  //
1367
72.2k
  // This optimization is valid because the GNU TLS model defines that
1368
72.2k
  // gs:0 (or fs:0 on X86-64) contains its own address.
1369
72.2k
  // For more information see http://people.redhat.com/drepper/tls.pdf
1370
72.2k
  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Address))
1371
269
    if (C->getSExtValue() == 0 && 
AM.Segment.getNode() == nullptr189
&&
1372
269
        
!IndirectTlsSegRefs114
&&
1373
269
        
(106
Subtarget->isTargetGlibc()106
||
Subtarget->isTargetAndroid()7
||
1374
106
         
Subtarget->isTargetFuchsia()7
))
1375
99
      switch (N->getPointerInfo().getAddrSpace()) {
1376
99
      case 256:
1377
38
        AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
1378
38
        return false;
1379
99
      case 257:
1380
59
        AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
1381
59
        return false;
1382
72.1k
      // Address space 258 is not handled here, because it is not used to
1383
72.1k
      // address TLS areas.
1384
72.1k
      }
1385
72.1k
1386
72.1k
  return true;
1387
72.1k
}
1388
1389
/// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing
1390
/// mode. These wrap things that will resolve down into a symbol reference.
1391
/// If no match is possible, this returns true, otherwise it returns false.
1392
121k
bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
1393
121k
  // If the addressing mode already has a symbol as the displacement, we can
1394
121k
  // never match another symbol.
1395
121k
  if (AM.hasSymbolicDisplacement())
1396
0
    return true;
1397
121k
1398
121k
  bool IsRIPRelTLS = false;
1399
121k
  bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP;
1400
121k
  if (IsRIPRel) {
1401
74.8k
    SDValue Val = N.getOperand(0);
1402
74.8k
    if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
1403
398
      IsRIPRelTLS = true;
1404
74.8k
  }
1405
121k
1406
121k
  // We can't use an addressing mode in the 64-bit large code model.
1407
121k
  // Global TLS addressing is an exception. In the medium code model,
1408
121k
  // we use can use a mode when RIP wrappers are present.
1409
121k
  // That signifies access to globals that are known to be "near",
1410
121k
  // such as the GOT itself.
1411
121k
  CodeModel::Model M = TM.getCodeModel();
1412
121k
  if (Subtarget->is64Bit() &&
1413
121k
      
(102k
(102k
M == CodeModel::Large102k
&&
!IsRIPRelTLS172
) ||
1414
102k
       
(102k
M == CodeModel::Medium102k
&&
!IsRIPRel11
)))
1415
176
    return true;
1416
121k
1417
121k
  // Base and index reg must be 0 in order to use %rip as base.
1418
121k
  if (IsRIPRel && 
AM.hasBaseOrIndexReg()74.8k
)
1419
2.38k
    return true;
1420
119k
1421
119k
  // Make a local copy in case we can't do this fold.
1422
119k
  X86ISelAddressMode Backup = AM;
1423
119k
1424
119k
  int64_t Offset = 0;
1425
119k
  SDValue N0 = N.getOperand(0);
1426
119k
  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
1427
87.1k
    AM.GV = G->getGlobal();
1428
87.1k
    AM.SymbolFlags = G->getTargetFlags();
1429
87.1k
    Offset = G->getOffset();
1430
87.1k
  } else 
if (ConstantPoolSDNode *31.9k
CP31.9k
= dyn_cast<ConstantPoolSDNode>(N0)) {
1431
31.5k
    AM.CP = CP->getConstVal();
1432
31.5k
    AM.Align = CP->getAlignment();
1433
31.5k
    AM.SymbolFlags = CP->getTargetFlags();
1434
31.5k
    Offset = CP->getOffset();
1435
31.5k
  } else 
if (ExternalSymbolSDNode *442
S442
= dyn_cast<ExternalSymbolSDNode>(N0)) {
1436
96
    AM.ES = S->getSymbol();
1437
96
    AM.SymbolFlags = S->getTargetFlags();
1438
346
  } else if (auto *S = dyn_cast<MCSymbolSDNode>(N0)) {
1439
1
    AM.MCSym = S->getMCSymbol();
1440
345
  } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
1441
327
    AM.JT = J->getIndex();
1442
327
    AM.SymbolFlags = J->getTargetFlags();
1443
327
  } else 
if (BlockAddressSDNode *18
BA18
= dyn_cast<BlockAddressSDNode>(N0)) {
1444
18
    AM.BlockAddr = BA->getBlockAddress();
1445
18
    AM.SymbolFlags = BA->getTargetFlags();
1446
18
    Offset = BA->getOffset();
1447
18
  } else
1448
18
    
llvm_unreachable0
("Unhandled symbol reference node.");
1449
119k
1450
119k
  if (foldOffsetIntoAddress(Offset, AM)) {
1451
0
    AM = Backup;
1452
0
    return true;
1453
0
  }
1454
119k
1455
119k
  if (IsRIPRel)
1456
72.4k
    AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));
1457
119k
1458
119k
  // Commit the changes now that we know this fold is safe.
1459
119k
  return false;
1460
119k
}
1461
1462
/// Add the specified node to the specified addressing mode, returning true if
1463
/// it cannot be done. This just pattern matches for the addressing mode.
1464
676k
bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) {
1465
676k
  if (matchAddressRecursively(N, AM, 0))
1466
0
    return true;
1467
676k
1468
676k
  // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
1469
676k
  // a smaller encoding and avoids a scaled-index.
1470
676k
  if (AM.Scale == 2 &&
1471
676k
      
AM.BaseType == X86ISelAddressMode::RegBase3.29k
&&
1472
676k
      
AM.Base_Reg.getNode() == nullptr3.09k
) {
1473
1.05k
    AM.Base_Reg = AM.IndexReg;
1474
1.05k
    AM.Scale = 1;
1475
1.05k
  }
1476
676k
1477
676k
  // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
1478
676k
  // because it has a smaller encoding.
1479
676k
  // TODO: Which other code models can use this?
1480
676k
  switch (TM.getCodeModel()) {
1481
676k
    
default: break413
;
1482
676k
    case CodeModel::Small:
1483
676k
    case CodeModel::Kernel:
1484
676k
      if (Subtarget->is64Bit() &&
1485
676k
          
AM.Scale == 1505k
&&
1486
676k
          
AM.BaseType == X86ISelAddressMode::RegBase478k
&&
1487
676k
          
AM.Base_Reg.getNode() == nullptr388k
&&
1488
676k
          
AM.IndexReg.getNode() == nullptr27.3k
&&
1489
676k
          
AM.SymbolFlags == X86II::MO_NO_FLAG27.3k
&&
1490
676k
          
AM.hasSymbolicDisplacement()27.2k
)
1491
26.6k
        AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
1492
676k
      break;
1493
676k
  }
1494
676k
1495
676k
  return false;
1496
676k
}
1497
1498
bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM,
1499
409k
                               unsigned Depth) {
1500
409k
  // Add an artificial use to this node so that we can keep track of
1501
409k
  // it if it gets CSE'd with a different node.
1502
409k
  HandleSDNode Handle(N);
1503
409k
1504
409k
  X86ISelAddressMode Backup = AM;
1505
409k
  if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) &&
1506
409k
      
!matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1)393k
)
1507
372k
    return false;
1508
37.1k
  AM = Backup;
1509
37.1k
1510
37.1k
  // Try again after commuting the operands.
1511
37.1k
  if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1) &&
1512
37.1k
      
!matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth+1)21.6k
)
1513
6.73k
    return false;
1514
30.4k
  AM = Backup;
1515
30.4k
1516
30.4k
  // If we couldn't fold both operands into the address at the same time,
1517
30.4k
  // see if we can just put each operand into a register and fold at least
1518
30.4k
  // the add.
1519
30.4k
  if (AM.BaseType == X86ISelAddressMode::RegBase &&
1520
30.4k
      
!AM.Base_Reg.getNode()30.4k
&&
1521
30.4k
      
!AM.IndexReg.getNode()1.79k
) {
1522
140
    N = Handle.getValue();
1523
140
    AM.Base_Reg = N.getOperand(0);
1524
140
    AM.IndexReg = N.getOperand(1);
1525
140
    AM.Scale = 1;
1526
140
    return false;
1527
140
  }
1528
30.3k
  N = Handle.getValue();
1529
30.3k
  return true;
1530
30.3k
}
1531
1532
// Insert a node into the DAG at least before the Pos node's position. This
1533
// will reposition the node as needed, and will assign it a node ID that is <=
1534
// the Pos node's ID. Note that this does *not* preserve the uniqueness of node
1535
// IDs! The selection DAG must no longer depend on their uniqueness when this
1536
// is used.
1537
14.0k
static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) {
1538
14.0k
  if (N->getNodeId() == -1 ||
1539
14.0k
      (SelectionDAGISel::getUninvalidatedNodeId(N.getNode()) >
1540
13.0k
       SelectionDAGISel::getUninvalidatedNodeId(Pos.getNode()))) {
1541
13.0k
    DAG.RepositionNode(Pos->getIterator(), N.getNode());
1542
13.0k
    // Mark Node as invalid for pruning as after this it may be a successor to a
1543
13.0k
    // selected node but otherwise be in the same position of Pos.
1544
13.0k
    // Conservatively mark it with the same -abs(Id) to assure node id
1545
13.0k
    // invariant is preserved.
1546
13.0k
    N->setNodeId(Pos->getNodeId());
1547
13.0k
    SelectionDAGISel::InvalidateNodeId(N.getNode());
1548
13.0k
  }
1549
14.0k
}
1550
1551
// Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if
1552
// safe. This allows us to convert the shift and and into an h-register
1553
// extract and a scaled index. Returns false if the simplification is
1554
// performed.
1555
static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N,
1556
                                      uint64_t Mask,
1557
                                      SDValue Shift, SDValue X,
1558
3.10k
                                      X86ISelAddressMode &AM) {
1559
3.10k
  if (Shift.getOpcode() != ISD::SRL ||
1560
3.10k
      !isa<ConstantSDNode>(Shift.getOperand(1)) ||
1561
3.10k
      !Shift.hasOneUse())
1562
60
    return true;
1563
3.04k
1564
3.04k
  int ScaleLog = 8 - Shift.getConstantOperandVal(1);
1565
3.04k
  if (ScaleLog <= 0 || 
ScaleLog >= 41.00k
||
1566
3.04k
      
Mask != (0xffu << ScaleLog)356
)
1567
3.03k
    return true;
1568
14
1569
14
  MVT VT = N.getSimpleValueType();
1570
14
  SDLoc DL(N);
1571
14
  SDValue Eight = DAG.getConstant(8, DL, MVT::i8);
1572
14
  SDValue NewMask = DAG.getConstant(0xff, DL, VT);
1573
14
  SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, X, Eight);
1574
14
  SDValue And = DAG.getNode(ISD::AND, DL, VT, Srl, NewMask);
1575
14
  SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8);
1576
14
  SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And, ShlCount);
1577
14
1578
14
  // Insert the new nodes into the topological ordering. We must do this in
1579
14
  // a valid topological ordering as nothing is going to go back and re-sort
1580
14
  // these nodes. We continually insert before 'N' in sequence as this is
1581
14
  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
1582
14
  // hierarchy left to express.
1583
14
  insertDAGNode(DAG, N, Eight);
1584
14
  insertDAGNode(DAG, N, Srl);
1585
14
  insertDAGNode(DAG, N, NewMask);
1586
14
  insertDAGNode(DAG, N, And);
1587
14
  insertDAGNode(DAG, N, ShlCount);
1588
14
  insertDAGNode(DAG, N, Shl);
1589
14
  DAG.ReplaceAllUsesWith(N, Shl);
1590
14
  DAG.RemoveDeadNode(N.getNode());
1591
14
  AM.IndexReg = And;
1592
14
  AM.Scale = (1 << ScaleLog);
1593
14
  return false;
1594
14
}
1595
1596
// Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this
1597
// allows us to fold the shift into this addressing mode. Returns false if the
1598
// transform succeeded.
1599
static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N,
1600
5.47k
                                        X86ISelAddressMode &AM) {
1601
5.47k
  SDValue Shift = N.getOperand(0);
1602
5.47k
1603
5.47k
  // Use a signed mask so that shifting right will insert sign bits. These
1604
5.47k
  // bits will be removed when we shift the result left so it doesn't matter
1605
5.47k
  // what we use. This might allow a smaller immediate encoding.
1606
5.47k
  int64_t Mask = cast<ConstantSDNode>(N->getOperand(1))->getSExtValue();
1607
5.47k
1608
5.47k
  // If we have an any_extend feeding the AND, look through it to see if there
1609
5.47k
  // is a shift behind it. But only if the AND doesn't use the extended bits.
1610
5.47k
  // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
1611
5.47k
  bool FoundAnyExtend = false;
1612
5.47k
  if (Shift.getOpcode() == ISD::ANY_EXTEND && 
Shift.hasOneUse()382
&&
1613
5.47k
      
Shift.getOperand(0).getSimpleValueType() == MVT::i32366
&&
1614
5.47k
      
isUInt<32>(Mask)285
) {
1615
285
    FoundAnyExtend = true;
1616
285
    Shift = Shift.getOperand(0);
1617
285
  }
1618
5.47k
1619
5.47k
  if (Shift.getOpcode() != ISD::SHL ||
1620
5.47k
      
!isa<ConstantSDNode>(Shift.getOperand(1))559
)
1621
4.91k
    return true;
1622
559
1623
559
  SDValue X = Shift.getOperand(0);
1624
559
1625
559
  // Not likely to be profitable if either the AND or SHIFT node has more
1626
559
  // than one use (unless all uses are for address computation). Besides,
1627
559
  // isel mechanism requires their node ids to be reused.
1628
559
  if (!N.hasOneUse() || 
!Shift.hasOneUse()538
)
1629
21
    return true;
1630
538
1631
538
  // Verify that the shift amount is something we can fold.
1632
538
  unsigned ShiftAmt = Shift.getConstantOperandVal(1);
1633
538
  if (ShiftAmt != 1 && 
ShiftAmt != 2525
&&
ShiftAmt != 3498
)
1634
480
    return true;
1635
58
1636
58
  MVT VT = N.getSimpleValueType();
1637
58
  SDLoc DL(N);
1638
58
  if (FoundAnyExtend) {
1639
20
    SDValue NewX = DAG.getNode(ISD::ANY_EXTEND, DL, VT, X);
1640
20
    insertDAGNode(DAG, N, NewX);
1641
20
    X = NewX;
1642
20
  }
1643
58
1644
58
  SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, DL, VT);
1645
58
  SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask);
1646
58
  SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1));
1647
58
1648
58
  // Insert the new nodes into the topological ordering. We must do this in
1649
58
  // a valid topological ordering as nothing is going to go back and re-sort
1650
58
  // these nodes. We continually insert before 'N' in sequence as this is
1651
58
  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
1652
58
  // hierarchy left to express.
1653
58
  insertDAGNode(DAG, N, NewMask);
1654
58
  insertDAGNode(DAG, N, NewAnd);
1655
58
  insertDAGNode(DAG, N, NewShift);
1656
58
  DAG.ReplaceAllUsesWith(N, NewShift);
1657
58
  DAG.RemoveDeadNode(N.getNode());
1658
58
1659
58
  AM.Scale = 1 << ShiftAmt;
1660
58
  AM.IndexReg = NewAnd;
1661
58
  return false;
1662
58
}
1663
1664
// Implement some heroics to detect shifts of masked values where the mask can
1665
// be replaced by extending the shift and undoing that in the addressing mode
1666
// scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and
1667
// (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in
1668
// the addressing mode. This results in code such as:
1669
//
1670
//   int f(short *y, int *lookup_table) {
1671
//     ...
1672
//     return *y + lookup_table[*y >> 11];
1673
//   }
1674
//
1675
// Turning into:
1676
//   movzwl (%rdi), %eax
1677
//   movl %eax, %ecx
1678
//   shrl $11, %ecx
1679
//   addl (%rsi,%rcx,4), %eax
1680
//
1681
// Instead of:
1682
//   movzwl (%rdi), %eax
1683
//   movl %eax, %ecx
1684
//   shrl $9, %ecx
1685
//   andl $124, %rcx
1686
//   addl (%rsi,%rcx), %eax
1687
//
1688
// Note that this function assumes the mask is provided as a mask *after* the
1689
// value is shifted. The input chain may or may not match that, but computing
1690
// such a mask is trivial.
1691
static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
1692
                                    uint64_t Mask,
1693
                                    SDValue Shift, SDValue X,
1694
3.30k
                                    X86ISelAddressMode &AM) {
1695
3.30k
  if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() ||
1696
3.30k
      
!isa<ConstantSDNode>(Shift.getOperand(1))3.24k
)
1697
60
    return true;
1698
3.24k
1699
3.24k
  unsigned ShiftAmt = Shift.getConstantOperandVal(1);
1700
3.24k
  unsigned MaskLZ = countLeadingZeros(Mask);
1701
3.24k
  unsigned MaskTZ = countTrailingZeros(Mask);
1702
3.24k
1703
3.24k
  // The amount of shift we're trying to fit into the addressing mode is taken
1704
3.24k
  // from the trailing zeros of the mask.
1705
3.24k
  unsigned AMShiftAmt = MaskTZ;
1706
3.24k
1707
3.24k
  // There is nothing we can do here unless the mask is removing some bits.
1708
3.24k
  // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
1709
3.24k
  if (AMShiftAmt <= 0 || 
AMShiftAmt > 32.45k
)
return true876
;
1710
2.36k
1711
2.36k
  // We also need to ensure that mask is a continuous run of bits.
1712
2.36k
  if (countTrailingOnes(Mask >> MaskTZ) + MaskTZ + MaskLZ != 64) 
return true91
;
1713
2.27k
1714
2.27k
  // Scale the leading zero count down based on the actual size of the value.
1715
2.27k
  // Also scale it down based on the size of the shift.
1716
2.27k
  unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt;
1717
2.27k
  if (MaskLZ < ScaleDown)
1718
0
    return true;
1719
2.27k
  MaskLZ -= ScaleDown;
1720
2.27k
1721
2.27k
  // The final check is to ensure that any masked out high bits of X are
1722
2.27k
  // already known to be zero. Otherwise, the mask has a semantic impact
1723
2.27k
  // other than masking out a couple of low bits. Unfortunately, because of
1724
2.27k
  // the mask, zero extensions will be removed from operands in some cases.
1725
2.27k
  // This code works extra hard to look through extensions because we can
1726
2.27k
  // replace them with zero extensions cheaply if necessary.
1727
2.27k
  bool ReplacingAnyExtend = false;
1728
2.27k
  if (X.getOpcode() == ISD::ANY_EXTEND) {
1729
0
    unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() -
1730
0
                          X.getOperand(0).getSimpleValueType().getSizeInBits();
1731
0
    // Assume that we'll replace the any-extend with a zero-extend, and
1732
0
    // narrow the search to the extended value.
1733
0
    X = X.getOperand(0);
1734
0
    MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits;
1735
0
    ReplacingAnyExtend = true;
1736
0
  }
1737
2.27k
  APInt MaskedHighBits =
1738
2.27k
    APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ);
1739
2.27k
  KnownBits Known = DAG.computeKnownBits(X);
1740
2.27k
  if (MaskedHighBits != Known.Zero) 
return true72
;
1741
2.20k
1742
2.20k
  // We've identified a pattern that can be transformed into a single shift
1743
2.20k
  // and an addressing mode. Make it so.
1744
2.20k
  MVT VT = N.getSimpleValueType();
1745
2.20k
  if (ReplacingAnyExtend) {
1746
0
    assert(X.getValueType() != VT);
1747
0
    // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND.
1748
0
    SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X);
1749
0
    insertDAGNode(DAG, N, NewX);
1750
0
    X = NewX;
1751
0
  }
1752
2.20k
  SDLoc DL(N);
1753
2.20k
  SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
1754
2.20k
  SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt);
1755
2.20k
  SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
1756
2.20k
  SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewSRL, NewSHLAmt);
1757
2.20k
1758
2.20k
  // Insert the new nodes into the topological ordering. We must do this in
1759
2.20k
  // a valid topological ordering as nothing is going to go back and re-sort
1760
2.20k
  // these nodes. We continually insert before 'N' in sequence as this is
1761
2.20k
  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
1762
2.20k
  // hierarchy left to express.
1763
2.20k
  insertDAGNode(DAG, N, NewSRLAmt);
1764
2.20k
  insertDAGNode(DAG, N, NewSRL);
1765
2.20k
  insertDAGNode(DAG, N, NewSHLAmt);
1766
2.20k
  insertDAGNode(DAG, N, NewSHL);
1767
2.20k
  DAG.ReplaceAllUsesWith(N, NewSHL);
1768
2.20k
  DAG.RemoveDeadNode(N.getNode());
1769
2.20k
1770
2.20k
  AM.Scale = 1 << AMShiftAmt;
1771
2.20k
  AM.IndexReg = NewSRL;
1772
2.20k
  return false;
1773
2.20k
}
1774
1775
// Transform "(X >> SHIFT) & (MASK << C1)" to
1776
// "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be
1777
// matched to a BEXTR later. Returns false if the simplification is performed.
1778
static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N,
1779
                                   uint64_t Mask,
1780
                                   SDValue Shift, SDValue X,
1781
                                   X86ISelAddressMode &AM,
1782
888
                                   const X86Subtarget &Subtarget) {
1783
888
  if (Shift.getOpcode() != ISD::SRL ||
1784
888
      !isa<ConstantSDNode>(Shift.getOperand(1)) ||
1785
888
      !Shift.hasOneUse() || 
!N.hasOneUse()828
)
1786
136
    return true;
1787
752
1788
752
  // Only do this if BEXTR will be matched by matchBEXTRFromAndImm.
1789
752
  if (!Subtarget.hasTBM() &&
1790
752
      
!(748
Subtarget.hasBMI()748
&&
Subtarget.hasFastBEXTR()118
))
1791
744
    return true;
1792
8
1793
8
  // We need to ensure that mask is a continuous run of bits.
1794
8
  if (!isShiftedMask_64(Mask)) 
return true0
;
1795
8
1796
8
  unsigned ShiftAmt = Shift.getConstantOperandVal(1);
1797
8
1798
8
  // The amount of shift we're trying to fit into the addressing mode is taken
1799
8
  // from the trailing zeros of the mask.
1800
8
  unsigned AMShiftAmt = countTrailingZeros(Mask);
1801
8
1802
8
  // There is nothing we can do here unless the mask is removing some bits.
1803
8
  // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
1804
8
  if (AMShiftAmt <= 0 || AMShiftAmt > 3) 
return true0
;
1805
8
1806
8
  MVT VT = N.getSimpleValueType();
1807
8
  SDLoc DL(N);
1808
8
  SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8);
1809
8
  SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt);
1810
8
  SDValue NewMask = DAG.getConstant(Mask >> AMShiftAmt, DL, VT);
1811
8
  SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, NewSRL, NewMask);
1812
8
  SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8);
1813
8
  SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewAnd, NewSHLAmt);
1814
8
1815
8
  // Insert the new nodes into the topological ordering. We must do this in
1816
8
  // a valid topological ordering as nothing is going to go back and re-sort
1817
8
  // these nodes. We continually insert before 'N' in sequence as this is
1818
8
  // essentially a pre-flattened and pre-sorted sequence of nodes. There is no
1819
8
  // hierarchy left to express.
1820
8
  insertDAGNode(DAG, N, NewSRLAmt);
1821
8
  insertDAGNode(DAG, N, NewSRL);
1822
8
  insertDAGNode(DAG, N, NewMask);
1823
8
  insertDAGNode(DAG, N, NewAnd);
1824
8
  insertDAGNode(DAG, N, NewSHLAmt);
1825
8
  insertDAGNode(DAG, N, NewSHL);
1826
8
  DAG.ReplaceAllUsesWith(N, NewSHL);
1827
8
  DAG.RemoveDeadNode(N.getNode());
1828
8
1829
8
  AM.Scale = 1 << AMShiftAmt;
1830
8
  AM.IndexReg = NewAnd;
1831
8
  return false;
1832
8
}
1833
1834
bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
1835
1.55M
                                              unsigned Depth) {
1836
1.55M
  SDLoc dl(N);
1837
1.55M
  LLVM_DEBUG({
1838
1.55M
    dbgs() << "MatchAddress: ";
1839
1.55M
    AM.dump(CurDAG);
1840
1.55M
  });
1841
1.55M
  // Limit recursion.
1842
1.55M
  if (Depth > 5)
1843
20.2k
    return matchAddressBase(N, AM);
1844
1.53M
1845
1.53M
  // If this is already a %rip relative address, we can only merge immediates
1846
1.53M
  // into it.  Instead of handling this in every case, we handle it here.
1847
1.53M
  // RIP relative addressing: %rip + 32-bit displacement!
1848
1.53M
  if (AM.isRIPRelative()) {
1849
5.93k
    // FIXME: JumpTable and ExternalSymbol address currently don't like
1850
5.93k
    // displacements.  It isn't very important, but this should be fixed for
1851
5.93k
    // consistency.
1852
5.93k
    if (!(AM.ES || AM.MCSym) && AM.JT != -1)
1853
0
      return true;
1854
5.93k
1855
5.93k
    if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N))
1856
4.00k
      if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM))
1857
4.00k
        return false;
1858
1.93k
    return true;
1859
1.93k
  }
1860
1.52M
1861
1.52M
  switch (N.getOpcode()) {
1862
1.52M
  
default: break363k
;
1863
1.52M
  case ISD::LOCAL_RECOVER: {
1864
29
    if (!AM.hasSymbolicDisplacement() && AM.Disp == 0)
1865
17
      if (const auto *ESNode = dyn_cast<MCSymbolSDNode>(N.getOperand(0))) {
1866
17
        // Use the symbol and don't prefix it.
1867
17
        AM.MCSym = ESNode->getMCSymbol();
1868
17
        return false;
1869
17
      }
1870
12
    break;
1871
12
  }
1872
299k
  case ISD::Constant: {
1873
299k
    uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
1874
299k
    if (!foldOffsetIntoAddress(Val, AM))
1875
295k
      return false;
1876
4.30k
    break;
1877
4.30k
  }
1878
4.30k
1879
121k
  case X86ISD::Wrapper:
1880
121k
  case X86ISD::WrapperRIP:
1881
121k
    if (!matchWrapper(N, AM))
1882
119k
      return false;
1883
2.56k
    break;
1884
2.56k
1885
72.2k
  case ISD::LOAD:
1886
72.2k
    if (!matchLoadInAddress(cast<LoadSDNode>(N), AM))
1887
97
      return false;
1888
72.1k
    break;
1889
72.1k
1890
148k
  case ISD::FrameIndex:
1891
148k
    if (AM.BaseType == X86ISelAddressMode::RegBase &&
1892
148k
        AM.Base_Reg.getNode() == nullptr &&
1893
148k
        
(148k
!Subtarget->is64Bit()148k
||
isDispSafeForFrameIndex(AM.Disp)91.4k
)) {
1894
148k
      AM.BaseType = X86ISelAddressMode::FrameIndexBase;
1895
148k
      AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
1896
148k
      return false;
1897
148k
    }
1898
27
    break;
1899
27
1900
58.7k
  case ISD::SHL:
1901
58.7k
    if (AM.IndexReg.getNode() != nullptr || 
AM.Scale != 151.0k
)
1902
7.63k
      break;
1903
51.0k
1904
51.0k
    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
1905
45.6k
      unsigned Val = CN->getZExtValue();
1906
45.6k
      // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
1907
45.6k
      // that the base operand remains free for further matching. If
1908
45.6k
      // the base doesn't end up getting used, a post-processing step
1909
45.6k
      // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
1910
45.6k
      if (Val == 1 || 
Val == 243.5k
||
Val == 333.9k
) {
1911
27.8k
        AM.Scale = 1 << Val;
1912
27.8k
        SDValue ShVal = N.getOperand(0);
1913
27.8k
1914
27.8k
        // Okay, we know that we have a scale by now.  However, if the scaled
1915
27.8k
        // value is an add of something and a constant, we can fold the
1916
27.8k
        // constant into the disp field here.
1917
27.8k
        if (CurDAG->isBaseWithConstantOffset(ShVal)) {
1918
253
          AM.IndexReg = ShVal.getOperand(0);
1919
253
          ConstantSDNode *AddVal = cast<ConstantSDNode>(ShVal.getOperand(1));
1920
253
          uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val;
1921
253
          if (!foldOffsetIntoAddress(Disp, AM))
1922
253
            return false;
1923
27.6k
        }
1924
27.6k
1925
27.6k
        AM.IndexReg = ShVal;
1926
27.6k
        return false;
1927
27.6k
      }
1928
45.6k
    }
1929
23.1k
    break;
1930
23.1k
1931
23.1k
  case ISD::SRL: {
1932
6.75k
    // Scale must not be used already.
1933
6.75k
    if (AM.IndexReg.getNode() != nullptr || 
AM.Scale != 16.28k
)
break464
;
1934
6.28k
1935
6.28k
    // We only handle up to 64-bit values here as those are what matter for
1936
6.28k
    // addressing mode optimizations.
1937
6.28k
    assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
1938
6.28k
           "Unexpected value size!");
1939
6.28k
1940
6.28k
    SDValue And = N.getOperand(0);
1941
6.28k
    if (And.getOpcode() != ISD::AND) 
break6.07k
;
1942
211
    SDValue X = And.getOperand(0);
1943
211
1944
211
    // The mask used for the transform is expected to be post-shift, but we
1945
211
    // found the shift first so just apply the shift to the mask before passing
1946
211
    // it down.
1947
211
    if (!isa<ConstantSDNode>(N.getOperand(1)) ||
1948
211
        !isa<ConstantSDNode>(And.getOperand(1)))
1949
0
      break;
1950
211
    uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1);
1951
211
1952
211
    // Try to fold the mask and shift into the scale, and return false if we
1953
211
    // succeed.
1954
211
    if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM))
1955
0
      return false;
1956
211
    break;
1957
211
  }
1958
211
1959
643
  case ISD::SMUL_LOHI:
1960
643
  case ISD::UMUL_LOHI:
1961
643
    // A mul_lohi where we need the low part can be folded as a plain multiply.
1962
643
    if (N.getResNo() != 0) 
break639
;
1963
4
    LLVM_FALLTHROUGH;
1964
14.1k
  case ISD::MUL:
1965
14.1k
  case X86ISD::MUL_IMM:
1966
14.1k
    // X*[3,5,9] -> X+X*[2,4,8]
1967
14.1k
    if (AM.BaseType == X86ISelAddressMode::RegBase &&
1968
14.1k
        AM.Base_Reg.getNode() == nullptr &&
1969
14.1k
        
AM.IndexReg.getNode() == nullptr10.4k
) {
1970
9.87k
      if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1)))
1971
7.44k
        if (CN->getZExtValue() == 3 || 
CN->getZExtValue() == 56.00k
||
1972
7.44k
            
CN->getZExtValue() == 95.20k
) {
1973
2.63k
          AM.Scale = unsigned(CN->getZExtValue())-1;
1974
2.63k
1975
2.63k
          SDValue MulVal = N.getOperand(0);
1976
2.63k
          SDValue Reg;
1977
2.63k
1978
2.63k
          // Okay, we know that we have a scale by now.  However, if the scaled
1979
2.63k
          // value is an add of something and a constant, we can fold the
1980
2.63k
          // constant into the disp field here.
1981
2.63k
          if (MulVal.getNode()->getOpcode() == ISD::ADD && 
MulVal.hasOneUse()44
&&
1982
2.63k
              
isa<ConstantSDNode>(MulVal.getOperand(1))25
) {
1983
0
            Reg = MulVal.getOperand(0);
1984
0
            ConstantSDNode *AddVal =
1985
0
              cast<ConstantSDNode>(MulVal.getOperand(1));
1986
0
            uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
1987
0
            if (foldOffsetIntoAddress(Disp, AM))
1988
0
              Reg = N.getOperand(0);
1989
2.63k
          } else {
1990
2.63k
            Reg = N.getOperand(0);
1991
2.63k
          }
1992
2.63k
1993
2.63k
          AM.IndexReg = AM.Base_Reg = Reg;
1994
2.63k
          return false;
1995
2.63k
        }
1996
11.4k
    }
1997
11.4k
    break;
1998
11.4k
1999
15.5k
  case ISD::SUB: {
2000
15.5k
    // Given A-B, if A can be completely folded into the address and
2001
15.5k
    // the index field with the index field unused, use -B as the index.
2002
15.5k
    // This is a win if a has multiple parts that can be folded into
2003
15.5k
    // the address. Also, this saves a mov if the base register has
2004
15.5k
    // other uses, since it avoids a two-address sub instruction, however
2005
15.5k
    // it costs an additional mov if the index register has other uses.
2006
15.5k
2007
15.5k
    // Add an artificial use to this node so that we can keep track of
2008
15.5k
    // it if it gets CSE'd with a different node.
2009
15.5k
    HandleSDNode Handle(N);
2010
15.5k
2011
15.5k
    // Test if the LHS of the sub can be folded.
2012
15.5k
    X86ISelAddressMode Backup = AM;
2013
15.5k
    if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) {
2014
332
      N = Handle.getValue();
2015
332
      AM = Backup;
2016
332
      break;
2017
332
    }
2018
15.2k
    N = Handle.getValue();
2019
15.2k
    // Test if the index field is free for use.
2020
15.2k
    if (AM.IndexReg.getNode() || 
AM.isRIPRelative()13.9k
) {
2021
1.26k
      AM = Backup;
2022
1.26k
      break;
2023
1.26k
    }
2024
13.9k
2025
13.9k
    int Cost = 0;
2026
13.9k
    SDValue RHS = N.getOperand(1);
2027
13.9k
    // If the RHS involves a register with multiple uses, this
2028
13.9k
    // transformation incurs an extra mov, due to the neg instruction
2029
13.9k
    // clobbering its operand.
2030
13.9k
    if (!RHS.getNode()->hasOneUse() ||
2031
13.9k
        
RHS.getNode()->getOpcode() == ISD::CopyFromReg7.72k
||
2032
13.9k
        
RHS.getNode()->getOpcode() == ISD::TRUNCATE3.56k
||
2033
13.9k
        
RHS.getNode()->getOpcode() == ISD::ANY_EXTEND3.40k
||
2034
13.9k
        
(3.37k
RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND3.37k
&&
2035
3.37k
         
RHS.getOperand(0).getValueType() == MVT::i32221
))
2036
10.6k
      ++Cost;
2037
13.9k
    // If the base is a register with multiple uses, this
2038
13.9k
    // transformation may save a mov.
2039
13.9k
    if ((AM.BaseType == X86ISelAddressMode::RegBase && 
AM.Base_Reg.getNode()13.9k
&&
2040
13.9k
         
!AM.Base_Reg.getNode()->hasOneUse()9.98k
) ||
2041
13.9k
        
AM.BaseType == X86ISelAddressMode::FrameIndexBase10.9k
)
2042
2.98k
      --Cost;
2043
13.9k
    // If the folded LHS was interesting, this transformation saves
2044
13.9k
    // address arithmetic.
2045
13.9k
    if ((AM.hasSymbolicDisplacement() && 
!Backup.hasSymbolicDisplacement()1
) +
2046
13.9k
        ((AM.Disp != 0) && 
(Backup.Disp == 0)1.18k
) +
2047
13.9k
        (AM.Segment.getNode() && 
!Backup.Segment.getNode()13.5k
) >= 2)
2048
0
      --Cost;
2049
13.9k
    // If it doesn't look like it may be an overall win, don't do it.
2050
13.9k
    if (Cost >= 0) {
2051
13.0k
      AM = Backup;
2052
13.0k
      break;
2053
13.0k
    }
2054
930
2055
930
    // Ok, the transformation is legal and appears profitable. Go for it.
2056
930
    // Negation will be emitted later to avoid creating dangling nodes if this
2057
930
    // was an unprofitable LEA.
2058
930
    AM.IndexReg = RHS;
2059
930
    AM.NegateIndex = true;
2060
930
    AM.Scale = 1;
2061
930
    return false;
2062
930
  }
2063
930
2064
391k
  case ISD::ADD:
2065
391k
    if (!matchAdd(N, AM, Depth))
2066
362k
      return false;
2067
28.2k
    break;
2068
28.2k
2069
28.2k
  case ISD::OR:
2070
23.0k
    // We want to look through a transform in InstCombine and DAGCombiner that
2071
23.0k
    // turns 'add' into 'or', so we can treat this 'or' exactly like an 'add'.
2072
23.0k
    // Example: (or (and x, 1), (shl y, 3)) --> (add (and x, 1), (shl y, 3))
2073
23.0k
    // An 'lea' can then be used to match the shift (multiply) and add:
2074
23.0k
    // and $1, %esi
2075
23.0k
    // lea (%rsi, %rdi, 8), %rax
2076
23.0k
    if (CurDAG->haveNoCommonBitsSet(N.getOperand(0), N.getOperand(1)) &&
2077
23.0k
        
!matchAdd(N, AM, Depth)18.3k
)
2078
16.3k
      return false;
2079
6.75k
    break;
2080
6.75k
2081
10.0k
  case ISD::AND: {
2082
10.0k
    // Perform some heroic transforms on an and of a constant-count shift
2083
10.0k
    // with a constant to enable use of the scaled offset field.
2084
10.0k
2085
10.0k
    // Scale must not be used already.
2086
10.0k
    if (AM.IndexReg.getNode() != nullptr || 
AM.Scale != 18.23k
)
break1.80k
;
2087
8.23k
2088
8.23k
    // We only handle up to 64-bit values here as those are what matter for
2089
8.23k
    // addressing mode optimizations.
2090
8.23k
    assert(N.getSimpleValueType().getSizeInBits() <= 64 &&
2091
8.23k
           "Unexpected value size!");
2092
8.23k
2093
8.23k
    if (!isa<ConstantSDNode>(N.getOperand(1)))
2094
537
      break;
2095
7.70k
2096
7.70k
    if (N.getOperand(0).getOpcode() == ISD::SRL) {
2097
3.10k
      SDValue Shift = N.getOperand(0);
2098
3.10k
      SDValue X = Shift.getOperand(0);
2099
3.10k
2100
3.10k
      uint64_t Mask = N.getConstantOperandVal(1);
2101
3.10k
2102
3.10k
      // Try to fold the mask and shift into an extract and scale.
2103
3.10k
      if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM))
2104
14
        return false;
2105
3.09k
2106
3.09k
      // Try to fold the mask and shift directly into the scale.
2107
3.09k
      if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM))
2108
2.20k
        return false;
2109
888
2110
888
      // Try to fold the mask and shift into BEXTR and scale.
2111
888
      if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget))
2112
8
        return false;
2113
5.47k
    }
2114
5.47k
2115
5.47k
    // Try to swap the mask and shift to place shifts which can be done as
2116
5.47k
    // a scale on the outside of the mask.
2117
5.47k
    if (!foldMaskedShiftToScaledMask(*CurDAG, N, AM))
2118
58
      return false;
2119
5.41k
2120
5.41k
    break;
2121
5.41k
  }
2122
5.41k
  case ISD::ZERO_EXTEND: {
2123
2.41k
    // Try to widen a zexted shift left to the same size as its use, so we can
2124
2.41k
    // match the shift as a scale factor.
2125
2.41k
    if (AM.IndexReg.getNode() != nullptr || 
AM.Scale != 11.60k
)
2126
808
      break;
2127
1.60k
    if (N.getOperand(0).getOpcode() != ISD::SHL || 
!N.getOperand(0).hasOneUse()23
)
2128
1.57k
      break;
2129
23
2130
23
    // Give up if the shift is not a valid scale factor [1,2,3].
2131
23
    SDValue Shl = N.getOperand(0);
2132
23
    auto *ShAmtC = dyn_cast<ConstantSDNode>(Shl.getOperand(1));
2133
23
    if (!ShAmtC || 
ShAmtC->getZExtValue() > 317
)
2134
9
      break;
2135
14
2136
14
    // The narrow shift must only shift out zero bits (it must be 'nuw').
2137
14
    // That makes it safe to widen to the destination type.
2138
14
    APInt HighZeros = APInt::getHighBitsSet(Shl.getValueSizeInBits(),
2139
14
                                            ShAmtC->getZExtValue());
2140
14
    if (!CurDAG->MaskedValueIsZero(Shl.getOperand(0), HighZeros))
2141
6
      break;
2142
8
2143
8
    // zext (shl nuw i8 %x, C) to i32 --> shl (zext i8 %x to i32), (zext C)
2144
8
    MVT VT = N.getSimpleValueType();
2145
8
    SDLoc DL(N);
2146
8
    SDValue Zext = CurDAG->getNode(ISD::ZERO_EXTEND, DL, VT, Shl.getOperand(0));
2147
8
    SDValue NewShl = CurDAG->getNode(ISD::SHL, DL, VT, Zext, Shl.getOperand(1));
2148
8
2149
8
    // Convert the shift to scale factor.
2150
8
    AM.Scale = 1 << ShAmtC->getZExtValue();
2151
8
    AM.IndexReg = Zext;
2152
8
2153
8
    insertDAGNode(*CurDAG, N, Zext);
2154
8
    insertDAGNode(*CurDAG, N, NewShl);
2155
8
    CurDAG->ReplaceAllUsesWith(N, NewShl);
2156
8
    CurDAG->RemoveDeadNode(N.getNode());
2157
8
    return false;
2158
8
  }
2159
552k
  }
2160
552k
2161
552k
  return matchAddressBase(N, AM);
2162
552k
}
2163
2164
/// Helper for MatchAddress. Add the specified node to the
2165
/// specified addressing mode without any further recursion.
2166
573k
bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
2167
573k
  // Is the base register already occupied?
2168
573k
  if (AM.BaseType != X86ISelAddressMode::RegBase || 
AM.Base_Reg.getNode()572k
) {
2169
138k
    // If so, check to see if the scale index register is set.
2170
138k
    if (!AM.IndexReg.getNode()) {
2171
72.0k
      AM.IndexReg = N;
2172
72.0k
      AM.Scale = 1;
2173
72.0k
      return false;
2174
72.0k
    }
2175
66.0k
2176
66.0k
    // Otherwise, we cannot select it.
2177
66.0k
    return true;
2178
66.0k
  }
2179
435k
2180
435k
  // Default, generate it as a register.
2181
435k
  AM.BaseType = X86ISelAddressMode::RegBase;
2182
435k
  AM.Base_Reg = N;
2183
435k
  return false;
2184
435k
}
2185
2186
/// Helper for selectVectorAddr. Handles things that can be folded into a
2187
/// gather scatter address. The index register and scale should have already
2188
/// been handled.
2189
864
bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
2190
864
  // TODO: Support other operations.
2191
864
  switch (N.getOpcode()) {
2192
864
  case ISD::Constant: {
2193
260
    uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
2194
260
    if (!foldOffsetIntoAddress(Val, AM))
2195
260
      return false;
2196
0
    break;
2197
0
  }
2198
8
  case X86ISD::Wrapper:
2199
8
    if (!matchWrapper(N, AM))
2200
7
      return false;
2201
1
    break;
2202
597
  }
2203
597
2204
597
  return matchAddressBase(N, AM);
2205
597
}
2206
2207
bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
2208
                                       SDValue &Scale, SDValue &Index,
2209
864
                                       SDValue &Disp, SDValue &Segment) {
2210
864
  X86ISelAddressMode AM;
2211
864
  auto *Mgs = cast<X86MaskedGatherScatterSDNode>(Parent);
2212
864
  AM.IndexReg = Mgs->getIndex();
2213
864
  AM.Scale = cast<ConstantSDNode>(Mgs->getScale())->getZExtValue();
2214
864
2215
864
  unsigned AddrSpace = cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
2216
864
  // AddrSpace 256 -> GS, 257 -> FS, 258 -> SS.
2217
864
  if (AddrSpace == 256)
2218
0
    AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
2219
864
  if (AddrSpace == 257)
2220
0
    AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
2221
864
  if (AddrSpace == 258)
2222
0
    AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
2223
864
2224
864
  SDLoc DL(N);
2225
864
  MVT VT = N.getSimpleValueType();
2226
864
2227
864
  // Try to match into the base and displacement fields.
2228
864
  if (matchVectorAddress(N, AM))
2229
0
    return false;
2230
864
2231
864
  getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
2232
864
  return true;
2233
864
}
2234
2235
/// Returns true if it is able to pattern match an addressing mode.
2236
/// It returns the operands which make up the maximal addressing mode it can
2237
/// match by reference.
2238
///
2239
/// Parent is the parent node of the addr operand that is being matched.  It
2240
/// is always a load, store, atomic node, or null.  It is only null when
2241
/// checking memory operands for inline asm nodes.
2242
bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
2243
                                 SDValue &Scale, SDValue &Index,
2244
493k
                                 SDValue &Disp, SDValue &Segment) {
2245
493k
  X86ISelAddressMode AM;
2246
493k
2247
493k
  if (Parent &&
2248
493k
      // This list of opcodes are all the nodes that have an "addr:$ptr" operand
2249
493k
      // that are not a MemSDNode, and thus don't have proper addrspace info.
2250
493k
      
Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN492k
&& // unaligned loads, fixme
2251
493k
      
Parent->getOpcode() != ISD::INTRINSIC_VOID492k
&& // nontemporal stores
2252
493k
      
Parent->getOpcode() != X86ISD::TLSCALL492k
&& // Fixme
2253
493k
      
Parent->getOpcode() != X86ISD::ENQCMD492k
&& // Fixme
2254
493k
      
Parent->getOpcode() != X86ISD::ENQCMDS492k
&& // Fixme
2255
493k
      
Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP492k
&& // setjmp
2256
493k
      
Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP492k
) { // longjmp
2257
492k
    unsigned AddrSpace =
2258
492k
      cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
2259
492k
    // AddrSpace 256 -> GS, 257 -> FS, 258 -> SS.
2260
492k
    if (AddrSpace == 256)
2261
355
      AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
2262
492k
    if (AddrSpace == 257)
2263
476
      AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
2264
492k
    if (AddrSpace == 258)
2265
0
      AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
2266
492k
  }
2267
493k
2268
493k
  // Save the DL and VT before calling matchAddress, it can invalidate N.
2269
493k
  SDLoc DL(N);
2270
493k
  MVT VT = N.getSimpleValueType();
2271
493k
2272
493k
  if (matchAddress(N, AM))
2273
0
    return false;
2274
493k
2275
493k
  getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
2276
493k
  return true;
2277
493k
}
2278
2279
// We can only fold a load if all nodes between it and the root node have a
2280
// single use. If there are additional uses, we could end up duplicating the
2281
// load.
2282
1.02k
static bool hasSingleUsesFromRoot(SDNode *Root, SDNode *User) {
2283
1.12k
  while (User != Root) {
2284
102
    if (!User->hasOneUse())
2285
4
      return false;
2286
98
    User = *User->use_begin();
2287
98
  }
2288
1.02k
2289
1.02k
  
return true1.01k
;
2290
1.02k
}
2291
2292
/// Match a scalar SSE load. In particular, we want to match a load whose top
2293
/// elements are either undef or zeros. The load flavor is derived from the
2294
/// type of N, which is either v4f32 or v2f64.
2295
///
2296
/// We also return:
2297
///   PatternChainNode: this is the matched node that has a chain input and
2298
///   output.
2299
bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root, SDNode *Parent,
2300
                                          SDValue N, SDValue &Base,
2301
                                          SDValue &Scale, SDValue &Index,
2302
                                          SDValue &Disp, SDValue &Segment,
2303
1.02k
                                          SDValue &PatternNodeWithChain) {
2304
1.02k
  if (!hasSingleUsesFromRoot(Root, Parent))
2305
4
    return false;
2306
1.01k
2307
1.01k
  // We can allow a full vector load here since narrowing a load is ok unless
2308
1.01k
  // it's volatile.
2309
1.01k
  if (ISD::isNON_EXTLoad(N.getNode())) {
2310
66
    LoadSDNode *LD = cast<LoadSDNode>(N);
2311
66
    if (!LD->isVolatile() &&
2312
66
        IsProfitableToFold(N, LD, Root) &&
2313
66
        IsLegalToFold(N, Parent, Root, OptLevel)) {
2314
66
      PatternNodeWithChain = N;
2315
66
      return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
2316
66
                        Segment);
2317
66
    }
2318
953
  }
2319
953
2320
953
  // We can also match the special zero extended load opcode.
2321
953
  if (N.getOpcode() == X86ISD::VZEXT_LOAD) {
2322
44
    PatternNodeWithChain = N;
2323
44
    if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
2324
44
        
IsLegalToFold(PatternNodeWithChain, Parent, Root, OptLevel)43
) {
2325
43
      auto *MI = cast<MemIntrinsicSDNode>(PatternNodeWithChain);
2326
43
      return selectAddr(MI, MI->getBasePtr(), Base, Scale, Index, Disp,
2327
43
                        Segment);
2328
43
    }
2329
910
  }
2330
910
2331
910
  // Need to make sure that the SCALAR_TO_VECTOR and load are both only used
2332
910
  // once. Otherwise the load might get duplicated and the chain output of the
2333
910
  // duplicate load will not be observed by all dependencies.
2334
910
  if (N.getOpcode() == ISD::SCALAR_TO_VECTOR && 
N.getNode()->hasOneUse()53
) {
2335
41
    PatternNodeWithChain = N.getOperand(0);
2336
41
    if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) &&
2337
41
        IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
2338
41
        IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel)) {
2339
35
      LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
2340
35
      return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
2341
35
                        Segment);
2342
35
    }
2343
875
  }
2344
875
2345
875
  return false;
2346
875
}
2347
2348
2349
24.2k
bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
2350
24.2k
  if (const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
2351
23.1k
    uint64_t ImmVal = CN->getZExtValue();
2352
23.1k
    if (!isUInt<32>(ImmVal))
2353
10.6k
      return false;
2354
12.4k
2355
12.4k
    Imm = CurDAG->getTargetConstant(ImmVal, SDLoc(N), MVT::i64);
2356
12.4k
    return true;
2357
12.4k
  }
2358
1.06k
2359
1.06k
  // In static codegen with small code model, we can get the address of a label
2360
1.06k
  // into a register with 'movl'
2361
1.06k
  if (N->getOpcode() != X86ISD::Wrapper)
2362
0
    return false;
2363
1.06k
2364
1.06k
  N = N.getOperand(0);
2365
1.06k
2366
1.06k
  // At least GNU as does not accept 'movl' for TPOFF relocations.
2367
1.06k
  // FIXME: We could use 'movl' when we know we are targeting MC.
2368
1.06k
  if (N->getOpcode() == ISD::TargetGlobalTLSAddress)
2369
1
    return false;
2370
1.06k
2371
1.06k
  Imm = N;
2372
1.06k
  if (N->getOpcode() != ISD::TargetGlobalAddress)
2373
63
    return TM.getCodeModel() == CodeModel::Small;
2374
1.00k
2375
1.00k
  Optional<ConstantRange> CR =
2376
1.00k
      cast<GlobalAddressSDNode>(N)->getGlobal()->getAbsoluteSymbolRange();
2377
1.00k
  if (!CR)
2378
994
    return TM.getCodeModel() == CodeModel::Small;
2379
8
2380
8
  return CR->getUnsignedMax().ult(1ull << 32);
2381
8
}
2382
2383
bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base,
2384
                                         SDValue &Scale, SDValue &Index,
2385
17.0k
                                         SDValue &Disp, SDValue &Segment) {
2386
17.0k
  // Save the debug loc before calling selectLEAAddr, in case it invalidates N.
2387
17.0k
  SDLoc DL(N);
2388
17.0k
2389
17.0k
  if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment))
2390
15.3k
    return false;
2391
1.70k
2392
1.70k
  RegisterSDNode *RN = dyn_cast<RegisterSDNode>(Base);
2393
1.70k
  if (RN && 
RN->getReg() == 0185
)
2394
184
    Base = CurDAG->getRegister(0, MVT::i64);
2395
1.51k
  else if (Base.getValueType() == MVT::i32 && 
!isa<FrameIndexSDNode>(Base)1.51k
) {
2396
1.46k
    // Base could already be %rip, particularly in the x32 ABI.
2397
1.46k
    SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
2398
1.46k
                                                     MVT::i64), 0);
2399
1.46k
    Base = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
2400
1.46k
                                         Base);
2401
1.46k
  }
2402
1.70k
2403
1.70k
  RN = dyn_cast<RegisterSDNode>(Index);
2404
1.70k
  if (RN && 
RN->getReg() == 044
)
2405
44
    Index = CurDAG->getRegister(0, MVT::i64);
2406
1.65k
  else {
2407
1.65k
    assert(Index.getValueType() == MVT::i32 &&
2408
1.65k
           "Expect to be extending 32-bit registers for use in LEA");
2409
1.65k
    SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL,
2410
1.65k
                                                     MVT::i64), 0);
2411
1.65k
    Index = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef,
2412
1.65k
                                          Index);
2413
1.65k
  }
2414
1.70k
2415
1.70k
  return true;
2416
1.70k
}
2417
2418
/// Calls SelectAddr and determines if the maximal addressing
2419
/// mode it matches can be cost effectively emitted as an LEA instruction.
2420
bool X86DAGToDAGISel::selectLEAAddr(SDValue N,
2421
                                    SDValue &Base, SDValue &Scale,
2422
                                    SDValue &Index, SDValue &Disp,
2423
183k
                                    SDValue &Segment) {
2424
183k
  X86ISelAddressMode AM;
2425
183k
2426
183k
  // Save the DL and VT before calling matchAddress, it can invalidate N.
2427
183k
  SDLoc DL(N);
2428
183k
  MVT VT = N.getSimpleValueType();
2429
183k
2430
183k
  // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
2431
183k
  // segments.
2432
183k
  SDValue Copy = AM.Segment;
2433
183k
  SDValue T = CurDAG->getRegister(0, MVT::i32);
2434
183k
  AM.Segment = T;
2435
183k
  if (matchAddress(N, AM))
2436
0
    return false;
2437
183k
  assert (T == AM.Segment);
2438
183k
  AM.Segment = Copy;
2439
183k
2440
183k
  unsigned Complexity = 0;
2441
183k
  if (AM.BaseType == X86ISelAddressMode::RegBase && 
AM.Base_Reg.getNode()156k
)
2442
152k
    Complexity = 1;
2443
30.4k
  else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
2444
27.2k
    Complexity = 4;
2445
183k
2446
183k
  if (AM.IndexReg.getNode())
2447
32.0k
    Complexity++;
2448
183k
2449
183k
  // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with
2450
183k
  // a simple shift.
2451
183k
  if (AM.Scale > 1)
2452
9.65k
    Complexity++;
2453
183k
2454
183k
  // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
2455
183k
  // to a LEA. This is determined with some experimentation but is by no means
2456
183k
  // optimal (especially for code size consideration). LEA is nice because of
2457
183k
  // its three-address nature. Tweak the cost function again when we can run
2458
183k
  // convertToThreeAddress() at register allocation time.
2459
183k
  if (AM.hasSymbolicDisplacement()) {
2460
46.2k
    // For X86-64, always use LEA to materialize RIP-relative addresses.
2461
46.2k
    if (Subtarget->is64Bit())
2462
38.8k
      Complexity = 4;
2463
7.42k
    else
2464
7.42k
      Complexity += 2;
2465
46.2k
  }
2466
183k
2467
183k
  // Heuristic: try harder to form an LEA from ADD if the operands set flags.
2468
183k
  // Unlike ADD, LEA does not affect flags, so we will be less likely to require
2469
183k
  // duplicating flag-producing instructions later in the pipeline.
2470
183k
  if (N.getOpcode() == ISD::ADD) {
2471
78.3k
    auto isMathWithFlags = [](SDValue V) {
2472
78.3k
      switch (V.getOpcode()) {
2473
78.3k
      case X86ISD::ADD:
2474
58
      case X86ISD::SUB:
2475
58
      case X86ISD::ADC:
2476
58
      case X86ISD::SBB:
2477
58
      /* TODO: These opcodes can be added safely, but we may want to justify
2478
58
               their inclusion for different reasons (better for reg-alloc).
2479
58
      case X86ISD::SMUL:
2480
58
      case X86ISD::UMUL:
2481
58
      case X86ISD::OR:
2482
58
      case X86ISD::XOR:
2483
58
      case X86ISD::AND:
2484
58
      */
2485
58
        // Value 1 is the flag output of the node - verify it's not dead.
2486
58
        return !SDValue(V.getNode(), 1).use_empty();
2487
78.2k
      default:
2488
78.2k
        return false;
2489
78.3k
      }
2490
78.3k
    };
2491
78.3k
    // TODO: This could be an 'or' rather than 'and' to make the transform more
2492
78.3k
    //       likely to happen. We might want to factor in whether there's a
2493
78.3k
    //       load folding opportunity for the math op that disappears with LEA.
2494
78.3k
    if (isMathWithFlags(N.getOperand(0)) && 
isMathWithFlags(N.getOperand(1))7
)
2495
2
      Complexity++;
2496
78.3k
  }
2497
183k
2498
183k
  if (AM.Disp)
2499
57.5k
    Complexity++;
2500
183k
2501
183k
  // If it isn't worth using an LEA, reject it.
2502
183k
  if (Complexity <= 2)
2503
97.8k
    return false;
2504
85.3k
2505
85.3k
  getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
2506
85.3k
  return true;
2507
85.3k
}
2508
2509
/// This is only run on TargetGlobalTLSAddress nodes.
2510
bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
2511
                                        SDValue &Scale, SDValue &Index,
2512
63
                                        SDValue &Disp, SDValue &Segment) {
2513
63
  assert(N.getOpcode() == ISD::TargetGlobalTLSAddress);
2514
63
  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(N);
2515
63
2516
63
  X86ISelAddressMode AM;
2517
63
  AM.GV = GA->getGlobal();
2518
63
  AM.Disp += GA->getOffset();
2519
63
  AM.SymbolFlags = GA->getTargetFlags();
2520
63
2521
63
  MVT VT = N.getSimpleValueType();
2522
63
  if (VT == MVT::i32) {
2523
32
    AM.Scale = 1;
2524
32
    AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);
2525
32
  }
2526
63
2527
63
  getAddressOperands(AM, SDLoc(N), VT, Base, Scale, Index, Disp, Segment);
2528
63
  return true;
2529
63
}
2530
2531
200k
bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) {
2532
200k
  if (auto *CN = dyn_cast<ConstantSDNode>(N)) {
2533
124k
    Op = CurDAG->getTargetConstant(CN->getAPIntValue(), SDLoc(CN),
2534
124k
                                   N.getValueType());
2535
124k
    return true;
2536
124k
  }
2537
75.9k
2538
75.9k
  // Keep track of the original value type and whether this value was
2539
75.9k
  // truncated. If we see a truncation from pointer type to VT that truncates
2540
75.9k
  // bits that are known to be zero, we can use a narrow reference.
2541
75.9k
  EVT VT = N.getValueType();
2542
75.9k
  bool WasTruncated = false;
2543
75.9k
  if (N.getOpcode() == ISD::TRUNCATE) {
2544
4.57k
    WasTruncated = true;
2545
4.57k
    N = N.getOperand(0);
2546
4.57k
  }
2547
75.9k
2548
75.9k
  if (N.getOpcode() != X86ISD::Wrapper)
2549
74.7k
    return false;
2550
1.20k
2551
1.20k
  // We can only use non-GlobalValues as immediates if they were not truncated,
2552
1.20k
  // as we do not have any range information. If we have a GlobalValue and the
2553
1.20k
  // address was not truncated, we can select it as an operand directly.
2554
1.20k
  unsigned Opc = N.getOperand(0)->getOpcode();
2555
1.20k
  if (Opc != ISD::TargetGlobalAddress || 
!WasTruncated1.10k
) {
2556
1.19k
    Op = N.getOperand(0);
2557
1.19k
    // We can only select the operand directly if we didn't have to look past a
2558
1.19k
    // truncate.
2559
1.19k
    return !WasTruncated;
2560
1.19k
  }
2561
15
2562
15
  // Check that the global's range fits into VT.
2563
15
  auto *GA = cast<GlobalAddressSDNode>(N.getOperand(0));
2564
15
  Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
2565
15
  if (!CR || 
CR->getUnsignedMax().uge(1ull << VT.getSizeInBits())13
)
2566
2
    return false;
2567
13
2568
13
  // Okay, we can use a narrow reference.
2569
13
  Op = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), VT,
2570
13
                                      GA->getOffset(), GA->getTargetFlags());
2571
13
  return true;
2572
13
}
2573
2574
bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
2575
                                  SDValue &Base, SDValue &Scale,
2576
                                  SDValue &Index, SDValue &Disp,
2577
8.68k
                                  SDValue &Segment) {
2578
8.68k
  if (!ISD::isNON_EXTLoad(N.getNode()) ||
2579
8.68k
      
!IsProfitableToFold(N, P, Root)2.70k
||
2580
8.68k
      
!IsLegalToFold(N, P, Root, OptLevel)494
)
2581
8.19k
    return false;
2582
490
2583
490
  return selectAddr(N.getNode(),
2584
490
                    N.getOperand(1), Base, Scale, Index, Disp, Segment);
2585
490
}
2586
2587
/// Return an SDNode that returns the value of the global base register.
2588
/// Output instructions required to initialize the global base register,
2589
/// if necessary.
2590
8.61k
SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
2591
8.61k
  unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
2592
8.61k
  auto &DL = MF->getDataLayout();
2593
8.61k
  return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode();
2594
8.61k
}
2595
2596
174k
bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
2597
174k
  if (N->getOpcode() == ISD::TRUNCATE)
2598
5.79k
    N = N->getOperand(0).getNode();
2599
174k
  if (N->getOpcode() != X86ISD::Wrapper)
2600
174k
    return false;
2601
40
2602
40
  auto *GA = dyn_cast<GlobalAddressSDNode>(N->getOperand(0));
2603
40
  if (!GA)
2604
2
    return false;
2605
38
2606
38
  Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
2607
38
  return CR && 
CR->getSignedMin().sge(-1ull << Width)11
&&
2608
38
         
CR->getSignedMax().slt(1ull << Width)7
;
2609
38
}
2610
2611
3.88k
static X86::CondCode getCondFromNode(SDNode *N) {
2612
3.88k
  assert(N->isMachineOpcode() && "Unexpected node");
2613
3.88k
  X86::CondCode CC = X86::COND_INVALID;
2614
3.88k
  unsigned Opc = N->getMachineOpcode();
2615
3.88k
  if (Opc == X86::JCC_1)
2616
3.25k
    CC = static_cast<X86::CondCode>(N->getConstantOperandVal(1));
2617
635
  else if (Opc == X86::SETCCr)
2618
200
    CC = static_cast<X86::CondCode>(N->getConstantOperandVal(0));
2619
435
  else if (Opc == X86::SETCCm)
2620
3
    CC = static_cast<X86::CondCode>(N->getConstantOperandVal(5));
2621
432
  else if (Opc == X86::CMOV16rr || Opc == X86::CMOV32rr ||
2622
432
           
Opc == X86::CMOV64rr363
)
2623
113
    CC = static_cast<X86::CondCode>(N->getConstantOperandVal(2));
2624
319
  else if (Opc == X86::CMOV16rm || Opc == X86::CMOV32rm ||
2625
319
           Opc == X86::CMOV64rm)
2626
0
    CC = static_cast<X86::CondCode>(N->getConstantOperandVal(6));
2627
3.88k
2628
3.88k
  return CC;
2629
3.88k
}
2630
2631
/// Test whether the given X86ISD::CMP node has any users that use a flag
2632
/// other than ZF.
2633
1.48k
bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const {
2634
1.48k
  // Examine each user of the node.
2635
1.48k
  for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
2636
2.89k
         UI != UE; 
++UI1.41k
) {
2637
1.48k
    // Only check things that use the flags.
2638
1.48k
    if (UI.getUse().getResNo() != Flags.getResNo())
2639
0
      continue;
2640
1.48k
    // Only examine CopyToReg uses that copy to EFLAGS.
2641
1.48k
    if (UI->getOpcode() != ISD::CopyToReg ||
2642
1.48k
        cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
2643
0
      return false;
2644
1.48k
    // Examine each user of the CopyToReg use.
2645
1.48k
    for (SDNode::use_iterator FlagUI = UI->use_begin(),
2646
4.23k
           FlagUE = UI->use_end(); FlagUI != FlagUE; 
++FlagUI2.75k
) {
2647
2.82k
      // Only examine the Flag result.
2648
2.82k
      if (FlagUI.getUse().getResNo() != 1) 
continue1.34k
;
2649
1.48k
      // Anything unusual: assume conservatively.
2650
1.48k
      if (!FlagUI->isMachineOpcode()) 
return false0
;
2651
1.48k
      // Examine the condition code of the user.
2652
1.48k
      X86::CondCode CC = getCondFromNode(*FlagUI);
2653
1.48k
2654
1.48k
      switch (CC) {
2655
1.48k
      // Comparisons which only use the zero flag.
2656
1.48k
      
case X86::COND_E: 1.41k
case X86::COND_NE:
2657
1.41k
        continue;
2658
1.41k
      // Anything else: assume conservatively.
2659
1.41k
      default:
2660
69
        return false;
2661
1.48k
      }
2662
1.48k
    }
2663
1.48k
  }
2664
1.48k
  
return true1.41k
;
2665
1.48k
}
2666
2667
/// Test whether the given X86ISD::CMP node has any uses which require the SF
2668
/// flag to be accurate.
2669
35
bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const {
2670
35
  // Examine each user of the node.
2671
35
  for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
2672
62
         UI != UE; 
++UI27
) {
2673
35
    // Only check things that use the flags.
2674
35
    if (UI.getUse().getResNo() != Flags.getResNo())
2675
0
      continue;
2676
35
    // Only examine CopyToReg uses that copy to EFLAGS.
2677
35
    if (UI->getOpcode() != ISD::CopyToReg ||
2678
35
        cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
2679
0
      return false;
2680
35
    // Examine each user of the CopyToReg use.
2681
35
    for (SDNode::use_iterator FlagUI = UI->use_begin(),
2682
72
           FlagUE = UI->use_end(); FlagUI != FlagUE; 
++FlagUI37
) {
2683
45
      // Only examine the Flag result.
2684
45
      if (FlagUI.getUse().getResNo() != 1) 
continue10
;
2685
35
      // Anything unusual: assume conservatively.
2686
35
      if (!FlagUI->isMachineOpcode()) 
return false0
;
2687
35
      // Examine the condition code of the user.
2688
35
      X86::CondCode CC = getCondFromNode(*FlagUI);
2689
35
2690
35
      switch (CC) {
2691
35
      // Comparisons which don't examine the SF flag.
2692
35
      
case X86::COND_A: 27
case X86::COND_AE:
2693
27
      case X86::COND_B: case X86::COND_BE:
2694
27
      case X86::COND_E: case X86::COND_NE:
2695
27
      case X86::COND_O: case X86::COND_NO:
2696
27
      case X86::COND_P: case X86::COND_NP:
2697
27
        continue;
2698
27
      // Anything else: assume conservatively.
2699
27
      default:
2700
8
        return false;
2701
35
      }
2702
35
    }
2703
35
  }
2704
35
  
return true27
;
2705
35
}
2706
2707
2.37k
static bool mayUseCarryFlag(X86::CondCode CC) {
2708
2.37k
  switch (CC) {
2709
2.37k
  // Comparisons which don't examine the CF flag.
2710
2.37k
  
case X86::COND_O: 1.89k
case X86::COND_NO:
2711
1.89k
  case X86::COND_E: case X86::COND_NE:
2712
1.89k
  case X86::COND_S: case X86::COND_NS:
2713
1.89k
  case X86::COND_P: case X86::COND_NP:
2714
1.89k
  case X86::COND_L: case X86::COND_GE:
2715
1.89k
  case X86::COND_G: case X86::COND_LE:
2716
1.89k
    return false;
2717
1.89k
  // Anything else: assume conservatively.
2718
1.89k
  default:
2719
477
    return true;
2720
2.37k
  }
2721
2.37k
}
2722
2723
/// Test whether the given node which sets flags has any uses which require the
2724
/// CF flag to be accurate.
2725
2.55k
 bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const {
2726
2.55k
  // Examine each user of the node.
2727
2.55k
  for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end();
2728
6.69k
         UI != UE; 
++UI4.13k
) {
2729
4.61k
    // Only check things that use the flags.
2730
4.61k
    if (UI.getUse().getResNo() != Flags.getResNo())
2731
2.23k
      continue;
2732
2.37k
2733
2.37k
    unsigned UIOpc = UI->getOpcode();
2734
2.37k
2735
2.37k
    if (UIOpc == ISD::CopyToReg) {
2736
2.37k
      // Only examine CopyToReg uses that copy to EFLAGS.
2737
2.37k
      if (cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
2738
0
        return false;
2739
2.37k
      // Examine each user of the CopyToReg use.
2740
2.37k
      for (SDNode::use_iterator FlagUI = UI->use_begin(), FlagUE = UI->use_end();
2741
6.07k
           FlagUI != FlagUE; 
++FlagUI3.70k
) {
2742
4.18k
        // Only examine the Flag result.
2743
4.18k
        if (FlagUI.getUse().getResNo() != 1)
2744
1.81k
          continue;
2745
2.37k
        // Anything unusual: assume conservatively.
2746
2.37k
        if (!FlagUI->isMachineOpcode())
2747
0
          return false;
2748
2.37k
        // Examine the condition code of the user.
2749
2.37k
        X86::CondCode CC = getCondFromNode(*FlagUI);
2750
2.37k
2751
2.37k
        if (mayUseCarryFlag(CC))
2752
477
          return false;
2753
2.37k
      }
2754
2.37k
2755
2.37k
      // This CopyToReg is ok. Move on to the next user.
2756
2.37k
      
continue1.89k
;
2757
6
    }
2758
6
2759
6
    // This might be an unselected node. So look for the pre-isel opcodes that
2760
6
    // use flags.
2761
6
    unsigned CCOpNo;
2762
6
    switch (UIOpc) {
2763
6
    default:
2764
2
      // Something unusual. Be conservative.
2765
2
      return false;
2766
6
    
case X86ISD::SETCC: CCOpNo = 0; break1
;
2767
6
    
case X86ISD::SETCC_CARRY: CCOpNo = 0; break0
;
2768
6
    
case X86ISD::CMOV: CCOpNo = 2; break3
;
2769
6
    
case X86ISD::BRCOND: CCOpNo = 2; break0
;
2770
4
    }
2771
4
2772
4
    X86::CondCode CC = (X86::CondCode)UI->getConstantOperandVal(CCOpNo);
2773
4
    if (mayUseCarryFlag(CC))
2774
0
      return false;
2775
4
  }
2776
2.55k
  
return true2.07k
;
2777
2.55k
}
2778
2779
/// Check whether or not the chain ending in StoreNode is suitable for doing
2780
/// the {load; op; store} to modify transformation.
2781
static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
2782
                                        SDValue StoredVal, SelectionDAG *CurDAG,
2783
                                        unsigned LoadOpNo,
2784
                                        LoadSDNode *&LoadNode,
2785
1.84k
                                        SDValue &InputChain) {
2786
1.84k
  // Is the stored value result 0 of the operation?
2787
1.84k
  if (StoredVal.getResNo() != 0) 
return false0
;
2788
1.84k
2789
1.84k
  // Are there other uses of the operation other than the store?
2790
1.84k
  if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) 
return false282
;
2791
1.55k
2792
1.55k
  // Is the store non-extending and non-indexed?
2793
1.55k
  if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal())
2794
0
    return false;
2795
1.55k
2796
1.55k
  SDValue Load = StoredVal->getOperand(LoadOpNo);
2797
1.55k
  // Is the stored value a non-extending and non-indexed load?
2798
1.55k
  if (!ISD::isNormalLoad(Load.getNode())) 
return false989
;
2799
570
2800
570
  // Return LoadNode by reference.
2801
570
  LoadNode = cast<LoadSDNode>(Load);
2802
570
2803
570
  // Is store the only read of the loaded value?
2804
570
  if (!Load.hasOneUse())
2805
24
    return false;
2806
546
2807
546
  // Is the address of the store the same as the load?
2808
546
  if (LoadNode->getBasePtr() != StoreNode->getBasePtr() ||
2809
546
      
LoadNode->getOffset() != StoreNode->getOffset()326
)
2810
220
    return false;
2811
326
2812
326
  bool FoundLoad = false;
2813
326
  SmallVector<SDValue, 4> ChainOps;
2814
326
  SmallVector<const SDNode *, 4> LoopWorklist;
2815
326
  SmallPtrSet<const SDNode *, 16> Visited;
2816
326
  const unsigned int Max = 1024;
2817
326
2818
326
  //  Visualization of Load-Op-Store fusion:
2819
326
  // -------------------------
2820
326
  // Legend:
2821
326
  //    *-lines = Chain operand dependencies.
2822
326
  //    |-lines = Normal operand dependencies.
2823
326
  //    Dependencies flow down and right. n-suffix references multiple nodes.
2824
326
  //
2825
326
  //        C                        Xn  C
2826
326
  //        *                         *  *
2827
326
  //        *                          * *
2828
326
  //  Xn  A-LD    Yn                    TF         Yn
2829
326
  //   *    * \   |                       *        |
2830
326
  //    *   *  \  |                        *       |
2831
326
  //     *  *   \ |             =>       A--LD_OP_ST
2832
326
  //      * *    \|                                 \
2833
326
  //       TF    OP                                  \
2834
326
  //         *   | \                                  Zn
2835
326
  //          *  |  \
2836
326
  //         A-ST    Zn
2837
326
  //
2838
326
2839
326
  // This merge induced dependences from: #1: Xn -> LD, OP, Zn
2840
326
  //                                      #2: Yn -> LD
2841
326
  //                                      #3: ST -> Zn
2842
326
2843
326
  // Ensure the transform is safe by checking for the dual
2844
326
  // dependencies to make sure we do not induce a loop.
2845
326
2846
326
  // As LD is a predecessor to both OP and ST we can do this by checking:
2847
326
  //  a). if LD is a predecessor to a member of Xn or Yn.
2848
326
  //  b). if a Zn is a predecessor to ST.
2849
326
2850
326
  // However, (b) can only occur through being a chain predecessor to
2851
326
  // ST, which is the same as Zn being a member or predecessor of Xn,
2852
326
  // which is a subset of LD being a predecessor of Xn. So it's
2853
326
  // subsumed by check (a).
2854
326
2855
326
  SDValue Chain = StoreNode->getChain();
2856
326
2857
326
  // Gather X elements in ChainOps.
2858
326
  if (Chain == Load.getValue(1)) {
2859
229
    FoundLoad = true;
2860
229
    ChainOps.push_back(Load.getOperand(0));
2861
229
  } else 
if (97
Chain.getOpcode() == ISD::TokenFactor97
) {
2862
493
    for (unsigned i = 0, e = Chain.getNumOperands(); i != e; 
++i396
) {
2863
396
      SDValue Op = Chain.getOperand(i);
2864
396
      if (Op == Load.getValue(1)) {
2865
97
        FoundLoad = true;
2866
97
        // Drop Load, but keep its chain. No cycle check necessary.
2867
97
        ChainOps.push_back(Load.getOperand(0));
2868
97
        continue;
2869
97
      }
2870
299
      LoopWorklist.push_back(Op.getNode());
2871
299
      ChainOps.push_back(Op);
2872
299
    }
2873
97
  }
2874
326
2875
326
  if (!FoundLoad)
2876
0
    return false;
2877
326
2878
326
  // Worklist is currently Xn. Add Yn to worklist.
2879
326
  for (SDValue Op : StoredVal->ops())
2880
735
    if (Op.getNode() != LoadNode)
2881
409
      LoopWorklist.push_back(Op.getNode());
2882
326
2883
326
  // Check (a) if Load is a predecessor to Xn + Yn
2884
326
  if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max,
2885
326
                                   true))
2886
1
    return false;
2887
325
2888
325
  InputChain =
2889
325
      CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps);
2890
325
  return true;
2891
325
}
2892
2893
// Change a chain of {load; op; store} of the same value into a simple op
2894
// through memory of that value, if the uses of the modified value and its
2895
// address are suitable.
2896
//
2897
// The tablegen pattern memory operand pattern is currently not able to match
2898
// the case where the EFLAGS on the original operation are used.
2899
//
2900
// To move this to tablegen, we'll need to improve tablegen to allow flags to
2901
// be transferred from a node in the pattern to the result node, probably with
2902
// a new keyword. For example, we have this
2903
// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
2904
//  [(store (add (loadi64 addr:$dst), -1), addr:$dst),
2905
//   (implicit EFLAGS)]>;
2906
// but maybe need something like this
2907
// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
2908
//  [(store (add (loadi64 addr:$dst), -1), addr:$dst),
2909
//   (transferrable EFLAGS)]>;
2910
//
2911
// Until then, we manually fold these and instruction select the operation
2912
// here.
2913
200k
bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
2914
200k
  StoreSDNode *StoreNode = cast<StoreSDNode>(Node);
2915
200k
  SDValue StoredVal = StoreNode->getOperand(1);
2916
200k
  unsigned Opc = StoredVal->getOpcode();
2917
200k
2918
200k
  // Before we try to select anything, make sure this is memory operand size
2919
200k
  // and opcode we can handle. Note that this must match the code below that
2920
200k
  // actually lowers the opcodes.
2921
200k
  EVT MemVT = StoreNode->getMemoryVT();
2922
200k
  if (MemVT != MVT::i64 && 
MemVT != MVT::i32133k
&&
MemVT != MVT::i1647.7k
&&
2923
200k
      
MemVT != MVT::i843.6k
)
2924
28.8k
    return false;
2925
171k
2926
171k
  bool IsCommutable = false;
2927
171k
  bool IsNegate = false;
2928
171k
  switch (Opc) {
2929
171k
  default:
2930
170k
    return false;
2931
171k
  case X86ISD::SUB:
2932
109
    IsNegate = isNullConstant(StoredVal.getOperand(0));
2933
109
    break;
2934
171k
  case X86ISD::SBB:
2935
82
    break;
2936
171k
  case X86ISD::ADD:
2937
970
  case X86ISD::ADC:
2938
970
  case X86ISD::AND:
2939
970
  case X86ISD::OR:
2940
970
  case X86ISD::XOR:
2941
970
    IsCommutable = true;
2942
970
    break;
2943
1.16k
  }
2944
1.16k
2945
1.16k
  unsigned LoadOpNo = IsNegate ? 
113
:
01.14k
;
2946
1.16k
  LoadSDNode *LoadNode = nullptr;
2947
1.16k
  SDValue InputChain;
2948
1.16k
  if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
2949
1.16k
                                   LoadNode, InputChain)) {
2950
851
    if (!IsCommutable)
2951
171
      return false;
2952
680
2953
680
    // This operation is commutable, try the other operand.
2954
680
    LoadOpNo = 1;
2955
680
    if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo,
2956
680
                                     LoadNode, InputChain))
2957
665
      return false;
2958
325
  }
2959
325
2960
325
  SDValue Base, Scale, Index, Disp, Segment;
2961
325
  if (!selectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp,
2962
325
                  Segment))
2963
0
    return false;
2964
325
2965
325
  auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16,
2966
439
                          unsigned Opc8) {
2967
439
    switch (MemVT.getSimpleVT().SimpleTy) {
2968
439
    case MVT::i64:
2969
172
      return Opc64;
2970
439
    case MVT::i32:
2971
196
      return Opc32;
2972
439
    case MVT::i16:
2973
44
      return Opc16;
2974
439
    case MVT::i8:
2975
27
      return Opc8;
2976
439
    default:
2977
0
      llvm_unreachable("Invalid size!");
2978
439
    }
2979
439
  };
2980
325
2981
325
  MachineSDNode *Result;
2982
325
  switch (Opc) {
2983
325
  case X86ISD::SUB:
2984
10
    // Handle negate.
2985
10
    if (IsNegate) {
2986
4
      unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m,
2987
4
                                     X86::NEG8m);
2988
4
      const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
2989
4
      Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
2990
4
                                      MVT::Other, Ops);
2991
4
      break;
2992
4
    }
2993
6
   LLVM_FALLTHROUGH;
2994
182
  case X86ISD::ADD:
2995
182
    // Try to match inc/dec.
2996
182
    if (!Subtarget->slowIncDec() || 
OptForSize3
) {
2997
179
      bool IsOne = isOneConstant(StoredVal.getOperand(1));
2998
179
      bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1));
2999
179
      // ADD/SUB with 1/-1 and carry flag isn't used can use inc/dec.
3000
179
      if ((IsOne || 
IsNegOne125
) &&
hasNoCarryFlagUses(StoredVal.getValue(1))116
) {
3001
109
        unsigned NewOpc = 
3002
109
          ((Opc == X86ISD::ADD) == IsOne)
3003
109
              ? 
SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m)48
3004
109
              : 
SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m)61
;
3005
109
        const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
3006
109
        Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32,
3007
109
                                        MVT::Other, Ops);
3008
109
        break;
3009
109
      }
3010
73
    }
3011
73
    LLVM_FALLTHROUGH;
3012
212
  case X86ISD::ADC:
3013
212
  case X86ISD::SBB:
3014
212
  case X86ISD::AND:
3015
212
  case X86ISD::OR:
3016
212
  case X86ISD::XOR: {
3017
212
    auto SelectRegOpcode = [SelectOpcode](unsigned Opc) {
3018
212
      switch (Opc) {
3019
212
      case X86ISD::ADD:
3020
67
        return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr,
3021
67
                            X86::ADD8mr);
3022
212
      case X86ISD::ADC:
3023
72
        return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr,
3024
72
                            X86::ADC8mr);
3025
212
      case X86ISD::SUB:
3026
6
        return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr,
3027
6
                            X86::SUB8mr);
3028
212
      case X86ISD::SBB:
3029
10
        return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr,
3030
10
                            X86::SBB8mr);
3031
212
      case X86ISD::AND:
3032
27
        return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr,
3033
27
                            X86::AND8mr);
3034
212
      case X86ISD::OR:
3035
15
        return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr);
3036
212
      case X86ISD::XOR:
3037
15
        return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr,
3038
15
                            X86::XOR8mr);
3039
212
      default:
3040
0
        llvm_unreachable("Invalid opcode!");
3041
212
      }
3042
212
    };
3043
212
    auto SelectImm8Opcode = [SelectOpcode](unsigned Opc) {
3044
78
      switch (Opc) {
3045
78
      case X86ISD::ADD:
3046
26
        return SelectOpcode(X86::ADD64mi8, X86::ADD32mi8, X86::ADD16mi8, 0);
3047
78
      case X86ISD::ADC:
3048
30
        return SelectOpcode(X86::ADC64mi8, X86::ADC32mi8, X86::ADC16mi8, 0);
3049
78
      case X86ISD::SUB:
3050
4
        return SelectOpcode(X86::SUB64mi8, X86::SUB32mi8, X86::SUB16mi8, 0);
3051
78
      case X86ISD::SBB:
3052
0
        return SelectOpcode(X86::SBB64mi8, X86::SBB32mi8, X86::SBB16mi8, 0);
3053
78
      case X86ISD::AND:
3054
6
        return SelectOpcode(X86::AND64mi8, X86::AND32mi8, X86::AND16mi8, 0);
3055
78
      case X86ISD::OR:
3056
6
        return SelectOpcode(X86::OR64mi8, X86::OR32mi8, X86::OR16mi8, 0);
3057
78
      case X86ISD::XOR:
3058
6
        return SelectOpcode(X86::XOR64mi8, X86::XOR32mi8, X86::XOR16mi8, 0);
3059
78
      default:
3060
0
        llvm_unreachable("Invalid opcode!");
3061
78
      }
3062
78
    };
3063
212
    auto SelectImmOpcode = [SelectOpcode](unsigned Opc) {
3064
36
      switch (Opc) {
3065
36
      case X86ISD::ADD:
3066
12
        return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi,
3067
12
                            X86::ADD8mi);
3068
36
      case X86ISD::ADC:
3069
7
        return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi,
3070
7
                            X86::ADC8mi);
3071
36
      case X86ISD::SUB:
3072
2
        return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi,
3073
2
                            X86::SUB8mi);
3074
36
      case X86ISD::SBB:
3075
0
        return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi,
3076
0
                            X86::SBB8mi);
3077
36
      case X86ISD::AND:
3078
5
        return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi,
3079
5
                            X86::AND8mi);
3080
36
      case X86ISD::OR:
3081
5
        return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi,
3082
5
                            X86::OR8mi);
3083
36
      case X86ISD::XOR:
3084
5
        return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi,
3085
5
                            X86::XOR8mi);
3086
36
      default:
3087
0
        llvm_unreachable("Invalid opcode!");
3088
36
      }
3089
36
    };
3090
212
3091
212
    unsigned NewOpc = SelectRegOpcode(Opc);
3092
212
    SDValue Operand = StoredVal->getOperand(1-LoadOpNo);
3093
212
3094
212
    // See if the operand is a constant that we can fold into an immediate
3095
212
    // operand.
3096
212
    if (auto *OperandC = dyn_cast<ConstantSDNode>(Operand)) {
3097
120
      int64_t OperandV = OperandC->getSExtValue();
3098
120
3099
120
      // Check if we can shrink the operand enough to fit in an immediate (or
3100
120
      // fit into a smaller immediate) by negating it and switching the
3101
120
      // operation.
3102
120
      if ((Opc == X86ISD::ADD || 
Opc == X86ISD::SUB71
) &&
3103
120
          
(49
(49
MemVT != MVT::i849
&&
!isInt<8>(OperandV)46
&&
isInt<8>(-OperandV)20
) ||
3104
49
           
(44
MemVT == MVT::i6444
&&
!isInt<32>(OperandV)19
&&
3105
44
            
isInt<32>(-OperandV)7
)) &&
3106
120
          
hasNoCarryFlagUses(StoredVal.getValue(1))8
) {
3107
6
        OperandV = -OperandV;
3108
6
        Opc = Opc == X86ISD::ADD ? X86ISD::SUB : 
X86ISD::ADD0
;
3109
6
      }
3110
120
3111
120
      // First try to fit this into an Imm8 operand. If it doesn't fit, then try
3112
120
      // the larger immediate operand.
3113
120
      if (MemVT != MVT::i8 && 
isInt<8>(OperandV)112
) {
3114
78
        Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT);
3115
78
        NewOpc = SelectImm8Opcode(Opc);
3116
78
      } else 
if (42
MemVT != MVT::i6442
||
isInt<32>(OperandV)18
) {
3117
36
        Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT);
3118
36
        NewOpc = SelectImmOpcode(Opc);
3119
36
      }
3120
120
    }
3121
212
3122
212
    if (Opc == X86ISD::ADC || 
Opc == X86ISD::SBB140
) {
3123
82
      SDValue CopyTo =
3124
82
          CurDAG->getCopyToReg(InputChain, SDLoc(Node), X86::EFLAGS,
3125
82
                               StoredVal.getOperand(2), SDValue());
3126
82
3127
82
      const SDValue Ops[] = {Base,    Scale,   Index,  Disp,
3128
82
                             Segment, Operand, CopyTo, CopyTo.getValue(1)};
3129
82
      Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
3130
82
                                      Ops);
3131
130
    } else {
3132
130
      const SDValue Ops[] = {Base,    Scale,   Index,     Disp,
3133
130
                             Segment, Operand, InputChain};
3134
130
      Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other,
3135
130
                                      Ops);
3136
130
    }
3137
212
    break;
3138
212
  }
3139
212
  default:
3140
0
    llvm_unreachable("Invalid opcode!");
3141
325
  }
3142
325
3143
325
  MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(),
3144
325
                                 LoadNode->getMemOperand()};
3145
325
  CurDAG->setNodeMemRefs(Result, MemOps);
3146
325
3147
325
  // Update Load Chain uses as well.
3148
325
  ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1));
3149
325
  ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
3150
325
  ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
3151
325
  CurDAG->RemoveDeadNode(Node);
3152
325
  return true;
3153
325
}
3154
3155
// See if this is an  X & Mask  that we can match to BEXTR/BZHI.
3156
// Where Mask is one of the following patterns:
3157
//   a) x &  (1 << nbits) - 1
3158
//   b) x & ~(-1 << nbits)
3159
//   c) x &  (-1 >> (32 - y))
3160
//   d) x << (32 - y) >> (32 - y)
3161
76.9k
bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) {
3162
76.9k
  assert(
3163
76.9k
      (Node->getOpcode() == ISD::AND || Node->getOpcode() == ISD::SRL) &&
3164
76.9k
      "Should be either an and-mask, or right-shift after clearing high bits.");
3165
76.9k
3166
76.9k
  // BEXTR is BMI instruction, BZHI is BMI2 instruction. We need at least one.
3167
76.9k
  if (!Subtarget->hasBMI() && 
!Subtarget->hasBMI2()59.5k
)
3168
59.5k
    return false;
3169
17.4k
3170
17.4k
  MVT NVT = Node->getSimpleValueType(0);
3171
17.4k
3172
17.4k
  // Only supported for 32 and 64 bits.
3173
17.4k
  if (NVT != MVT::i32 && 
NVT != MVT::i6412.8k
)
3174
1.23k
    return false;
3175
16.1k
3176
16.1k
  SDValue NBits;
3177
16.1k
3178
16.1k
  // If we have BMI2's BZHI, we are ok with muti-use patterns.
3179
16.1k
  // Else, if we only have BMI1's BEXTR, we require one-use.
3180
16.1k
  const bool CanHaveExtraUses = Subtarget->hasBMI2();
3181
16.1k
  auto checkUses = [CanHaveExtraUses](SDValue Op, unsigned NUses) {
3182
6.59k
    return CanHaveExtraUses ||
3183
6.59k
           
Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo())1.26k
;
3184
6.59k
  };
3185
16.1k
  auto checkOneUse = [checkUses](SDValue Op) 
{ return checkUses(Op, 1); }6.36k
;
3186
16.1k
  auto checkTwoUse = [checkUses](SDValue Op) 
{ return checkUses(Op, 2); }230
;
3187
16.1k
3188
20.4k
  auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) {
3189
20.4k
    if (V->getOpcode() == ISD::TRUNCATE && 
checkOneUse(V)722
) {
3190
706
      assert(V.getSimpleValueType() == MVT::i32 &&
3191
706
             V.getOperand(0).getSimpleValueType() == MVT::i64 &&
3192
706
             "Expected i64 -> i32 truncation");
3193
706
      V = V.getOperand(0);
3194
706
    }
3195
20.4k
    return V;
3196
20.4k
  };
3197
16.1k
3198
16.1k
  // a) x & ((1 << nbits) + (-1))
3199
16.1k
  auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation,
3200
17.7k
                        &NBits](SDValue Mask) -> bool {
3201
17.7k
    // Match `add`. Must only have one use!
3202
17.7k
    if (Mask->getOpcode() != ISD::ADD || 
!checkOneUse(Mask)1.91k
)
3203
15.7k
      return false;
3204
1.91k
    // We should be adding all-ones constant (i.e. subtracting one.)
3205
1.91k
    if (!isAllOnesConstant(Mask->getOperand(1)))
3206
1.00k
      return false;
3207
918
    // Match `1 << nbits`. Might be truncated. Must only have one use!
3208
918
    SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
3209
918
    if (M0->getOpcode() != ISD::SHL || 
!checkOneUse(M0)208
)
3210
710
      return false;
3211
208
    if (!isOneConstant(M0->getOperand(0)))
3212
0
      return false;
3213
208
    NBits = M0->getOperand(1);
3214
208
    return true;
3215
208
  };
3216
16.1k
3217
16.1k
  auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) {
3218
1.35k
    V = peekThroughOneUseTruncation(V);
3219
1.35k
    return CurDAG->MaskedValueIsAllOnes(
3220
1.35k
        V, APInt::getLowBitsSet(V.getSimpleValueType().getSizeInBits(),
3221
1.35k
                                NVT.getSizeInBits()));
3222
1.35k
  };
3223
16.1k
3224
16.1k
  // b) x & ~(-1 << nbits)
3225
16.1k
  auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation,
3226
17.5k
                        &NBits](SDValue Mask) -> bool {
3227
17.5k
    // Match `~()`. Must only have one use!
3228
17.5k
    if (Mask.getOpcode() != ISD::XOR || 
!checkOneUse(Mask)1.00k
)
3229
16.5k
      return false;
3230
992
    // The -1 only has to be all-ones for the final Node's NVT.
3231
992
    if (!isAllOnes(Mask->getOperand(1)))
3232
351
      return false;
3233
641
    // Match `-1 << nbits`. Might be truncated. Must only have one use!
3234
641
    SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0));
3235
641
    if (M0->getOpcode() != ISD::SHL || 
!checkOneUse(M0)358
)
3236
283
      return false;
3237
358
    // The -1 only has to be all-ones for the final Node's NVT.
3238
358
    if (!isAllOnes(M0->getOperand(0)))
3239
168
      return false;
3240
190
    NBits = M0->getOperand(1);
3241
190
    return true;
3242
190
  };
3243
16.1k
3244
16.1k
  // Match potentially-truncated (bitwidth - y)
3245
16.1k
  auto matchShiftAmt = [checkOneUse, &NBits](SDValue ShiftAmt,
3246
16.1k
                                             unsigned Bitwidth) {
3247
310
    // Skip over a truncate of the shift amount.
3248
310
    if (ShiftAmt.getOpcode() == ISD::TRUNCATE) {
3249
50
      ShiftAmt = ShiftAmt.getOperand(0);
3250
50
      // The trunc should have been the only user of the real shift amount.
3251
50
      if (!checkOneUse(ShiftAmt))
3252
0
        return false;
3253
310
    }
3254
310
    // Match the shift amount as: (bitwidth - y). It should go away, too.
3255
310
    if (ShiftAmt.getOpcode() != ISD::SUB)
3256
66
      return false;
3257
244
    auto V0 = dyn_cast<ConstantSDNode>(ShiftAmt.getOperand(0));
3258
244
    if (!V0 || V0->getZExtValue() != Bitwidth)
3259
0
      return false;
3260
244
    NBits = ShiftAmt.getOperand(1);
3261
244
    return true;
3262
244
  };
3263
16.1k
3264
16.1k
  // c) x &  (-1 >> (32 - y))
3265
16.1k
  auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation,
3266
17.3k
                        matchShiftAmt](SDValue Mask) -> bool {
3267
17.3k
    // The mask itself may be truncated.
3268
17.3k
    Mask = peekThroughOneUseTruncation(Mask);
3269
17.3k
    unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits();
3270
17.3k
    // Match `l>>`. Must only have one use!
3271
17.3k
    if (Mask.getOpcode() != ISD::SRL || 
!checkOneUse(Mask)1.69k
)
3272
15.6k
      return false;
3273
1.61k
    // We should be shifting truly all-ones constant.
3274
1.61k
    if (!isAllOnesConstant(Mask.getOperand(0)))
3275
1.53k
      return false;
3276
80
    SDValue M1 = Mask.getOperand(1);
3277
80
    // The shift amount should not be used externally.
3278
80
    if (!checkOneUse(M1))
3279
0
      return false;
3280
80
    return matchShiftAmt(M1, Bitwidth);
3281
80
  };
3282
16.1k
3283
16.1k
  SDValue X;
3284
16.1k
3285
16.1k
  // d) x << (32 - y) >> (32 - y)
3286
16.1k
  auto matchPatternD = [checkOneUse, checkTwoUse, matchShiftAmt,
3287
16.1k
                        &X](SDNode *Node) -> bool {
3288
7.27k
    if (Node->getOpcode() != ISD::SRL)
3289
0
      return false;
3290
7.27k
    SDValue N0 = Node->getOperand(0);
3291
7.27k
    if (N0->getOpcode() != ISD::SHL || 
!checkOneUse(N0)339
)
3292
6.93k
      return false;
3293
339
    unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits();
3294
339
    SDValue N1 = Node->getOperand(1);
3295
339
    SDValue N01 = N0->getOperand(1);
3296
339
    // Both of the shifts must be by the exact same value.
3297
339
    // There should not be any uses of the shift amount outside of the pattern.
3298
339
    if (N1 != N01 || 
!checkTwoUse(N1)230
)
3299
109
      return false;
3300
230
    if (!matchShiftAmt(N1, Bitwidth))
3301
60
      return false;
3302
170
    X = N0->getOperand(0);
3303
170
    return true;
3304
170
  };
3305
16.1k
3306
16.1k
  auto matchLowBitMask = [matchPatternA, matchPatternB,
3307
17.7k
                          matchPatternC](SDValue Mask) -> bool {
3308
17.7k
    return matchPatternA(Mask) || 
matchPatternB(Mask)17.5k
||
matchPatternC(Mask)17.3k
;
3309
17.7k
  };
3310
16.1k
3311
16.1k
  if (Node->getOpcode() == ISD::AND) {
3312
8.91k
    X = Node->getOperand(0);
3313
8.91k
    SDValue Mask = Node->getOperand(1);
3314
8.91k
3315
8.91k
    if (matchLowBitMask(Mask)) {
3316
116
      // Great.
3317
8.79k
    } else {
3318
8.79k
      std::swap(X, Mask);
3319
8.79k
      if (!matchLowBitMask(Mask))
3320
8.44k
        return false;
3321
7.27k
    }
3322
7.27k
  } else if (!matchPatternD(Node))
3323
7.10k
    return false;
3324
642
3325
642
  SDLoc DL(Node);
3326
642
3327
642
  // Truncate the shift amount.
3328
642
  NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits);
3329
642
  insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
3330
642
3331
642
  // Insert 8-bit NBits into lowest 8 bits of 32-bit register.
3332
642
  // All the other bits are undefined, we do not care about them.
3333
642
  SDValue ImplDef = SDValue(
3334
642
      CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0);
3335
642
  insertDAGNode(*CurDAG, SDValue(Node, 0), ImplDef);
3336
642
  NBits = CurDAG->getTargetInsertSubreg(X86::sub_8bit, DL, MVT::i32, ImplDef,
3337
642
                                        NBits);
3338
642
  insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
3339
642
3340
642
  if (Subtarget->hasBMI2()) {
3341
362
    // Great, just emit the the BZHI..
3342
362
    if (NVT != MVT::i32) {
3343
92
      // But have to place the bit count into the wide-enough register first.
3344
92
      NBits = CurDAG->getNode(ISD::ANY_EXTEND, DL, NVT, NBits);
3345
92
      insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
3346
92
    }
3347
362
3348
362
    SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, NVT, X, NBits);
3349
362
    ReplaceNode(Node, Extract.getNode());
3350
362
    SelectCode(Extract.getNode());
3351
362
    return true;
3352
362
  }
3353
280
3354
280
  // Else, if we do *NOT* have BMI2, let's find out if the if the 'X' is
3355
280
  // *logically* shifted (potentially with one-use trunc inbetween),
3356
280
  // and the truncation was the only use of the shift,
3357
280
  // and if so look past one-use truncation.
3358
280
  {
3359
280
    SDValue RealX = peekThroughOneUseTruncation(X);
3360
280
    // FIXME: only if the shift is one-use?
3361
280
    if (RealX != X && 
RealX.getOpcode() == ISD::SRL48
)
3362
24
      X = RealX;
3363
280
  }
3364
280
3365
280
  MVT XVT = X.getSimpleValueType();
3366
280
3367
280
  // Else, emitting BEXTR requires one more step.
3368
280
  // The 'control' of BEXTR has the pattern of:
3369
280
  // [15...8 bit][ 7...0 bit] location
3370
280
  // [ bit count][     shift] name
3371
280
  // I.e. 0b000000011'00000001 means  (x >> 0b1) & 0b11
3372
280
3373
280
  // Shift NBits left by 8 bits, thus producing 'control'.
3374
280
  // This makes the low 8 bits to be zero.
3375
280
  SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8);
3376
280
  SDValue Control = CurDAG->getNode(ISD::SHL, DL, MVT::i32, NBits, C8);
3377
280
  insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
3378
280
3379
280
  // If the 'X' is *logically* shifted, we can fold that shift into 'control'.
3380
280
  // FIXME: only if the shift is one-use?
3381
280
  if (X.getOpcode() == ISD::SRL) {
3382
128
    SDValue ShiftAmt = X.getOperand(1);
3383
128
    X = X.getOperand(0);
3384
128
3385
128
    assert(ShiftAmt.getValueType() == MVT::i8 &&
3386
128
           "Expected shift amount to be i8");
3387
128
3388
128
    // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero!
3389
128
    // We could zext to i16 in some form, but we intentionally don't do that.
3390
128
    SDValue OrigShiftAmt = ShiftAmt;
3391
128
    ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShiftAmt);
3392
128
    insertDAGNode(*CurDAG, OrigShiftAmt, ShiftAmt);
3393
128
3394
128
    // And now 'or' these low 8 bits of shift amount into the 'control'.
3395
128
    Control = CurDAG->getNode(ISD::OR, DL, MVT::i32, Control, ShiftAmt);
3396
128
    insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
3397
128
  }
3398
280
3399
280
  // But have to place the 'control' into the wide-enough register first.
3400
280
  if (XVT != MVT::i32) {
3401
92
    Control = CurDAG->getNode(ISD::ANY_EXTEND, DL, XVT, Control);
3402
92
    insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
3403
92
  }
3404
280
3405
280
  // And finally, form the BEXTR itself.
3406
280
  SDValue Extract = CurDAG->getNode(X86ISD::BEXTR, DL, XVT, X, Control);
3407
280
3408
280
  // The 'X' was originally truncated. Do that now.
3409
280
  if (XVT != NVT) {
3410
24
    insertDAGNode(*CurDAG, SDValue(Node, 0), Extract);
3411
24
    Extract = CurDAG->getNode(ISD::TRUNCATE, DL, NVT, Extract);
3412
24
  }
3413
280
3414
280
  ReplaceNode(Node, Extract.getNode());
3415
280
  SelectCode(Extract.getNode());
3416
280
3417
280
  return true;
3418
280
}
3419
3420
// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
3421
68.6k
MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) {
3422
68.6k
  MVT NVT = Node->getSimpleValueType(0);
3423
68.6k
  SDLoc dl(Node);
3424
68.6k
3425
68.6k
  SDValue N0 = Node->getOperand(0);
3426
68.6k
  SDValue N1 = Node->getOperand(1);
3427
68.6k
3428
68.6k
  // If we have TBM we can use an immediate for the control. If we have BMI
3429
68.6k
  // we should only do this if the BEXTR instruction is implemented well.
3430
68.6k
  // Otherwise moving the control into a register makes this more costly.
3431
68.6k
  // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM
3432
68.6k
  // hoisting the move immediate would make it worthwhile with a less optimal
3433
68.6k
  // BEXTR?
3434
68.6k
  if (!Subtarget->hasTBM() &&
3435
68.6k
      
!(67.7k
Subtarget->hasBMI()67.7k
&&
Subtarget->hasFastBEXTR()13.0k
))
3436
67.1k
    return nullptr;
3437
1.56k
3438
1.56k
  // Must have a shift right.
3439
1.56k
  if (N0->getOpcode() != ISD::SRL && 
N0->getOpcode() != ISD::SRA1.28k
)
3440
1.28k
    return nullptr;
3441
279
3442
279
  // Shift can't have additional users.
3443
279
  if (!N0->hasOneUse())
3444
90
    return nullptr;
3445
189
3446
189
  // Only supported for 32 and 64 bits.
3447
189
  if (NVT != MVT::i32 && 
NVT != MVT::i6448
)
3448
0
    return nullptr;
3449
189
3450
189
  // Shift amount and RHS of and must be constant.
3451
189
  ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(N1);
3452
189
  ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
3453
189
  if (!MaskCst || 
!ShiftCst153
)
3454
36
    return nullptr;
3455
153
3456
153
  // And RHS must be a mask.
3457
153
  uint64_t Mask = MaskCst->getZExtValue();
3458
153
  if (!isMask_64(Mask))
3459
68
    return nullptr;
3460
85
3461
85
  uint64_t Shift = ShiftCst->getZExtValue();
3462
85
  uint64_t MaskSize = countPopulation(Mask);
3463
85
3464
85
  // Don't interfere with something that can be handled by extracting AH.
3465
85
  // TODO: If we are able to fold a load, BEXTR might still be better than AH.
3466
85
  if (Shift == 8 && 
MaskSize == 85
)
3467
5
    return nullptr;
3468
80
3469
80
  // Make sure we are only using bits that were in the original value, not
3470
80
  // shifted in.
3471
80
  if (Shift + MaskSize > NVT.getSizeInBits())
3472
0
    return nullptr;
3473
80
3474
80
  SDValue New = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
3475
80
  unsigned ROpc = NVT == MVT::i64 ? 
X86::BEXTRI64ri18
:
X86::BEXTRI32ri62
;
3476
80
  unsigned MOpc = NVT == MVT::i64 ? 
X86::BEXTRI64mi18
:
X86::BEXTRI32mi62
;
3477
80
3478
80
  // BMI requires the immediate to placed in a register.
3479
80
  if (!Subtarget->hasTBM()) {
3480
41
    ROpc = NVT == MVT::i64 ? 
X86::BEXTR64rr10
:
X86::BEXTR32rr31
;
3481
41
    MOpc = NVT == MVT::i64 ? 
X86::BEXTR64rm10
:
X86::BEXTR32rm31
;
3482
41
    unsigned NewOpc = NVT == MVT::i64 ? 
X86::MOV32ri6410
:
X86::MOV32ri31
;
3483
41
    New = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, New), 0);
3484
41
  }
3485
80
3486
80
  MachineSDNode *NewNode;
3487
80
  SDValue Input = N0->getOperand(0);
3488
80
  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
3489
80
  if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
3490
39
    SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, New, Input.getOperand(0) };
3491
39
    SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
3492
39
    NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
3493
39
    // Update the chain.
3494
39
    ReplaceUses(Input.getValue(1), SDValue(NewNode, 2));
3495
39
    // Record the mem-refs
3496
39
    CurDAG->setNodeMemRefs(NewNode, {cast<LoadSDNode>(Input)->getMemOperand()});
3497
41
  } else {
3498
41
    NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, New);
3499
41
  }
3500
80
3501
80
  return NewNode;
3502
80
}
3503
3504
// Emit a PCMISTR(I/M) instruction.
3505
MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
3506
                                             bool MayFoldLoad, const SDLoc &dl,
3507
140
                                             MVT VT, SDNode *Node) {
3508
140
  SDValue N0 = Node->getOperand(0);
3509
140
  SDValue N1 = Node->getOperand(1);
3510
140
  SDValue Imm = Node->getOperand(2);
3511
140
  const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
3512
140
  Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
3513
140
3514
140
  // Try to fold a load. No need to check alignment.
3515
140
  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
3516
140
  if (MayFoldLoad && 
tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)128
) {
3517
20
    SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
3518
20
                      N1.getOperand(0) };
3519
20
    SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other);
3520
20
    MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
3521
20
    // Update the chain.
3522
20
    ReplaceUses(N1.getValue(1), SDValue(CNode, 2));
3523
20
    // Record the mem-refs
3524
20
    CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
3525
20
    return CNode;
3526
20
  }
3527
120
3528
120
  SDValue Ops[] = { N0, N1, Imm };
3529
120
  SDVTList VTs = CurDAG->getVTList(VT, MVT::i32);
3530
120
  MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
3531
120
  return CNode;
3532
120
}
3533
3534
// Emit a PCMESTR(I/M) instruction. Also return the Glue result in case we need
3535
// to emit a second instruction after this one. This is needed since we have two
3536
// copyToReg nodes glued before this and we need to continue that glue through.
3537
MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
3538
                                             bool MayFoldLoad, const SDLoc &dl,
3539
                                             MVT VT, SDNode *Node,
3540
138
                                             SDValue &InFlag) {
3541
138
  SDValue N0 = Node->getOperand(0);
3542
138
  SDValue N2 = Node->getOperand(2);
3543
138
  SDValue Imm = Node->getOperand(4);
3544
138
  const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
3545
138
  Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());
3546
138
3547
138
  // Try to fold a load. No need to check alignment.
3548
138
  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
3549
138
  if (MayFoldLoad && 
tryFoldLoad(Node, N2, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)130
) {
3550
20
    SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
3551
20
                      N2.getOperand(0), InFlag };
3552
20
    SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue);
3553
20
    MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
3554
20
    InFlag = SDValue(CNode, 3);
3555
20
    // Update the chain.
3556
20
    ReplaceUses(N2.getValue(1), SDValue(CNode, 2));
3557
20
    // Record the mem-refs
3558
20
    CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N2)->getMemOperand()});
3559
20
    return CNode;
3560
20
  }
3561
118
3562
118
  SDValue Ops[] = { N0, N2, Imm, InFlag };
3563
118
  SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue);
3564
118
  MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
3565
118
  InFlag = SDValue(CNode, 2);
3566
118
  return CNode;
3567
118
}
3568
3569
43.2k
bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
3570
43.2k
  EVT VT = N->getValueType(0);
3571
43.2k
3572
43.2k
  // Only handle scalar shifts.
3573
43.2k
  if (VT.isVector())
3574
0
    return false;
3575
43.2k
3576
43.2k
  // Narrower shifts only mask to 5 bits in hardware.
3577
43.2k
  unsigned Size = VT == MVT::i64 ? 
6430.1k
:
3213.0k
;
3578
43.2k
3579
43.2k
  SDValue OrigShiftAmt = N->getOperand(1);
3580
43.2k
  SDValue ShiftAmt = OrigShiftAmt;
3581
43.2k
  SDLoc DL(N);
3582
43.2k
3583
43.2k
  // Skip over a truncate of the shift amount.
3584
43.2k
  if (ShiftAmt->getOpcode() == ISD::TRUNCATE)
3585
2.47k
    ShiftAmt = ShiftAmt->getOperand(0);
3586
43.2k
3587
43.2k
  // This function is called after X86DAGToDAGISel::matchBitExtract(),
3588
43.2k
  // so we are not afraid that we might mess up BZHI/BEXTR pattern.
3589
43.2k
3590
43.2k
  SDValue NewShiftAmt;
3591
43.2k
  if (ShiftAmt->getOpcode() == ISD::ADD || 
ShiftAmt->getOpcode() == ISD::SUB42.4k
) {
3592
1.84k
    SDValue Add0 = ShiftAmt->getOperand(0);
3593
1.84k
    SDValue Add1 = ShiftAmt->getOperand(1);
3594
1.84k
    // If we are shifting by X+/-N where N == 0 mod Size, then just shift by X
3595
1.84k
    // to avoid the ADD/SUB.
3596
1.84k
    if (isa<ConstantSDNode>(Add1) &&
3597
1.84k
        
cast<ConstantSDNode>(Add1)->getZExtValue() % Size == 0699
) {
3598
78
      NewShiftAmt = Add0;
3599
78
    // If we are shifting by N-X where N == 0 mod Size, then just shift by -X to
3600
78
    // generate a NEG instead of a SUB of a constant.
3601
1.76k
    } else if (ShiftAmt->getOpcode() == ISD::SUB &&
3602
1.76k
               
isa<ConstantSDNode>(Add0)1.10k
&&
3603
1.76k
               
cast<ConstantSDNode>(Add0)->getZExtValue() != 01.08k
&&
3604
1.76k
               
cast<ConstantSDNode>(Add0)->getZExtValue() % Size == 01.08k
) {
3605
603
      // Insert a negate op.
3606
603
      // TODO: This isn't guaranteed to replace the sub if there is a logic cone
3607
603
      // that uses it that's not a shift.
3608
603
      EVT SubVT = ShiftAmt.getValueType();
3609
603
      SDValue Zero = CurDAG->getConstant(0, DL, SubVT);
3610
603
      SDValue Neg = CurDAG->getNode(ISD::SUB, DL, SubVT, Zero, Add1);
3611
603
      NewShiftAmt = Neg;
3612
603
3613
603
      // Insert these operands into a valid topological order so they can
3614
603
      // get selected independently.
3615
603
      insertDAGNode(*CurDAG, OrigShiftAmt, Zero);
3616
603
      insertDAGNode(*CurDAG, OrigShiftAmt, Neg);
3617
603
    } else
3618
1.16k
      return false;
3619
41.3k
  } else
3620
41.3k
    return false;
3621
681
3622
681
  if (NewShiftAmt.getValueType() != MVT::i8) {
3623
0
    // Need to truncate the shift amount.
3624
0
    NewShiftAmt = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NewShiftAmt);
3625
0
    // Add to a correct topological ordering.
3626
0
    insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
3627
0
  }
3628
681
3629
681
  // Insert a new mask to keep the shift amount legal. This should be removed
3630
681
  // by isel patterns.
3631
681
  NewShiftAmt = CurDAG->getNode(ISD::AND, DL, MVT::i8, NewShiftAmt,
3632
681
                                CurDAG->getConstant(Size - 1, DL, MVT::i8));
3633
681
  // Place in a correct topological ordering.
3634
681
  insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt);
3635
681
3636
681
  SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, N->getOperand(0),
3637
681
                                                   NewShiftAmt);
3638
681
  if (UpdatedNode != N) {
3639
0
    // If we found an existing node, we should replace ourselves with that node
3640
0
    // and wait for it to be selected after its other users.
3641
0
    ReplaceNode(N, UpdatedNode);
3642
0
    return true;
3643
0
  }
3644
681
3645
681
  // If the original shift amount is now dead, delete it so that we don't run
3646
681
  // it through isel.
3647
681
  if (OrigShiftAmt.getNode()->use_empty())
3648
535
    CurDAG->RemoveDeadNode(OrigShiftAmt.getNode());
3649
681
3650
681
  // Now that we've optimized the shift amount, defer to normal isel to get
3651
681
  // load folding and legacy vs BMI2 selection without repeating it here.
3652
681
  SelectCode(N);
3653
681
  return true;
3654
681
}
3655
3656
95.2k
bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
3657
95.2k
  MVT NVT = N->getSimpleValueType(0);
3658
95.2k
  unsigned Opcode = N->getOpcode();
3659
95.2k
  SDLoc dl(N);
3660
95.2k
3661
95.2k
  // For operations of the form (x << C1) op C2, check if we can use a smaller
3662
95.2k
  // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
3663
95.2k
  SDValue Shift = N->getOperand(0);
3664
95.2k
  SDValue N1 = N->getOperand(1);
3665
95.2k
3666
95.2k
  ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1);
3667
95.2k
  if (!Cst)
3668
63.6k
    return false;
3669
31.6k
3670
31.6k
  int64_t Val = Cst->getSExtValue();
3671
31.6k
3672
31.6k
  // If we have an any_extend feeding the AND, look through it to see if there
3673
31.6k
  // is a shift behind it. But only if the AND doesn't use the extended bits.
3674
31.6k
  // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
3675
31.6k
  bool FoundAnyExtend = false;
3676
31.6k
  if (Shift.getOpcode() == ISD::ANY_EXTEND && 
Shift.hasOneUse()1.07k
&&
3677
31.6k
      
Shift.getOperand(0).getSimpleValueType() == MVT::i321.02k
&&
3678
31.6k
      
isUInt<32>(Val)686
) {
3679
673
    FoundAnyExtend = true;
3680
673
    Shift = Shift.getOperand(0);
3681
673
  }
3682
31.6k
3683
31.6k
  if (Shift.getOpcode() != ISD::SHL || 
!Shift.hasOneUse()1.45k
)
3684
30.2k
    return false;
3685
1.36k
3686
1.36k
  // i8 is unshrinkable, i16 should be promoted to i32.
3687
1.36k
  if (NVT != MVT::i32 && 
NVT != MVT::i64875
)
3688
19
    return false;
3689
1.34k
3690
1.34k
  ConstantSDNode *ShlCst = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
3691
1.34k
  if (!ShlCst)
3692
550
    return false;
3693
798
3694
798
  uint64_t ShAmt = ShlCst->getZExtValue();
3695
798
3696
798
  // Make sure that we don't change the operation by removing bits.
3697
798
  // This only matters for OR and XOR, AND is unaffected.
3698
798
  uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1;
3699
798
  if (Opcode != ISD::AND && 
(Val & RemovedBitsMask) != 0324
)
3700
215
    return false;
3701
583
3702
583
  // Check the minimum bitwidth for the new constant.
3703
583
  // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
3704
583
  auto CanShrinkImmediate = [&](int64_t &ShiftedVal) {
3705
583
    if (Opcode == ISD::AND) {
3706
474
      // AND32ri is the same as AND64ri32 with zext imm.
3707
474
      // Try this before sign extended immediates below.
3708
474
      ShiftedVal = (uint64_t)Val >> ShAmt;
3709
474
      if (NVT == MVT::i64 && 
!isUInt<32>(Val)388
&&
isUInt<32>(ShiftedVal)344
)
3710
116
        return true;
3711
358
      // Also swap order when the AND can become MOVZX.
3712
358
      if (ShiftedVal == UINT8_MAX || 
ShiftedVal == UINT16_MAX354
)
3713
358
        
return true6
;
3714
461
    }
3715
461
    ShiftedVal = Val >> ShAmt;
3716
461
    if ((!isInt<8>(Val) && 
isInt<8>(ShiftedVal)402
) ||
3717
461
        
(428
!isInt<32>(Val)428
&&
isInt<32>(ShiftedVal)230
))
3718
36
      return true;
3719
425
    if (Opcode != ISD::AND) {
3720
100
      // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr
3721
100
      ShiftedVal = (uint64_t)Val >> ShAmt;
3722
100
      if (NVT == MVT::i64 && 
!isUInt<32>(Val)33
&&
isUInt<32>(ShiftedVal)23
)
3723
2
        return true;
3724
423
    }
3725
423
    return false;
3726
423
  };
3727
583
3728
583
  int64_t ShiftedVal;
3729
583
  if (!CanShrinkImmediate(ShiftedVal))
3730
423
    return false;
3731
160
3732
160
  // Ok, we can reorder to get a smaller immediate.
3733
160
3734
160
  // But, its possible the original immediate allowed an AND to become MOVZX.
3735
160
  // Doing this late due to avoid the MakedValueIsZero call as late as
3736
160
  // possible.
3737
160
  if (Opcode == ISD::AND) {
3738
149
    // Find the smallest zext this could possibly be.
3739
149
    unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits();
3740
149
    ZExtWidth = PowerOf2Ceil(std::max(ZExtWidth, 8U));
3741
149
3742
149
    // Figure out which bits need to be zero to achieve that mask.
3743
149
    APInt NeededMask = APInt::getLowBitsSet(NVT.getSizeInBits(),
3744
149
                                            ZExtWidth);
3745
149
    NeededMask &= ~Cst->getAPIntValue();
3746
149
3747
149
    if (CurDAG->MaskedValueIsZero(N->getOperand(0), NeededMask))
3748
2
      return false;
3749
158
  }
3750
158
3751
158
  SDValue X = Shift.getOperand(0);
3752
158
  if (FoundAnyExtend) {
3753
2
    SDValue NewX = CurDAG->getNode(ISD::ANY_EXTEND, dl, NVT, X);
3754
2
    insertDAGNode(*CurDAG, SDValue(N, 0), NewX);
3755
2
    X = NewX;
3756
2
  }
3757
158
3758
158
  SDValue NewCst = CurDAG->getConstant(ShiftedVal, dl, NVT);
3759
158
  insertDAGNode(*CurDAG, SDValue(N, 0), NewCst);
3760
158
  SDValue NewBinOp = CurDAG->getNode(Opcode, dl, NVT, X, NewCst);
3761
158
  insertDAGNode(*CurDAG, SDValue(N, 0), NewBinOp);
3762
158
  SDValue NewSHL = CurDAG->getNode(ISD::SHL, dl, NVT, NewBinOp,
3763
158
                                   Shift.getOperand(1));
3764
158
  ReplaceNode(N, NewSHL.getNode());
3765
158
  SelectCode(NewSHL.getNode());
3766
158
  return true;
3767
158
}
3768
3769
/// If the high bits of an 'and' operand are known zero, try setting the
3770
/// high bits of an 'and' constant operand to produce a smaller encoding by
3771
/// creating a small, sign-extended negative immediate rather than a large
3772
/// positive one. This reverses a transform in SimplifyDemandedBits that
3773
/// shrinks mask constants by clearing bits. There is also a possibility that
3774
/// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that
3775
/// case, just replace the 'and'. Return 'true' if the node is replaced.
3776
51.8k
bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
3777
51.8k
  // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't
3778
51.8k
  // have immediate operands.
3779
51.8k
  MVT VT = And->getSimpleValueType(0);
3780
51.8k
  if (VT != MVT::i32 && 
VT != MVT::i6440.9k
)
3781
22.0k
    return false;
3782
29.7k
3783
29.7k
  auto *And1C = dyn_cast<ConstantSDNode>(And->getOperand(1));
3784
29.7k
  if (!And1C)
3785
9.97k
    return false;
3786
19.7k
3787
19.7k
  // Bail out if the mask constant is already negative. It's can't shrink more.
3788
19.7k
  // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel
3789
19.7k
  // patterns to use a 32-bit and instead of a 64-bit and by relying on the
3790
19.7k
  // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits
3791
19.7k
  // are negative too.
3792
19.7k
  APInt MaskVal = And1C->getAPIntValue();
3793
19.7k
  unsigned MaskLZ = MaskVal.countLeadingZeros();
3794
19.7k
  if (!MaskLZ || 
(17.0k
VT == MVT::i6417.0k
&&
MaskLZ == 329.81k
))
3795
3.04k
    return false;
3796
16.7k
3797
16.7k
  // Don't extend into the upper 32 bits of a 64 bit mask.
3798
16.7k
  if (VT == MVT::i64 && 
MaskLZ >= 329.50k
) {
3799
6.28k
    MaskLZ -= 32;
3800
6.28k
    MaskVal = MaskVal.trunc(32);
3801
6.28k
  }
3802
16.7k
3803
16.7k
  SDValue And0 = And->getOperand(0);
3804
16.7k
  APInt HighZeros = APInt::getHighBitsSet(MaskVal.getBitWidth(), MaskLZ);
3805
16.7k
  APInt NegMaskVal = MaskVal | HighZeros;
3806
16.7k
3807
16.7k
  // If a negative constant would not allow a smaller encoding, there's no need
3808
16.7k
  // to continue. Only change the constant when we know it's a win.
3809
16.7k
  unsigned MinWidth = NegMaskVal.getMinSignedBits();
3810
16.7k
  if (MinWidth > 32 || 
(14.9k
MinWidth > 814.9k
&&
MaskVal.getMinSignedBits() <= 32865
))
3811
2.61k
    return false;
3812
14.1k
3813
14.1k
  // Extend masks if we truncated above.
3814
14.1k
  if (VT == MVT::i64 && 
MaskVal.getBitWidth() < 647.70k
) {
3815
6.25k
    NegMaskVal = NegMaskVal.zext(64);
3816
6.25k
    HighZeros = HighZeros.zext(64);
3817
6.25k
  }
3818
14.1k
3819
14.1k
  // The variable operand must be all zeros in the top bits to allow using the
3820
14.1k
  // new, negative constant as the mask.
3821
14.1k
  if (!CurDAG->MaskedValueIsZero(And0, HighZeros))
3822
13.8k
    return false;
3823
306
3824
306
  // Check if the mask is -1. In that case, this is an unnecessary instruction
3825
306
  // that escaped earlier analysis.
3826
306
  if (NegMaskVal.isAllOnesValue()) {
3827
19
    ReplaceNode(And, And0.getNode());
3828
19
    return true;
3829
19
  }
3830
287
3831
287
  // A negative mask allows a smaller encoding. Create a new 'and' node.
3832
287
  SDValue NewMask = CurDAG->getConstant(NegMaskVal, SDLoc(And), VT);
3833
287
  SDValue NewAnd = CurDAG->getNode(ISD::AND, SDLoc(And), VT, And0, NewMask);
3834
287
  ReplaceNode(And, NewAnd.getNode());
3835
287
  SelectCode(NewAnd.getNode());
3836
287
  return true;
3837
287
}
3838
3839
static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad,
3840
4.38k
                              bool FoldedBCast, bool Masked) {
3841
4.38k
  if (Masked) {
3842
152
    if (FoldedLoad) {
3843
0
      switch (TestVT.SimpleTy) {
3844
0
      default: llvm_unreachable("Unexpected VT!");
3845
0
      case MVT::v16i8:
3846
0
        return IsTestN ? X86::VPTESTNMBZ128rmk : X86::VPTESTMBZ128rmk;
3847
0
      case MVT::v8i16:
3848
0
        return IsTestN ? X86::VPTESTNMWZ128rmk : X86::VPTESTMWZ128rmk;
3849
0
      case MVT::v4i32:
3850
0
        return IsTestN ? X86::VPTESTNMDZ128rmk : X86::VPTESTMDZ128rmk;
3851
0
      case MVT::v2i64:
3852
0
        return IsTestN ? X86::VPTESTNMQZ128rmk : X86::VPTESTMQZ128rmk;
3853
0
      case MVT::v32i8:
3854
0
        return IsTestN ? X86::VPTESTNMBZ256rmk : X86::VPTESTMBZ256rmk;
3855
0
      case MVT::v16i16:
3856
0
        return IsTestN ? X86::VPTESTNMWZ256rmk : X86::VPTESTMWZ256rmk;
3857
0
      case MVT::v8i32:
3858
0
        return IsTestN ? X86::VPTESTNMDZ256rmk : X86::VPTESTMDZ256rmk;
3859
0
      case MVT::v4i64:
3860
0
        return IsTestN ? X86::VPTESTNMQZ256rmk : X86::VPTESTMQZ256rmk;
3861
0
      case MVT::v64i8:
3862
0
        return IsTestN ? X86::VPTESTNMBZrmk : X86::VPTESTMBZrmk;
3863
0
      case MVT::v32i16:
3864
0
        return IsTestN ? X86::VPTESTNMWZrmk : X86::VPTESTMWZrmk;
3865
0
      case MVT::v16i32:
3866
0
        return IsTestN ? X86::VPTESTNMDZrmk : X86::VPTESTMDZrmk;
3867
0
      case MVT::v8i64:
3868
0
        return IsTestN ? X86::VPTESTNMQZrmk : X86::VPTESTMQZrmk;
3869
152
      }
3870
152
    }
3871
152
3872
152
    if (FoldedBCast) {
3873
0
      switch (TestVT.SimpleTy) {
3874
0
      default: llvm_unreachable("Unexpected VT!");
3875
0
      case MVT::v4i32:
3876
0
        return IsTestN ? X86::VPTESTNMDZ128rmbk : X86::VPTESTMDZ128rmbk;
3877
0
      case MVT::v2i64:
3878
0
        return IsTestN ? X86::VPTESTNMQZ128rmbk : X86::VPTESTMQZ128rmbk;
3879
0
      case MVT::v8i32:
3880
0
        return IsTestN ? X86::VPTESTNMDZ256rmbk : X86::VPTESTMDZ256rmbk;
3881
0
      case MVT::v4i64:
3882
0
        return IsTestN ? X86::VPTESTNMQZ256rmbk : X86::VPTESTMQZ256rmbk;
3883
0
      case MVT::v16i32:
3884
0
        return IsTestN ? X86::VPTESTNMDZrmbk : X86::VPTESTMDZrmbk;
3885
0
      case MVT::v8i64:
3886
0
        return IsTestN ? X86::VPTESTNMQZrmbk : X86::VPTESTMQZrmbk;
3887
152
      }
3888
152
    }
3889
152
3890
152
    switch (TestVT.SimpleTy) {
3891
152
    
default: 0
llvm_unreachable0
("Unexpected VT!");
3892
152
    case MVT::v16i8:
3893
4
      return IsTestN ? 
X86::VPTESTNMBZ128rrk2
:
X86::VPTESTMBZ128rrk2
;
3894
152
    case MVT::v8i16:
3895
4
      return IsTestN ? 
X86::VPTESTNMWZ128rrk2
:
X86::VPTESTMWZ128rrk2
;
3896
152
    case MVT::v4i32:
3897
10
      return IsTestN ? 
X86::VPTESTNMDZ128rrk6
:
X86::VPTESTMDZ128rrk4
;
3898
152
    case MVT::v2i64:
3899
8
      return IsTestN ? 
X86::VPTESTNMQZ128rrk4
:
X86::VPTESTMQZ128rrk4
;
3900
152
    case MVT::v32i8:
3901
4
      return IsTestN ? 
X86::VPTESTNMBZ256rrk2
:
X86::VPTESTMBZ256rrk2
;
3902
152
    case MVT::v16i16:
3903
4
      return IsTestN ? 
X86::VPTESTNMWZ256rrk2
:
X86::VPTESTMWZ256rrk2
;
3904
152
    case MVT::v8i32:
3905
4
      return IsTestN ? 
X86::VPTESTNMDZ256rrk2
:
X86::VPTESTMDZ256rrk2
;
3906
152
    case MVT::v4i64:
3907
8
      return IsTestN ? 
X86::VPTESTNMQZ256rrk4
:
X86::VPTESTMQZ256rrk4
;
3908
152
    case MVT::v64i8:
3909
6
      return IsTestN ? 
X86::VPTESTNMBZrrk5
:
X86::VPTESTMBZrrk1
;
3910
152
    case MVT::v32i16:
3911
8
      return IsTestN ? 
X86::VPTESTNMWZrrk6
:
X86::VPTESTMWZrrk2
;
3912
152
    case MVT::v16i32:
3913
39
      return IsTestN ? 
X86::VPTESTNMDZrrk13
:
X86::VPTESTMDZrrk26
;
3914
152
    case MVT::v8i64:
3915
53
      return IsTestN ? 
X86::VPTESTNMQZrrk14
:
X86::VPTESTMQZrrk39
;
3916
4.23k
    }
3917
4.23k
  }
3918
4.23k
3919
4.23k
  if (FoldedLoad) {
3920
30
    switch (TestVT.SimpleTy) {
3921
30
    
default: 0
llvm_unreachable0
("Unexpected VT!");
3922
30
    case MVT::v16i8:
3923
4
      return IsTestN ? 
X86::VPTESTNMBZ128rm0
: X86::VPTESTMBZ128rm;
3924
30
    case MVT::v8i16:
3925
4
      return IsTestN ? 
X86::VPTESTNMWZ128rm0
: X86::VPTESTMWZ128rm;
3926
30
    case MVT::v4i32:
3927
0
      return IsTestN ? X86::VPTESTNMDZ128rm : X86::VPTESTMDZ128rm;
3928
30
    case MVT::v2i64:
3929
4
      return IsTestN ? 
X86::VPTESTNMQZ128rm0
: X86::VPTESTMQZ128rm;
3930
30
    case MVT::v32i8:
3931
4
      return IsTestN ? 
X86::VPTESTNMBZ256rm0
: X86::VPTESTMBZ256rm;
3932
30
    case MVT::v16i16:
3933
4
      return IsTestN ? 
X86::VPTESTNMWZ256rm0
: X86::VPTESTMWZ256rm;
3934
30
    case MVT::v8i32:
3935
0
      return IsTestN ? X86::VPTESTNMDZ256rm : X86::VPTESTMDZ256rm;
3936
30
    case MVT::v4i64:
3937
0
      return IsTestN ? X86::VPTESTNMQZ256rm : X86::VPTESTMQZ256rm;
3938
30
    case MVT::v64i8:
3939
4
      return IsTestN ? 
X86::VPTESTNMBZrm0
: X86::VPTESTMBZrm;
3940
30
    case MVT::v32i16:
3941
4
      return IsTestN ? 
X86::VPTESTNMWZrm0
: X86::VPTESTMWZrm;
3942
30
    case MVT::v16i32:
3943
0
      return IsTestN ? X86::VPTESTNMDZrm : X86::VPTESTMDZrm;
3944
30
    case MVT::v8i64:
3945
2
      return IsTestN ? 
X86::VPTESTNMQZrm0
: X86::VPTESTMQZrm;
3946
4.20k
    }
3947
4.20k
  }
3948
4.20k
3949
4.20k
  if (FoldedBCast) {
3950
44
    switch (TestVT.SimpleTy) {
3951
44
    
default: 0
llvm_unreachable0
("Unexpected VT!");
3952
44
    case MVT::v4i32:
3953
6
      return IsTestN ? 
X86::VPTESTNMDZ128rmb0
: X86::VPTESTMDZ128rmb;
3954
44
    case MVT::v2i64:
3955
0
      return IsTestN ? X86::VPTESTNMQZ128rmb : X86::VPTESTMQZ128rmb;
3956
44
    case MVT::v8i32:
3957
4
      return IsTestN ? 
X86::VPTESTNMDZ256rmb0
: X86::VPTESTMDZ256rmb;
3958
44
    case MVT::v4i64:
3959
4
      return IsTestN ? 
X86::VPTESTNMQZ256rmb0
: X86::VPTESTMQZ256rmb;
3960
44
    case MVT::v16i32:
3961
18
      return IsTestN ? 
X86::VPTESTNMDZrmb1
:
X86::VPTESTMDZrmb17
;
3962
44
    case MVT::v8i64:
3963
12
      return IsTestN ? 
X86::VPTESTNMQZrmb0
: X86::VPTESTMQZrmb;
3964
4.16k
    }
3965
4.16k
  }
3966
4.16k
3967
4.16k
  switch (TestVT.SimpleTy) {
3968
4.16k
  
default: 0
llvm_unreachable0
("Unexpected VT!");
3969
4.16k
  case MVT::v16i8:
3970
94
    return IsTestN ? 
X86::VPTESTNMBZ128rr82
:
X86::VPTESTMBZ128rr12
;
3971
4.16k
  case MVT::v8i16:
3972
130
    return IsTestN ? 
X86::VPTESTNMWZ128rr113
:
X86::VPTESTMWZ128rr17
;
3973
4.16k
  case MVT::v4i32:
3974
317
    return IsTestN ? 
X86::VPTESTNMDZ128rr144
:
X86::VPTESTMDZ128rr173
;
3975
4.16k
  case MVT::v2i64:
3976
167
    return IsTestN ? 
X86::VPTESTNMQZ128rr92
:
X86::VPTESTMQZ128rr75
;
3977
4.16k
  case MVT::v32i8:
3978
66
    return IsTestN ? 
X86::VPTESTNMBZ256rr53
:
X86::VPTESTMBZ256rr13
;
3979
4.16k
  case MVT::v16i16:
3980
108
    return IsTestN ? 
X86::VPTESTNMWZ256rr96
:
X86::VPTESTMWZ256rr12
;
3981
4.16k
  case MVT::v8i32:
3982
430
    return IsTestN ? 
X86::VPTESTNMDZ256rr212
:
X86::VPTESTMDZ256rr218
;
3983
4.16k
  case MVT::v4i64:
3984
169
    return IsTestN ? 
X86::VPTESTNMQZ256rr141
:
X86::VPTESTMQZ256rr28
;
3985
4.16k
  case MVT::v64i8:
3986
152
    return IsTestN ? 
X86::VPTESTNMBZrr111
:
X86::VPTESTMBZrr41
;
3987
4.16k
  case MVT::v32i16:
3988
158
    return IsTestN ? 
X86::VPTESTNMWZrr127
:
X86::VPTESTMWZrr31
;
3989
4.16k
  case MVT::v16i32:
3990
1.80k
    return IsTestN ? 
X86::VPTESTNMDZrr350
:
X86::VPTESTMDZrr1.45k
;
3991
4.16k
  case MVT::v8i64:
3992
566
    return IsTestN ? 
X86::VPTESTNMQZrr255
:
X86::VPTESTMQZrr311
;
3993
4.16k
  }
3994
4.16k
}
3995
3996
// Try to create VPTESTM instruction. If InMask is not null, it will be used
3997
// to form a masked operation.
3998
bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc,
3999
8.47k
                                 SDValue InMask) {
4000
8.47k
  assert(Subtarget->hasAVX512() && "Expected AVX512!");
4001
8.47k
  assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 &&
4002
8.47k
         "Unexpected VT!");
4003
8.47k
4004
8.47k
  // Look for equal and not equal compares.
4005
8.47k
  ISD::CondCode CC = cast<CondCodeSDNode>(Setcc.getOperand(2))->get();
4006
8.47k
  if (CC != ISD::SETEQ && 
CC != ISD::SETNE5.99k
)
4007
3.28k
    return false;
4008
5.19k
4009
5.19k
  // See if we're comparing against zero. This should have been canonicalized
4010
5.19k
  // to RHS during lowering.
4011
5.19k
  if (!ISD::isBuildVectorAllZeros(Setcc.getOperand(1).getNode()))
4012
803
    return false;
4013
4.38k
4014
4.38k
  SDValue N0 = Setcc.getOperand(0);
4015
4.38k
4016
4.38k
  MVT CmpVT = N0.getSimpleValueType();
4017
4.38k
  MVT CmpSVT = CmpVT.getVectorElementType();
4018
4.38k
4019
4.38k
  // Start with both operands the same. We'll try to refine this.
4020
4.38k
  SDValue Src0 = N0;
4021
4.38k
  SDValue Src1 = N0;
4022
4.38k
4023
4.38k
  {
4024
4.38k
    // Look through single use bitcasts.
4025
4.38k
    SDValue N0Temp = N0;
4026
4.38k
    if (N0Temp.getOpcode() == ISD::BITCAST && 
N0Temp.hasOneUse()192
)
4027
190
      N0Temp = N0.getOperand(0);
4028
4.38k
4029
4.38k
     // Look for single use AND.
4030
4.38k
    if (N0Temp.getOpcode() == ISD::AND && 
N0Temp.hasOneUse()541
) {
4031
345
      Src0 = N0Temp.getOperand(0);
4032
345
      Src1 = N0Temp.getOperand(1);
4033
345
    }
4034
4.38k
  }
4035
4.38k
4036
4.38k
  // Without VLX we need to widen the load.
4037
4.38k
  bool Widen = !Subtarget->hasVLX() && 
!CmpVT.is512BitVector()1.95k
;
4038
4.38k
4039
4.38k
  // We can only fold loads if the sources are unique.
4040
4.38k
  bool CanFoldLoads = Src0 != Src1;
4041
4.38k
4042
4.38k
  // Try to fold loads unless we need to widen.
4043
4.38k
  bool FoldedLoad = false;
4044
4.38k
  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Load;
4045
4.38k
  if (!Widen && 
CanFoldLoads3.73k
) {
4046
276
    Load = Src1;
4047
276
    FoldedLoad = tryFoldLoad(Root, N0.getNode(), Load, Tmp0, Tmp1, Tmp2, Tmp3,
4048
276
                             Tmp4);
4049
276
    if (!FoldedLoad) {
4050
247
      // And is computative.
4051
247
      Load = Src0;
4052
247
      FoldedLoad = tryFoldLoad(Root, N0.getNode(), Load, Tmp0, Tmp1, Tmp2,
4053
247
                               Tmp3, Tmp4);
4054
247
      if (FoldedLoad)
4055
1
        std::swap(Src0, Src1);
4056
247
    }
4057
276
  }
4058
4.38k
4059
4.38k
  auto findBroadcastedOp = [](SDValue Src, MVT CmpSVT, SDNode *&Parent) {
4060
350
    // Look through single use bitcasts.
4061
350
    if (Src.getOpcode() == ISD::BITCAST && 
Src.hasOneUse()0
)
4062
0
      Src = Src.getOperand(0);
4063
350
4064
350
    if (Src.getOpcode() == X86ISD::VBROADCAST && 
Src.hasOneUse()44
) {
4065
44
      Parent = Src.getNode();
4066
44
      Src = Src.getOperand(0);
4067
44
      if (Src.getSimpleValueType() == CmpSVT)
4068
44
        return Src;
4069
306
    }
4070
306
4071
306
    return SDValue();
4072
306
  };
4073
4.38k
4074
4.38k
  // If we didn't fold a load, try to match broadcast. No widening limitation
4075
4.38k
  // for this. But only 32 and 64 bit types are supported.
4076
4.38k
  bool FoldedBCast = false;
4077
4.38k
  if (!FoldedLoad && 
CanFoldLoads4.35k
&&
4078
4.38k
      
(315
CmpSVT == MVT::i32315
||
CmpSVT == MVT::i64212
)) {
4079
197
    SDNode *ParentNode = nullptr;
4080
197
    if ((Load = findBroadcastedOp(Src1, CmpSVT, ParentNode))) {
4081
44
      FoldedBCast = tryFoldLoad(Root, ParentNode, Load, Tmp0,
4082
44
                                Tmp1, Tmp2, Tmp3, Tmp4);
4083
44
    }
4084
197
4085
197
    // Try the other operand.
4086
197
    if (!FoldedBCast) {
4087
153
      if ((Load = findBroadcastedOp(Src0, CmpSVT, ParentNode))) {
4088
0
        FoldedBCast = tryFoldLoad(Root, ParentNode, Load, Tmp0,
4089
0
                                  Tmp1, Tmp2, Tmp3, Tmp4);
4090
0
        if (FoldedBCast)
4091
0
          std::swap(Src0, Src1);
4092
0
      }
4093
153
    }
4094
197
  }
4095
4.38k
4096
4.38k
  auto getMaskRC = [](MVT MaskVT) {
4097
663
    switch (MaskVT.SimpleTy) {
4098
663
    
default: 0
llvm_unreachable0
("Unexpected VT!");
4099
663
    
case MVT::v2i1: return X86::VK2RegClassID117
;
4100
663
    
case MVT::v4i1: return X86::VK4RegClassID227
;
4101
663
    
case MVT::v8i1: return X86::VK8RegClassID260
;
4102
663
    
case MVT::v16i1: return X86::VK16RegClassID39
;
4103
663
    
case MVT::v32i1: return X86::VK32RegClassID20
;
4104
663
    
case MVT::v64i1: return X86::VK64RegClassID0
;
4105
663
    }
4106
663
  };
4107
4.38k
4108
4.38k
  bool IsMasked = InMask.getNode() != nullptr;
4109
4.38k
4110
4.38k
  SDLoc dl(Root);
4111
4.38k
4112
4.38k
  MVT ResVT = Setcc.getSimpleValueType();
4113
4.38k
  MVT MaskVT = ResVT;
4114
4.38k
  if (Widen) {
4115
649
    // Widen the inputs using insert_subreg or copy_to_regclass.
4116
649
    unsigned Scale = CmpVT.is128BitVector() ? 
4341
:
2308
;
4117
649
    unsigned SubReg = CmpVT.is128BitVector() ? 
X86::sub_xmm341
:
X86::sub_ymm308
;
4118
649
    unsigned NumElts = CmpVT.getVectorNumElements() * Scale;
4119
649
    CmpVT = MVT::getVectorVT(CmpSVT, NumElts);
4120
649
    MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
4121
649
    SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, dl,
4122
649
                                                     CmpVT), 0);
4123
649
    Src0 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src0);
4124
649
4125
649
    assert(!FoldedLoad && "Shouldn't have folded the load");
4126
649
    if (!FoldedBCast)
4127
635
      Src1 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src1);
4128
649
4129
649
    if (IsMasked) {
4130
14
      // Widen the mask.
4131
14
      unsigned RegClass = getMaskRC(MaskVT);
4132
14
      SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
4133
14
      InMask = SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
4134
14
                                              dl, MaskVT, InMask, RC), 0);
4135
14
    }
4136
649
  }
4137
4.38k
4138
4.38k
  bool IsTestN = CC == ISD::SETEQ;
4139
4.38k
  unsigned Opc = getVPTESTMOpc(CmpVT, IsTestN, FoldedLoad, FoldedBCast,
4140
4.38k
                               IsMasked);
4141
4.38k
4142
4.38k
  MachineSDNode *CNode;
4143
4.38k
  if (FoldedLoad || 
FoldedBCast4.35k
) {
4144
74
    SDVTList VTs = CurDAG->getVTList(MaskVT, MVT::Other);
4145
74
4146
74
    if (IsMasked) {
4147
0
      SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
4148
0
                        Load.getOperand(0) };
4149
0
      CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
4150
74
    } else {
4151
74
      SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4,
4152
74
                        Load.getOperand(0) };
4153
74
      CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
4154
74
    }
4155
74
4156
74
    // Update the chain.
4157
74
    ReplaceUses(Load.getValue(1), SDValue(CNode, 1));
4158
74
    // Record the mem-refs
4159
74
    CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(Load)->getMemOperand()});
4160
4.31k
  } else {
4161
4.31k
    if (IsMasked)
4162
152
      CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1);
4163
4.16k
    else
4164
4.16k
      CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, Src0, Src1);
4165
4.31k
  }
4166
4.38k
4167
4.38k
  // If we widened, we need to shrink the mask VT.
4168
4.38k
  if (Widen) {
4169
649
    unsigned RegClass = getMaskRC(ResVT);
4170
649
    SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32);
4171
649
    CNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
4172
649
                                   dl, ResVT, SDValue(CNode, 0), RC);
4173
649
  }
4174
4.38k
4175
4.38k
  ReplaceUses(SDValue(Root, 0), SDValue(CNode, 0));
4176
4.38k
  CurDAG->RemoveDeadNode(Root);
4177
4.38k
  return true;
4178
4.38k
}
4179
4180
6.28M
void X86DAGToDAGISel::Select(SDNode *Node) {
4181
6.28M
  MVT NVT = Node->getSimpleValueType(0);
4182
6.28M
  unsigned Opcode = Node->getOpcode();
4183
6.28M
  SDLoc dl(Node);
4184
6.28M
4185
6.28M
  if (Node->isMachineOpcode()) {
4186
2.30k
    LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
4187
2.30k
    Node->setNodeId(-1);
4188
2.30k
    return;   // Already selected.
4189
2.30k
  }
4190
6.28M
4191
6.28M
  switch (Opcode) {
4192
6.28M
  
default: break5.57M
;
4193
6.28M
  case ISD::INTRINSIC_VOID: {
4194
462
    unsigned IntNo = Node->getConstantOperandVal(1);
4195
462
    switch (IntNo) {
4196
462
    
default: break447
;
4197
462
    case Intrinsic::x86_sse3_monitor:
4198
15
    case Intrinsic::x86_monitorx:
4199
15
    case Intrinsic::x86_clzero: {
4200
15
      bool Use64BitPtr = Node->getOperand(2).getValueType() == MVT::i64;
4201
15
4202
15
      unsigned Opc = 0;
4203
15
      switch (IntNo) {
4204
15
      case Intrinsic::x86_sse3_monitor:
4205
9
        if (!Subtarget->hasSSE3())
4206
0
          break;
4207
9
        Opc = Use64BitPtr ? 
X86::MONITOR64rrr5
:
X86::MONITOR32rrr4
;
4208
9
        break;
4209
9
      case Intrinsic::x86_monitorx:
4210
4
        if (!Subtarget->hasMWAITX())
4211
0
          break;
4212
4
        Opc = Use64BitPtr ? X86::MONITORX64rrr : 
X86::MONITORX32rrr0
;
4213
4
        break;
4214
4
      case Intrinsic::x86_clzero:
4215
2
        if (!Subtarget->hasCLZERO())
4216
0
          break;
4217
2
        Opc = Use64BitPtr ? 
X86::CLZERO64r1
:
X86::CLZERO32r1
;
4218
2
        break;
4219
15
      }
4220
15
4221
15
      if (Opc) {
4222
15
        unsigned PtrReg = Use64BitPtr ? 
X86::RAX10
:
X86::EAX5
;
4223
15
        SDValue Chain = CurDAG->getCopyToReg(Node->getOperand(0), dl, PtrReg,
4224
15
                                             Node->getOperand(2), SDValue());
4225
15
        SDValue InFlag = Chain.getValue(1);
4226
15
4227
15
        if (IntNo == Intrinsic::x86_sse3_monitor ||
4228
15
            
IntNo == Intrinsic::x86_monitorx6
) {
4229
13
          // Copy the other two operands to ECX and EDX.
4230
13
          Chain = CurDAG->getCopyToReg(Chain, dl, X86::ECX, Node->getOperand(3),
4231
13
                                       InFlag);
4232
13
          InFlag = Chain.getValue(1);
4233
13
          Chain = CurDAG->getCopyToReg(Chain, dl, X86::EDX, Node->getOperand(4),
4234
13
                                       InFlag);
4235
13
          InFlag = Chain.getValue(1);
4236
13
        }
4237
15
4238
15
        MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other,
4239
15
                                                      { Chain, InFlag});
4240
15
        ReplaceNode(Node, CNode);
4241
15
        return;
4242
15
      }
4243
447
    }
4244
447
    }
4245
447
4246
447
    break;
4247
447
  }
4248
447
  case ISD::BRIND: {
4249
363
    if (Subtarget->isTargetNaCl())
4250
0
      // NaCl has its own pass where jmp %r32 are converted to jmp %r64. We
4251
0
      // leave the instruction alone.
4252
0
      break;
4253
363
    if (Subtarget->isTarget64BitILP32()) {
4254
2
      // Converts a 32-bit register to a 64-bit, zero-extended version of
4255
2
      // it. This is needed because x86-64 can do many things, but jmp %r32
4256
2
      // ain't one of them.
4257
2
      const SDValue &Target = Node->getOperand(1);
4258
2
      assert(Target.getSimpleValueType() == llvm::MVT::i32);
4259
2
      SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, EVT(MVT::i64));
4260
2
      SDValue Brind = CurDAG->getNode(ISD::BRIND, dl, MVT::Other,
4261
2
                                      Node->getOperand(0), ZextTarget);
4262
2
      ReplaceNode(Node, Brind.getNode());
4263
2
      SelectCode(ZextTarget.getNode());
4264
2
      SelectCode(Brind.getNode());
4265
2
      return;
4266
2
    }
4267
361
    break;
4268
361
  }
4269
8.61k
  case X86ISD::GlobalBaseReg:
4270
8.61k
    ReplaceNode(Node, getGlobalBaseReg());
4271
8.61k
    return;
4272
361
4273
114k
  case ISD::BITCAST:
4274
114k
    // Just drop all 128/256/512-bit bitcasts.
4275
114k
    if (NVT.is512BitVector() || 
NVT.is256BitVector()111k
||
NVT.is128BitVector()97.6k
||
4276
114k
        
NVT == MVT::f12811.9k
) {
4277
102k
      ReplaceUses(SDValue(Node, 0), Node->getOperand(0));
4278
102k
      CurDAG->RemoveDeadNode(Node);
4279
102k
      return;
4280
102k
    }
4281
11.9k
    break;
4282
11.9k
4283
14.9k
  case ISD::VSELECT: {
4284
14.9k
    // Replace VSELECT with non-mask conditions with with BLENDV.
4285
14.9k
    if (Node->getOperand(0).getValueType().getVectorElementType() == MVT::i1)
4286
11.5k
      break;
4287
3.36k
4288
3.36k
    assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
4289
3.36k
    SDValue Blendv = CurDAG->getNode(
4290
3.36k
        X86ISD::BLENDV, SDLoc(Node), Node->getValueType(0), Node->getOperand(0),
4291
3.36k
        Node->getOperand(1), Node->getOperand(2));
4292
3.36k
    ReplaceNode(Node, Blendv.getNode());
4293
3.36k
    SelectCode(Blendv.getNode());
4294
3.36k
    // We already called ReplaceUses.
4295
3.36k
    return;
4296
3.36k
  }
4297
3.36k
4298
24.7k
  case ISD::SRL:
4299
24.7k
    if (matchBitExtract(Node))
4300
170
      return;
4301
24.5k
    LLVM_FALLTHROUGH;
4302
43.2k
  case ISD::SRA:
4303
43.2k
  case ISD::SHL:
4304
43.2k
    if (tryShiftAmountMod(Node))
4305
681
      return;
4306
42.5k
    break;
4307
42.5k
4308
52.5k
  case ISD::AND:
4309
52.5k
    if (NVT.isVector() && 
NVT.getVectorElementType() == MVT::i118.8k
) {
4310
1.48k
      // Try to form a masked VPTESTM. Operands can be in either order.
4311
1.48k
      SDValue N0 = Node->getOperand(0);
4312
1.48k
      SDValue N1 = Node->getOperand(1);
4313
1.48k
      if (N0.getOpcode() == ISD::SETCC && 
N0.hasOneUse()925
&&
4314
1.48k
          
tryVPTESTM(Node, N0, N1)901
)
4315
110
        return;
4316
1.37k
      if (N1.getOpcode() == ISD::SETCC && 
N1.hasOneUse()306
&&
4317
1.37k
          
tryVPTESTM(Node, N1, N0)306
)
4318
42
        return;
4319
52.3k
    }
4320
52.3k
4321
52.3k
    if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) {
4322
78
      ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
4323
78
      CurDAG->RemoveDeadNode(Node);
4324
78
      return;
4325
78
    }
4326
52.2k
    if (matchBitExtract(Node))
4327
472
      return;
4328
51.8k
    if (AndImmShrink && shrinkAndImmediate(Node))
4329
306
      return;
4330
51.4k
4331
51.4k
    LLVM_FALLTHROUGH;
4332
95.2k
  case ISD::OR:
4333
95.2k
  case ISD::XOR:
4334
95.2k
    if (tryShrinkShlLogicImm(Node))
4335
158
      return;
4336
95.0k
4337
95.0k
    LLVM_FALLTHROUGH;
4338
210k
  case ISD::ADD:
4339
210k
  case ISD::SUB: {
4340
210k
    // Try to avoid folding immediates with multiple uses for optsize.
4341
210k
    // This code tries to select to register form directly to avoid going
4342
210k
    // through the isel table which might fold the immediate. We can't change
4343
210k
    // the patterns on the add/sub/and/or/xor with immediate paterns in the
4344
210k
    // tablegen files to check immediate use count without making the patterns
4345
210k
    // unavailable to the fast-isel table.
4346
210k
    if (!OptForSize)
4347
207k
      break;
4348
3.33k
4349
3.33k
    // Only handle i8/i16/i32/i64.
4350
3.33k
    if (NVT != MVT::i8 && 
NVT != MVT::i163.02k
&&
NVT != MVT::i323.02k
&&
NVT != MVT::i64813
)
4351
328
      break;
4352
3.00k
4353
3.00k
    SDValue N0 = Node->getOperand(0);
4354
3.00k
    SDValue N1 = Node->getOperand(1);
4355
3.00k
4356
3.00k
    ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1);
4357
3.00k
    if (!Cst)
4358
1.43k
      break;
4359
1.56k
4360
1.56k
    int64_t Val = Cst->getSExtValue();
4361
1.56k
4362
1.56k
    // Make sure its an immediate that is considered foldable.
4363
1.56k
    // FIXME: Handle unsigned 32 bit immediates for 64-bit AND.
4364
1.56k
    if (!isInt<8>(Val) && 
!isInt<32>(Val)862
)
4365
164
      break;
4366
1.40k
4367
1.40k
    // Check if we should avoid folding this immediate.
4368
1.40k
    if (!shouldAvoidImmediateInstFormsForSize(N1.getNode()))
4369
1.02k
      break;
4370
378
4371
378
    // We should not fold the immediate. So we need a register form instead.
4372
378
    unsigned ROpc, MOpc;
4373
378
    switch (NVT.SimpleTy) {
4374
378
    
default: 0
llvm_unreachable0
("Unexpected VT!");
4375
378
    case MVT::i8:
4376
36
      switch (Opcode) {
4377
36
      
default: 0
llvm_unreachable0
("Unexpected opcode!");
4378
36
      
case ISD::ADD: ROpc = X86::ADD8rr; MOpc = X86::ADD8rm; break0
;
4379
36
      
case ISD::SUB: ROpc = X86::SUB8rr; MOpc = X86::SUB8rm; break0
;
4380
36
      case ISD::AND: ROpc = X86::AND8rr; MOpc = X86::AND8rm; break;
4381
36
      
case ISD::OR: ROpc = X86::OR8rr; MOpc = X86::OR8rm; break0
;
4382
36
      
case ISD::XOR: ROpc = X86::XOR8rr; MOpc = X86::XOR8rm; break0
;
4383
36
      }
4384
36
      break;
4385
36
    case MVT::i16:
4386
0
      switch (Opcode) {
4387
0
      default: llvm_unreachable("Unexpected opcode!");
4388
0
      case ISD::ADD: ROpc = X86::ADD16rr; MOpc = X86::ADD16rm; break;
4389
0
      case ISD::SUB: ROpc = X86::SUB16rr; MOpc = X86::SUB16rm; break;
4390
0
      case ISD::AND: ROpc = X86::AND16rr; MOpc = X86::AND16rm; break;
4391
0
      case ISD::OR:  ROpc = X86::OR16rr;  MOpc = X86::OR16rm;  break;
4392
0
      case ISD::XOR: ROpc = X86::XOR16rr; MOpc = X86::XOR16rm; break;
4393
0
      }
4394
0
      break;
4395
338
    case MVT::i32:
4396
338
      switch (Opcode) {
4397
338
      
default: 0
llvm_unreachable0
("Unexpected opcode!");
4398
338
      
case ISD::ADD: ROpc = X86::ADD32rr; MOpc = X86::ADD32rm; break63
;
4399
338
      
case ISD::SUB: ROpc = X86::SUB32rr; MOpc = X86::SUB32rm; break0
;
4400
338
      
case ISD::AND: ROpc = X86::AND32rr; MOpc = X86::AND32rm; break220
;
4401
338
      
case ISD::OR: ROpc = X86::OR32rr; MOpc = X86::OR32rm; break13
;
4402
338
      
case ISD::XOR: ROpc = X86::XOR32rr; MOpc = X86::XOR32rm; break42
;
4403
338
      }
4404
338
      break;
4405
338
    case MVT::i64:
4406
4
      switch (Opcode) {
4407
4
      
default: 0
llvm_unreachable0
("Unexpected opcode!");
4408
4
      case ISD::ADD: ROpc = X86::ADD64rr; MOpc = X86::ADD64rm; break;
4409
4
      
case ISD::SUB: ROpc = X86::SUB64rr; MOpc = X86::SUB64rm; break0
;
4410
4
      
case ISD::AND: ROpc = X86::AND64rr; MOpc = X86::AND64rm; break0
;
4411
4
      
case ISD::OR: ROpc = X86::OR64rr; MOpc = X86::OR64rm; break0
;
4412
4
      
case ISD::XOR: ROpc = X86::XOR64rr; MOpc = X86::XOR64rm; break0
;
4413
4
      }
4414
4
      break;
4415
378
    }
4416
378
4417
378
    // Ok this is a AND/OR/XOR/ADD/SUB with constant.
4418
378
4419
378
    // If this is a not a subtract, we can still try to fold a load.
4420
378
    if (Opcode != ISD::SUB) {
4421
378
      SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4422
378
      if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4423
8
        SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
4424
8
        SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
4425
8
        MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4426
8
        // Update the chain.
4427
8
        ReplaceUses(N0.getValue(1), SDValue(CNode, 2));
4428
8
        // Record the mem-refs
4429
8
        CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N0)->getMemOperand()});
4430
8
        ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
4431
8
        CurDAG->RemoveDeadNode(Node);
4432
8
        return;
4433
8
      }
4434
370
    }
4435
370
4436
370
    CurDAG->SelectNodeTo(Node, ROpc, NVT, MVT::i32, N0, N1);
4437
370
    return;
4438
370
  }
4439
370
4440
370
  case X86ISD::SMUL:
4441
92
    // i16/i32/i64 are handled with isel patterns.
4442
92
    if (NVT != MVT::i8)
4443
77
      break;
4444
15
    LLVM_FALLTHROUGH;
4445
140
  case X86ISD::UMUL: {
4446
140
    SDValue N0 = Node->getOperand(0);
4447
140
    SDValue N1 = Node->getOperand(1);
4448
140
4449
140
    unsigned LoReg, ROpc, MOpc;
4450
140
    switch (NVT.SimpleTy) {
4451
140
    
default: 0
llvm_unreachable0
("Unsupported VT!");
4452
140
    case MVT::i8:
4453
28
      LoReg = X86::AL;
4454
28
      ROpc = Opcode == X86ISD::SMUL ? 
X86::IMUL8r15
:
X86::MUL8r13
;
4455
28
      MOpc = Opcode == X86ISD::SMUL ? 
X86::IMUL8m15
:
X86::MUL8m13
;
4456
28
      break;
4457
140
    case MVT::i16:
4458
9
      LoReg = X86::AX;
4459
9
      ROpc = X86::MUL16r;
4460
9
      MOpc = X86::MUL16m;
4461
9
      break;
4462
140
    case MVT::i32:
4463
30
      LoReg = X86::EAX;
4464
30
      ROpc = X86::MUL32r;
4465
30
      MOpc = X86::MUL32m;
4466
30
      break;
4467
140
    case MVT::i64:
4468
73
      LoReg = X86::RAX;
4469
73
      ROpc = X86::MUL64r;
4470
73
      MOpc = X86::MUL64m;
4471
73
      break;
4472
140
    }
4473
140
4474
140
    SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4475
140
    bool FoldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
4476
140
    // Multiply is commmutative.
4477
140
    if (!FoldedLoad) {
4478
129
      FoldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
4479
129
      if (FoldedLoad)
4480
11
        std::swap(N0, N1);
4481
129
    }
4482
140
4483
140
    SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
4484
140
                                          N0, SDValue()).getValue(1);
4485
140
4486
140
    MachineSDNode *CNode;
4487
140
    if (FoldedLoad) {
4488
22
      // i16/i32/i64 use an instruction that produces a low and high result even
4489
22
      // though only the low result is used.
4490
22
      SDVTList VTs;
4491
22
      if (NVT == MVT::i8)
4492
8
        VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
4493
14
      else
4494
14
        VTs = CurDAG->getVTList(NVT, NVT, MVT::i32, MVT::Other);
4495
22
4496
22
      SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
4497
22
                        InFlag };
4498
22
      CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4499
22
4500
22
      // Update the chain.
4501
22
      ReplaceUses(N1.getValue(1), SDValue(CNode, NVT == MVT::i8 ? 
28
:
314
));
4502
22
      // Record the mem-refs
4503
22
      CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
4504
118
    } else {
4505
118
      // i16/i32/i64 use an instruction that produces a low and high result even
4506
118
      // though only the low result is used.
4507
118
      SDVTList VTs;
4508
118
      if (NVT == MVT::i8)
4509
20
        VTs = CurDAG->getVTList(NVT, MVT::i32);
4510
98
      else
4511
98
        VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);
4512
118
4513
118
      CNode = CurDAG->getMachineNode(ROpc, dl, VTs, {N1, InFlag});
4514
118
    }
4515
140
4516
140
    ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
4517
140
    ReplaceUses(SDValue(Node, 1), SDValue(CNode, NVT == MVT::i8 ? 
128
:
2112
));
4518
140
    CurDAG->RemoveDeadNode(Node);
4519
140
    return;
4520
140
  }
4521
140
4522
1.83k
  case ISD::SMUL_LOHI:
4523
1.83k
  case ISD::UMUL_LOHI: {
4524
1.83k
    SDValue N0 = Node->getOperand(0);
4525
1.83k
    SDValue N1 = Node->getOperand(1);
4526
1.83k
4527
1.83k
    unsigned Opc, MOpc;
4528
1.83k
    bool isSigned = Opcode == ISD::SMUL_LOHI;
4529
1.83k
    if (!isSigned) {
4530
1.62k
      switch (NVT.SimpleTy) {
4531
1.62k
      
default: 0
llvm_unreachable0
("Unsupported VT!");
4532
1.62k
      
case MVT::i32: Opc = X86::MUL32r; MOpc = X86::MUL32m; break1.08k
;
4533
1.62k
      
case MVT::i64: Opc = X86::MUL64r; MOpc = X86::MUL64m; break535
;
4534
206
      }
4535
206
    } else {
4536
206
      switch (NVT.SimpleTy) {
4537
206
      
default: 0
llvm_unreachable0
("Unsupported VT!");
4538
206
      
case MVT::i32: Opc = X86::IMUL32r; MOpc = X86::IMUL32m; break48
;
4539
206
      
case MVT::i64: Opc = X86::IMUL64r; MOpc = X86::IMUL64m; break158
;
4540
1.83k
      }
4541
1.83k
    }
4542
1.83k
4543
1.83k
    unsigned SrcReg, LoReg, HiReg;
4544
1.83k
    switch (Opc) {
4545
1.83k
    
default: 0
llvm_unreachable0
("Unknown MUL opcode!");
4546
1.83k
    case X86::IMUL32r:
4547
1.13k
    case X86::MUL32r:
4548
1.13k
      SrcReg = LoReg = X86::EAX; HiReg = X86::EDX;
4549
1.13k
      break;
4550
1.13k
    case X86::IMUL64r:
4551
693
    case X86::MUL64r:
4552
693
      SrcReg = LoReg = X86::RAX; HiReg = X86::RDX;
4553
693
      break;
4554
1.83k
    }
4555
1.83k
4556
1.83k
    SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4557
1.83k
    bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
4558
1.83k
    // Multiply is commmutative.
4559
1.83k
    if (!foldedLoad) {
4560
1.80k
      foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
4561
1.80k
      if (foldedLoad)
4562
121
        std::swap(N0, N1);
4563
1.80k
    }
4564
1.83k
4565
1.83k
    SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, SrcReg,
4566
1.83k
                                          N0, SDValue()).getValue(1);
4567
1.83k
    if (foldedLoad) {
4568
146
      SDValue Chain;
4569
146
      MachineSDNode *CNode = nullptr;
4570
146
      SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
4571
146
                        InFlag };
4572
146
      SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
4573
146
      CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
4574
146
      Chain = SDValue(CNode, 0);
4575
146
      InFlag = SDValue(CNode, 1);
4576
146
4577
146
      // Update the chain.
4578
146
      ReplaceUses(N1.getValue(1), Chain);
4579
146
      // Record the mem-refs
4580
146
      CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
4581
1.68k
    } else {
4582
1.68k
      SDValue Ops[] = { N1, InFlag };
4583
1.68k
      SDVTList VTs = CurDAG->getVTList(MVT::Glue);
4584
1.68k
      SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
4585
1.68k
      InFlag = SDValue(CNode, 0);
4586
1.68k
    }
4587
1.83k
4588
1.83k
    // Copy the low half of the result, if it is needed.
4589
1.83k
    if (!SDValue(Node, 0).use_empty()) {
4590
1.29k
      assert(LoReg && "Register for low half is not defined!");
4591
1.29k
      SDValue ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg,
4592
1.29k
                                             NVT, InFlag);
4593
1.29k
      InFlag = ResLo.getValue(2);
4594
1.29k
      ReplaceUses(SDValue(Node, 0), ResLo);
4595
1.29k
      LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
4596
1.29k
                 dbgs() << '\n');
4597
1.29k
    }
4598
1.83k
    // Copy the high half of the result, if it is needed.
4599
1.83k
    if (!SDValue(Node, 1).use_empty()) {
4600
1.83k
      assert(HiReg && "Register for high half is not defined!");
4601
1.83k
      SDValue ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg,
4602
1.83k
                                             NVT, InFlag);
4603
1.83k
      InFlag = ResHi.getValue(2);
4604
1.83k
      ReplaceUses(SDValue(Node, 1), ResHi);
4605
1.83k
      LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
4606
1.83k
                 dbgs() << '\n');
4607
1.83k
    }
4608
1.83k
4609
1.83k
    CurDAG->RemoveDeadNode(Node);
4610
1.83k
    return;
4611
1.83k
  }
4612
1.83k
4613
1.83k
  case ISD::SDIVREM:
4614
1.76k
  case ISD::UDIVREM: {
4615
1.76k
    SDValue N0 = Node->getOperand(0);
4616
1.76k
    SDValue N1 = Node->getOperand(1);
4617
1.76k
4618
1.76k
    unsigned Opc, MOpc;
4619
1.76k
    bool isSigned = Opcode == ISD::SDIVREM;
4620
1.76k
    if (!isSigned) {
4621
1.43k
      switch (NVT.SimpleTy) {
4622
1.43k
      
default: 0
llvm_unreachable0
("Unsupported VT!");
4623
1.43k
      
case MVT::i8: Opc = X86::DIV8r; MOpc = X86::DIV8m; break39
;
4624
1.43k
      
case MVT::i16: Opc = X86::DIV16r; MOpc = X86::DIV16m; break10
;
4625
1.43k
      
case MVT::i32: Opc = X86::DIV32r; MOpc = X86::DIV32m; break736
;
4626
1.43k
      
case MVT::i64: Opc = X86::DIV64r; MOpc = X86::DIV64m; break653
;
4627
322
      }
4628
322
    } else {
4629
322
      switch (NVT.SimpleTy) {
4630
322
      
default: 0
llvm_unreachable0
("Unsupported VT!");
4631
322
      
case MVT::i8: Opc = X86::IDIV8r; MOpc = X86::IDIV8m; break101
;
4632
322
      
case MVT::i16: Opc = X86::IDIV16r; MOpc = X86::IDIV16m; break23
;
4633
322
      
case MVT::i32: Opc = X86::IDIV32r; MOpc = X86::IDIV32m; break150
;
4634
322
      
case MVT::i64: Opc = X86::IDIV64r; MOpc = X86::IDIV64m; break48
;
4635
1.76k
      }
4636
1.76k
    }
4637
1.76k
4638
1.76k
    unsigned LoReg, HiReg, ClrReg;
4639
1.76k
    unsigned SExtOpcode;
4640
1.76k
    switch (NVT.SimpleTy) {
4641
1.76k
    
default: 0
llvm_unreachable0
("Unsupported VT!");
4642
1.76k
    case MVT::i8:
4643
140
      LoReg = X86::AL;  ClrReg = HiReg = X86::AH;
4644
140
      SExtOpcode = X86::CBW;
4645
140
      break;
4646
1.76k
    case MVT::i16:
4647
33
      LoReg = X86::AX;  HiReg = X86::DX;
4648
33
      ClrReg = X86::DX;
4649
33
      SExtOpcode = X86::CWD;
4650
33
      break;
4651
1.76k
    case MVT::i32:
4652
886
      LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
4653
886
      SExtOpcode = X86::CDQ;
4654
886
      break;
4655
1.76k
    case MVT::i64:
4656
701
      LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
4657
701
      SExtOpcode = X86::CQO;
4658
701
      break;
4659
1.76k
    }
4660
1.76k
4661
1.76k
    SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4662
1.76k
    bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
4663
1.76k
    bool signBitIsZero = CurDAG->SignBitIsZero(N0);
4664
1.76k
4665
1.76k
    SDValue InFlag;
4666
1.76k
    if (NVT == MVT::i8 && 
(140
!isSigned140
||
signBitIsZero101
)) {
4667
41
      // Special case for div8, just use a move with zero extension to AX to
4668
41
      // clear the upper 8 bits (AH).
4669
41
      SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain;
4670
41
      MachineSDNode *Move;
4671
41
      if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4672
9
        SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
4673
9
        Move = CurDAG->getMachineNode(X86::MOVZX32rm8, dl, MVT::i32,
4674
9
                                      MVT::Other, Ops);
4675
9
        Chain = SDValue(Move, 1);
4676
9
        ReplaceUses(N0.getValue(1), Chain);
4677
9
        // Record the mem-refs
4678
9
        CurDAG->setNodeMemRefs(Move, {cast<LoadSDNode>(N0)->getMemOperand()});
4679
32
      } else {
4680
32
        Move = CurDAG->getMachineNode(X86::MOVZX32rr8, dl, MVT::i32, N0);
4681
32
        Chain = CurDAG->getEntryNode();
4682
32
      }
4683
41
      Chain  = CurDAG->getCopyToReg(Chain, dl, X86::EAX, SDValue(Move, 0),
4684
41
                                    SDValue());
4685
41
      InFlag = Chain.getValue(1);
4686
1.71k
    } else {
4687
1.71k
      InFlag =
4688
1.71k
        CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
4689
1.71k
                             LoReg, N0, SDValue()).getValue(1);
4690
1.71k
      if (isSigned && 
!signBitIsZero320
) {
4691
304
        // Sign extend the low part into the high part.
4692
304
        InFlag =
4693
304
          SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0);
4694
1.41k
      } else {
4695
1.41k
        // Zero out the high part, effectively zero extending the input.
4696
1.41k
        SDValue ClrNode = SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, NVT), 0);
4697
1.41k
        switch (NVT.SimpleTy) {
4698
1.41k
        case MVT::i16:
4699
15
          ClrNode =
4700
15
              SDValue(CurDAG->getMachineNode(
4701
15
                          TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode,
4702
15
                          CurDAG->getTargetConstant(X86::sub_16bit, dl,
4703
15
                                                    MVT::i32)),
4704
15
                      0);
4705
15
          break;
4706
1.41k
        case MVT::i32:
4707
747
          break;
4708
1.41k
        case MVT::i64:
4709
653
          ClrNode =
4710
653
              SDValue(CurDAG->getMachineNode(
4711
653
                          TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
4712
653
                          CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode,
4713
653
                          CurDAG->getTargetConstant(X86::sub_32bit, dl,
4714
653
                                                    MVT::i32)),
4715
653
                      0);
4716
653
          break;
4717
1.41k
        default:
4718
0
          llvm_unreachable("Unexpected division source");
4719
1.41k
        }
4720
1.41k
4721
1.41k
        InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg,
4722
1.41k
                                      ClrNode, InFlag).getValue(1);
4723
1.41k
      }
4724
1.71k
    }
4725
1.76k
4726
1.76k
    if (foldedLoad) {
4727
92
      SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
4728
92
                        InFlag };
4729
92
      MachineSDNode *CNode =
4730
92
        CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops);
4731
92
      InFlag = SDValue(CNode, 1);
4732
92
      // Update the chain.
4733
92
      ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
4734
92
      // Record the mem-refs
4735
92
      CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
4736
1.66k
    } else {
4737
1.66k
      InFlag =
4738
1.66k
        SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N1, InFlag), 0);
4739
1.66k
    }
4740
1.76k
4741
1.76k
    // Prevent use of AH in a REX instruction by explicitly copying it to
4742
1.76k
    // an ABCD_L register.
4743
1.76k
    //
4744
1.76k
    // The current assumption of the register allocator is that isel
4745
1.76k
    // won't generate explicit references to the GR8_ABCD_H registers. If
4746
1.76k
    // the allocator and/or the backend get enhanced to be more robust in
4747
1.76k
    // that regard, this can be, and should be, removed.
4748
1.76k
    if (HiReg == X86::AH && 
!SDValue(Node, 1).use_empty()140
) {
4749
118
      SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8);
4750
118
      unsigned AHExtOpcode =
4751
118
          isSigned ? 
X86::MOVSX32rr8_NOREX95
:
X86::MOVZX32rr8_NOREX23
;
4752
118
4753
118
      SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32,
4754
118
                                             MVT::Glue, AHCopy, InFlag);
4755
118
      SDValue Result(RNode, 0);
4756
118
      InFlag = SDValue(RNode, 1);
4757
118
4758
118
      Result =
4759
118
          CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);
4760
118
4761
118
      ReplaceUses(SDValue(Node, 1), Result);
4762
118
      LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
4763
118
                 dbgs() << '\n');
4764
118
    }
4765
1.76k
    // Copy the division (low) result, if it is needed.
4766
1.76k
    if (!SDValue(Node, 0).use_empty()) {
4767
833
      SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
4768
833
                                                LoReg, NVT, InFlag);
4769
833
      InFlag = Result.getValue(2);
4770
833
      ReplaceUses(SDValue(Node, 0), Result);
4771
833
      LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
4772
833
                 dbgs() << '\n');
4773
833
    }
4774
1.76k
    // Copy the remainder (high) result, if it is needed.
4775
1.76k
    if (!SDValue(Node, 1).use_empty()) {
4776
868
      SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
4777
868
                                              HiReg, NVT, InFlag);
4778
868
      InFlag = Result.getValue(2);
4779
868
      ReplaceUses(SDValue(Node, 1), Result);
4780
868
      LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
4781
868
                 dbgs() << '\n');
4782
868
    }
4783
1.76k
    CurDAG->RemoveDeadNode(Node);
4784
1.76k
    return;
4785
1.76k
  }
4786
1.76k
4787
95.8k
  case X86ISD::CMP: {
4788
95.8k
    SDValue N0 = Node->getOperand(0);
4789
95.8k
    SDValue N1 = Node->getOperand(1);
4790
95.8k
4791
95.8k
    // Optimizations for TEST compares.
4792
95.8k
    if (!isNullConstant(N1))
4793
1.81k
      break;
4794
94.0k
4795
94.0k
    // Save the original VT of the compare.
4796
94.0k
    MVT CmpVT = N0.getSimpleValueType();
4797
94.0k
4798
94.0k
    // If we are comparing (and (shr X, C, Mask) with 0, emit a BEXTR followed
4799
94.0k
    // by a test instruction. The test should be removed later by
4800
94.0k
    // analyzeCompare if we are using only the zero flag.
4801
94.0k
    // TODO: Should we check the users and use the BEXTR flags directly?
4802
94.0k
    if (N0.getOpcode() == ISD::AND && 
N0.hasOneUse()16.4k
) {
4803
16.3k
      if (MachineSDNode *NewNode = matchBEXTRFromAndImm(N0.getNode())) {
4804
2
        unsigned TestOpc = CmpVT == MVT::i64 ? 
X86::TEST64rr0
4805
2
                                             : X86::TEST32rr;
4806
2
        SDValue BEXTR = SDValue(NewNode, 0);
4807
2
        NewNode = CurDAG->getMachineNode(TestOpc, dl, MVT::i32, BEXTR, BEXTR);
4808
2
        ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
4809
2
        CurDAG->RemoveDeadNode(Node);
4810
2
        return;
4811
2
      }
4812
93.9k
    }
4813
93.9k
4814
93.9k
    // We can peek through truncates, but we need to be careful below.
4815
93.9k
    if (N0.getOpcode() == ISD::TRUNCATE && 
N0.hasOneUse()771
)
4816
627
      N0 = N0.getOperand(0);
4817
93.9k
4818
93.9k
    // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
4819
93.9k
    // use a smaller encoding.
4820
93.9k
    // Look past the truncate if CMP is the only use of it.
4821
93.9k
    if (N0.getOpcode() == ISD::AND &&
4822
93.9k
        
N0.getNode()->hasOneUse()16.5k
&&
4823
93.9k
        
N0.getValueType() != MVT::i816.3k
) {
4824
6.07k
      ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
4825
6.07k
      if (!C) 
break3.02k
;
4826
3.04k
      uint64_t Mask = C->getZExtValue();
4827
3.04k
4828
3.04k
      // Check if we can replace AND+IMM64 with a shift. This is possible for
4829
3.04k
      // masks/ like 0xFF000000 or 0x00FFFFFF and if we care only about the zero
4830
3.04k
      // flag.
4831
3.04k
      if (CmpVT == MVT::i64 && 
!isInt<32>(Mask)2.08k
&&
4832
3.04k
          
onlyUsesZeroFlag(SDValue(Node, 0))1.32k
) {
4833
1.32k
        if (isMask_64(~Mask)) {
4834
185
          unsigned TrailingZeros = countTrailingZeros(Mask);
4835
185
          SDValue Imm = CurDAG->getTargetConstant(TrailingZeros, dl, MVT::i64);
4836
185
          SDValue Shift =
4837
185
            SDValue(CurDAG->getMachineNode(X86::SHR64ri, dl, MVT::i64, MVT::i32,
4838
185
                                           N0.getOperand(0), Imm), 0);
4839
185
          MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl,
4840
185
                                                       MVT::i32, Shift, Shift);
4841
185
          ReplaceNode(Node, Test);
4842
185
          return;
4843
185
        }
4844
1.14k
        if (isMask_64(Mask)) {
4845
4
          unsigned LeadingZeros = countLeadingZeros(Mask);
4846
4
          SDValue Imm = CurDAG->getTargetConstant(LeadingZeros, dl, MVT::i64);
4847
4
          SDValue Shift =
4848
4
            SDValue(CurDAG->getMachineNode(X86::SHL64ri, dl, MVT::i64, MVT::i32,
4849
4
                                           N0.getOperand(0), Imm), 0);
4850
4
          MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl,
4851
4
                                                       MVT::i32, Shift, Shift);
4852
4
          ReplaceNode(Node, Test);
4853
4
          return;
4854
4
        }
4855
2.85k
      }
4856
2.85k
4857
2.85k
      MVT VT;
4858
2.85k
      int SubRegOp;
4859
2.85k
      unsigned ROpc, MOpc;
4860
2.85k
4861
2.85k
      // For each of these checks we need to be careful if the sign flag is
4862
2.85k
      // being used. It is only safe to use the sign flag in two conditions,
4863
2.85k
      // either the sign bit in the shrunken mask is zero or the final test
4864
2.85k
      // size is equal to the original compare size.
4865
2.85k
4866
2.85k
      if (isUInt<8>(Mask) &&
4867
2.85k
          
(813
!(Mask & 0x80)813
||
CmpVT == MVT::i815
||
4868
813
           
hasNoSignFlagUses(SDValue(Node, 0))15
)) {
4869
811
        // For example, convert "testl %eax, $8" to "testb %al, $8"
4870
811
        VT = MVT::i8;
4871
811
        SubRegOp = X86::sub_8bit;
4872
811
        ROpc = X86::TEST8ri;
4873
811
        MOpc = X86::TEST8mi;
4874
2.04k
      } else if (OptForMinSize && 
isUInt<16>(Mask)21
&&
4875
2.04k
                 
(19
!(Mask & 0x8000)19
||
CmpVT == MVT::i1616
||
4876
19
                  
hasNoSignFlagUses(SDValue(Node, 0))10
)) {
4877
19
        // For example, "testl %eax, $32776" to "testw %ax, $32776".
4878
19
        // NOTE: We only want to form TESTW instructions if optimizing for
4879
19
        // min size. Otherwise we only save one byte and possibly get a length
4880
19
        // changing prefix penalty in the decoders.
4881
19
        VT = MVT::i16;
4882
19
        SubRegOp = X86::sub_16bit;
4883
19
        ROpc = X86::TEST16ri;
4884
19
        MOpc = X86::TEST16mi;
4885
2.02k
      } else if (isUInt<32>(Mask) && 
N0.getValueType() != MVT::i16877
&&
4886
2.02k
                 
(877
(877
!(Mask & 0x80000000)877
&&
4887
877
                   // Without minsize 16-bit Cmps can get here so we need to
4888
877
                   // be sure we calculate the correct sign flag if needed.
4889
877
                   
(825
CmpVT != MVT::i16825
||
!(Mask & 0x8000)21
)) ||
4890
877
                  
CmpVT == MVT::i3258
||
4891
877
                  
hasNoSignFlagUses(SDValue(Node, 0))10
)) {
4892
871
        // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
4893
871
        // NOTE: We only want to run that transform if N0 is 32 or 64 bits.
4894
871
        // Otherwize, we find ourselves in a position where we have to do
4895
871
        // promotion. If previous passes did not promote the and, we assume
4896
871
        // they had a good reason not to and do not promote here.
4897
871
        VT = MVT::i32;
4898
871
        SubRegOp = X86::sub_32bit;
4899
871
        ROpc = X86::TEST32ri;
4900
871
        MOpc = X86::TEST32mi;
4901
1.15k
      } else {
4902
1.15k
        // No eligible transformation was found.
4903
1.15k
        break;
4904
1.15k
      }
4905
1.70k
4906
1.70k
      SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT);
4907
1.70k
      SDValue Reg = N0.getOperand(0);
4908
1.70k
4909
1.70k
      // Emit a testl or testw.
4910
1.70k
      MachineSDNode *NewNode;
4911
1.70k
      SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
4912
1.70k
      if (tryFoldLoad(Node, N0.getNode(), Reg, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
4913
60
        SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
4914
60
                          Reg.getOperand(0) };
4915
60
        NewNode = CurDAG->getMachineNode(MOpc, dl, MVT::i32, MVT::Other, Ops);
4916
60
        // Update the chain.
4917
60
        ReplaceUses(Reg.getValue(1), SDValue(NewNode, 1));
4918
60
        // Record the mem-refs
4919
60
        CurDAG->setNodeMemRefs(NewNode,
4920
60
                               {cast<LoadSDNode>(Reg)->getMemOperand()});
4921
1.64k
      } else {
4922
1.64k
        // Extract the subregister if necessary.
4923
1.64k
        if (N0.getValueType() != VT)
4924
1.36k
          Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg);
4925
1.64k
4926
1.64k
        NewNode = CurDAG->getMachineNode(ROpc, dl, MVT::i32, Reg, Imm);
4927
1.64k
      }
4928
1.70k
      // Replace CMP with TEST.
4929
1.70k
      ReplaceNode(Node, NewNode);
4930
1.70k
      return;
4931
1.70k
    }
4932
87.9k
    break;
4933
87.9k
  }
4934
87.9k
  case X86ISD::PCMPISTR: {
4935
134
    if (!Subtarget->hasSSE42())
4936
0
      break;
4937
134
4938
134
    bool NeedIndex = !SDValue(Node, 0).use_empty();
4939
134
    bool NeedMask = !SDValue(Node, 1).use_empty();
4940
134
    // We can't fold a load if we are going to make two instructions.
4941
134
    bool MayFoldLoad = !NeedIndex || 
!NeedMask44
;
4942
134
4943
134
    MachineSDNode *CNode;
4944
134
    if (NeedMask) {
4945
28
      unsigned ROpc = Subtarget->hasAVX() ? 
X86::VPCMPISTRMrr13
:
X86::PCMPISTRMrr15
;
4946
28
      unsigned MOpc = Subtarget->hasAVX() ? 
X86::VPCMPISTRMrm13
:
X86::PCMPISTRMrm15
;
4947
28
      CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node);
4948
28
      ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
4949
28
    }
4950
134
    if (NeedIndex || 
!NeedMask