Coverage Report

Created: 2019-07-24 05:18

/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
Line | Count | Source
1
//===-- SystemZTargetTransformInfo.cpp - SystemZ-specific TTI -------------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
// This file implements a TargetTransformInfo analysis pass specific to the
10
// SystemZ target machine. It uses the target's detailed information to provide
11
// more precise answers to certain TTI queries, while letting the target
12
// independent and default TTI implementations handle the rest.
13
//
14
//===----------------------------------------------------------------------===//
15
16
#include "SystemZTargetTransformInfo.h"
17
#include "llvm/Analysis/TargetTransformInfo.h"
18
#include "llvm/CodeGen/BasicTTIImpl.h"
19
#include "llvm/CodeGen/CostTable.h"
20
#include "llvm/CodeGen/TargetLowering.h"
21
#include "llvm/IR/IntrinsicInst.h"
22
#include "llvm/Support/Debug.h"
23
using namespace llvm;
24
25
#define DEBUG_TYPE "systemztti"
26
27
//===----------------------------------------------------------------------===//
28
//
29
// SystemZ cost model.
30
//
31
//===----------------------------------------------------------------------===//
32
33
945
int SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
34
945
  assert(Ty->isIntegerTy());
35
945
36
945
  unsigned BitSize = Ty->getPrimitiveSizeInBits();
37
945
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
38
945
  // here, so that constant hoisting will ignore this constant.
39
945
  if (BitSize == 0)
40
0
    return TTI::TCC_Free;
41
945
  // No cost model for operations on integers larger than 64 bits is implemented yet.
42
945
  if (BitSize > 64)
43
0
    return TTI::TCC_Free;
44
945
45
945
  if (Imm == 0)
46
228
    return TTI::TCC_Free;
47
717
48
717
  if (Imm.getBitWidth() <= 64) {
49
717
    // Constants loaded via lgfi.
50
717
    if (isInt<32>(Imm.getSExtValue()))
51
645
      return TTI::TCC_Basic;
52
72
    // Constants loaded via llilf.
53
72
    if (isUInt<32>(Imm.getZExtValue()))
54
11
      return TTI::TCC_Basic;
55
61
    // Constants loaded via llihf:
56
61
    if ((Imm.getZExtValue() & 0xffffffff) == 0)
57
30
      return TTI::TCC_Basic;
58
31
59
31
    return 2 * TTI::TCC_Basic;
60
31
  }
61
0
62
0
  return 4 * TTI::TCC_Basic;
63
0
}
64
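A side note on the table above: it mirrors the SystemZ immediate-load instructions named in the comments — lgfi covers signed 32-bit values, llilf unsigned 32-bit values, llihf values whose low 32 bits are zero — and any other 64-bit constant needs two instructions, while wider constants fall back to 4 * TCC_Basic. A minimal standalone sketch of that classification (illustrative only, not code from this file; the helper name is made up):

    #include <cstdint>

    // Rough mirror of getIntImmCost(const APInt &, Type *) for a 64-bit
    // immediate: number of instructions needed to materialize the constant.
    static int immLoadCost(int64_t Imm) {
      if (Imm == 0)
        return 0;                                   // TCC_Free
      if (Imm >= INT32_MIN && Imm <= INT32_MAX)
        return 1;                                   // lgfi
      uint64_t U = static_cast<uint64_t>(Imm);
      if (U <= UINT32_MAX)
        return 1;                                   // llilf
      if ((U & 0xffffffffULL) == 0)
        return 1;                                   // llihf
      return 2;                                     // e.g. llihf + oilf
    }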
65
int SystemZTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx,
66
8.66k
                                  const APInt &Imm, Type *Ty) {
67
8.66k
  assert(Ty->isIntegerTy());
68
8.66k
69
8.66k
  unsigned BitSize = Ty->getPrimitiveSizeInBits();
70
8.66k
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
71
8.66k
  // here, so that constant hoisting will ignore this constant.
72
8.66k
  if (BitSize == 0)
73
0
    return TTI::TCC_Free;
74
8.66k
  // No cost model for operations on integers larger than 64 bits is implemented yet.
75
8.66k
  if (BitSize > 64)
76
111
    return TTI::TCC_Free;
77
8.55k
78
8.55k
  switch (Opcode) {
79
8.55k
  default:
80
1.29k
    return TTI::TCC_Free;
81
8.55k
  case Instruction::GetElementPtr:
82
2.84k
    // Always hoist the base address of a GetElementPtr. This prevents the
83
2.84k
    // creation of new constants for every base constant that gets constant
84
2.84k
    // folded with the offset.
85
2.84k
    if (Idx == 0)
86
0
      return 2 * TTI::TCC_Basic;
87
2.84k
    return TTI::TCC_Free;
88
2.84k
  case Instruction::Store:
89
396
    if (Idx == 0 && Imm.getBitWidth() <= 64) {
90
396
      // Any 8-bit immediate store can be implemented via mvi.
91
396
      if (BitSize == 8)
92
102
        return TTI::TCC_Free;
93
294
      // 16-bit immediate values can be stored via mvhhi/mvhi/mvghi.
94
294
      if (isInt<16>(Imm.getSExtValue()))
95
288
        return TTI::TCC_Free;
96
6
    }
97
6
    break;
98
1.41k
  case Instruction::ICmp:
99
1.41k
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
100
1.40k
      // Comparisons against signed 32-bit immediates implemented via cgfi.
101
1.40k
      if (isInt<32>(Imm.getSExtValue()))
102
1.37k
        return TTI::TCC_Free;
103
25
      // Comparisons against unsigned 32-bit immediates implemented via clgfi.
104
25
      if (isUInt<32>(Imm.getZExtValue()))
105
6
        return TTI::TCC_Free;
106
30
    }
107
30
    break;
108
827
  case Instruction::Add:
109
827
  case Instruction::Sub:
110
827
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
111
728
      // We use algfi/slgfi to add/subtract 32-bit unsigned immediates.
112
728
      if (isUInt<32>(Imm.getZExtValue()))
113
667
        return TTI::TCC_Free;
114
61
      // Or their negation, by swapping addition vs. subtraction.
115
61
      if (isUInt<32>(-Imm.getSExtValue()))
116
59
        return TTI::TCC_Free;
117
101
    }
118
101
    break;
119
101
  case Instruction::Mul:
120
35
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
121
34
      // We use msgfi to multiply by 32-bit signed immediates.
122
34
      if (isInt<32>(Imm.getSExtValue()))
123
31
        return TTI::TCC_Free;
124
4
    }
125
4
    break;
126
191
  case Instruction::Or:
127
191
  case Instruction::Xor:
128
191
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
129
183
      // Masks supported by oilf/xilf.
130
183
      if (isUInt<32>(Imm.getZExtValue()))
131
135
        return TTI::TCC_Free;
132
48
      // Masks supported by oihf/xihf.
133
48
      if ((Imm.getZExtValue() & 0xffffffff) == 0)
134
21
        return TTI::TCC_Free;
135
35
    }
136
35
    break;
137
499
  case Instruction::And:
138
499
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
139
499
      // Any 32-bit AND operation can be implemented via nilf.
140
499
      if (BitSize <= 32)
141
279
        return TTI::TCC_Free;
142
220
      // 64-bit masks supported by nilf.
143
220
      if (isUInt<32>(~Imm.getZExtValue()))
144
81
        return TTI::TCC_Free;
145
139
      // 64-bit masks supported by nilh.
146
139
      if ((Imm.getZExtValue() & 0xffffffff) == 0xffffffff)
147
33
        return TTI::TCC_Free;
148
106
      // Some 64-bit AND operations can be implemented via risbg.
149
106
      const SystemZInstrInfo *TII = ST->getInstrInfo();
150
106
      unsigned Start, End;
151
106
      if (TII->isRxSBGMask(Imm.getZExtValue(), BitSize, Start, End))
152
102
        return TTI::TCC_Free;
153
4
    }
154
4
    break;
155
299
  case Instruction::Shl:
156
299
  case Instruction::LShr:
157
299
  case Instruction::AShr:
158
299
    // Always return TCC_Free for the shift value of a shift instruction.
159
299
    if (Idx == 1)
160
298
      return TTI::TCC_Free;
161
1
    break;
162
760
  case Instruction::UDiv:
163
760
  case Instruction::SDiv:
164
760
  case Instruction::URem:
165
760
  case Instruction::SRem:
166
760
  case Instruction::Trunc:
167
760
  case Instruction::ZExt:
168
760
  case Instruction::SExt:
169
760
  case Instruction::IntToPtr:
170
760
  case Instruction::PtrToInt:
171
760
  case Instruction::BitCast:
172
760
  case Instruction::PHI:
173
760
  case Instruction::Call:
174
760
  case Instruction::Select:
175
760
  case Instruction::Ret:
176
760
  case Instruction::Load:
177
760
    break;
178
941
  }
179
941
180
941
  return SystemZTTIImpl::getIntImmCost(Imm, Ty);
181
941
}
182
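Two quick examples of how this table plays out: an i64 'and' with the mask 0xffffffff00000000 is reported as TCC_Free because the complement of the mask fits in 32 bits (handled by nilf/risbg), and an i64 compare against 123456 is free because the immediate fits the signed 32-bit field of cgfi; a multiply by a constant that needs more than 32 signed bits falls through to the generic materialization cost computed above.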
183
int SystemZTTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
184
1.63k
                                  const APInt &Imm, Type *Ty) {
185
1.63k
  assert(Ty->isIntegerTy());
186
1.63k
187
1.63k
  unsigned BitSize = Ty->getPrimitiveSizeInBits();
188
1.63k
  // There is no cost model for constants with a bit size of 0. Return TCC_Free
189
1.63k
  // here, so that constant hoisting will ignore this constant.
190
1.63k
  if (BitSize == 0)
191
0
    return TTI::TCC_Free;
192
1.63k
  // No cost model for operations on integers larger than 64 bits is implemented yet.
193
1.63k
  if (BitSize > 64)
194
0
    return TTI::TCC_Free;
195
1.63k
196
1.63k
  switch (IID) {
197
1.63k
  default:
198
977
    return TTI::TCC_Free;
199
1.63k
  case Intrinsic::sadd_with_overflow:
200
488
  case Intrinsic::uadd_with_overflow:
201
488
  case Intrinsic::ssub_with_overflow:
202
488
  case Intrinsic::usub_with_overflow:
203
488
    // These get expanded to include a normal addition/subtraction.
204
488
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
205
488
      if (isUInt<32>(Imm.getZExtValue()))
206
412
        return TTI::TCC_Free;
207
76
      if (isUInt<32>(-Imm.getSExtValue()))
208
72
        return TTI::TCC_Free;
209
4
    }
210
4
    break;
211
4
  case Intrinsic::smul_with_overflow:
212
0
  case Intrinsic::umul_with_overflow:
213
0
    // These get expanded to include a normal multiplication.
214
0
    if (Idx == 1 && Imm.getBitWidth() <= 64) {
215
0
      if (isInt<32>(Imm.getSExtValue()))
216
0
        return TTI::TCC_Free;
217
0
    }
218
0
    break;
219
57
  case Intrinsic::experimental_stackmap:
220
57
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
221
57
      return TTI::TCC_Free;
222
0
    break;
223
110
  case Intrinsic::experimental_patchpoint_void:
224
110
  case Intrinsic::experimental_patchpoint_i64:
225
110
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
226
110
      return TTI::TCC_Free;
227
0
    break;
228
4
  }
229
4
  return SystemZTTIImpl::getIntImmCost(Imm, Ty);
230
4
}
231
232
TargetTransformInfo::PopcntSupportKind
233
1
SystemZTTIImpl::getPopcntSupport(unsigned TyWidth) {
234
1
  assert(isPowerOf2_32(TyWidth) && "Type width must be power of 2");
235
1
  if (ST->hasPopulationCount() && TyWidth <= 64)
236
1
    return TTI::PSK_FastHardware;
237
0
  return TTI::PSK_Software;
238
0
}
239
240
void SystemZTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
241
2
                                             TTI::UnrollingPreferences &UP) {
242
2
  // Find out if L contains a call, what the machine instruction count
243
2
  // estimate is, and how many stores there are.
244
2
  bool HasCall = false;
245
2
  unsigned NumStores = 0;
246
2
  for (auto &BB : L->blocks())
247
10
    for (auto &I : *BB) {
248
10
      if (isa<CallInst>(&I) || isa<InvokeInst>(&I)) {
249
2
        ImmutableCallSite CS(&I);
250
2
        if (const Function *F = CS.getCalledFunction()) {
251
2
          if (isLoweredToCall(F))
252
0
            HasCall = true;
253
2
          if (F->getIntrinsicID() == Intrinsic::memcpy ||
254
2
              F->getIntrinsicID() == Intrinsic::memset)
255
0
            NumStores++;
256
2
        } else { // indirect call.
257
0
          HasCall = true;
258
0
        }
259
2
      }
260
10
      if (isa<StoreInst>(&I)) {
261
0
        Type *MemAccessTy = I.getOperand(0)->getType();
262
0
        NumStores += getMemoryOpCost(Instruction::Store, MemAccessTy, 0, 0);
263
0
      }
264
10
    }
265
2
266
2
  // The z13 processor will run out of store tags if too many stores
267
2
  // are fed into it too quickly. Therefore make sure there are not
268
2
  // too many stores in the resulting unrolled loop.
269
2
  unsigned const Max = (NumStores ? (12 / NumStores) : UINT_MAX);
270
2
271
2
  if (HasCall) {
272
0
    // Only allow full unrolling if the loop has any calls.
273
0
    UP.FullUnrollMaxCount = Max;
274
0
    UP.MaxCount = 1;
275
0
    return;
276
0
  }
277
2
278
2
  UP.MaxCount = Max;
279
2
  if (UP.MaxCount <= 1)
280
0
    return;
281
2
282
2
  // Allow partial and runtime trip count unrolling.
283
2
  UP.Partial = UP.Runtime = true;
284
2
285
2
  UP.PartialThreshold = 75;
286
2
  UP.DefaultUnrollRuntimeCount = 4;
287
2
288
2
  // Allow expensive instructions in the pre-header of the loop.
289
2
  UP.AllowExpensiveTripCount = true;
290
2
291
2
  UP.Force = true;
292
2
}
293
294
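Concretely: a loop body with three stores gets Max = 12 / 3 = 4, so partial and runtime unrolling are capped at four copies of the body; with no stores the cap is UINT_MAX and only PartialThreshold = 75 limits growth, and a loop containing a call is never partially unrolled (MaxCount = 1), although full unrolling up to Max remains allowed.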
295
bool SystemZTTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
296
429
                                   TargetTransformInfo::LSRCost &C2) {
297
429
  // SystemZ specific: check instruction count (first), and don't care about
298
429
  // ImmCost, since offsets are checked explicitly.
299
429
  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
300
429
                  C1.NumIVMuls, C1.NumBaseAdds,
301
429
                  C1.ScaleCost, C1.SetupCost) <
302
429
    std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
303
429
             C2.NumIVMuls, C2.NumBaseAdds,
304
429
             C2.ScaleCost, C2.SetupCost);
305
429
}
306
307
883
unsigned SystemZTTIImpl::getNumberOfRegisters(bool Vector) {
308
883
  if (!Vector)
309
809
    // Discount the stack pointer.  Also leave out %r0, since it can't
310
809
    // be used in an address.
311
809
    return 14;
312
74
  if (ST->hasVector())
313
56
    return 32;
314
18
  return 0;
315
18
}
316
317
10
unsigned SystemZTTIImpl::getRegisterBitWidth(bool Vector) const {
318
10
  if (!Vector)
319
0
    return 64;
320
10
  if (ST->hasVector())
321
10
    return 128;
322
0
  return 0;
323
0
}
324
325
0
bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
326
0
  EVT VT = TLI->getValueType(DL, DataType);
327
0
  return (VT.isScalarInteger() && TLI->isTypeLegal(VT));
328
0
}
329
330
// Return the bit size for the scalar type or vector element
331
// type. getScalarSizeInBits() returns 0 for a pointer type.
332
5.09k
static unsigned getScalarSizeInBits(Type *Ty) {
333
5.09k
  unsigned Size =
334
5.09k
    (Ty->isPtrOrPtrVectorTy() ? 64U : Ty->getScalarSizeInBits());
335
5.09k
  assert(Size > 0 && "Element must have non-zero size.");
336
5.09k
  return Size;
337
5.09k
}
338
339
// getNumberOfParts() calls getTypeLegalizationCost() which splits the vector
340
// type until it is legal. This would e.g. return 4 for <6 x i64>, instead of
341
// 3.
342
2.63k
static unsigned getNumVectorRegs(Type *Ty) {
343
2.63k
  assert(Ty->isVectorTy() && "Expected vector type");
344
2.63k
  unsigned WideBits = getScalarSizeInBits(Ty) * Ty->getVectorNumElements();
345
2.63k
  assert(WideBits > 0 && "Could not compute size of vector");
346
2.63k
  return ((WideBits % 128U) ? ((WideBits / 128U) + 1) : (WideBits / 128U));
347
2.63k
}
348
349
int SystemZTTIImpl::getArithmeticInstrCost(
350
    unsigned Opcode, Type *Ty,
351
    TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
352
    TTI::OperandValueProperties Opd1PropInfo,
353
    TTI::OperandValueProperties Opd2PropInfo,
354
900
    ArrayRef<const Value *> Args) {
355
900
356
900
  // TODO: return a good value for BB-VECTORIZER that includes the
357
900
  // immediate loads, which we do not want to count for the loop
358
900
  // vectorizer, since they are hopefully hoisted out of the loop. This
359
900
  // would require a new parameter 'InLoop', but not sure if constant
360
900
  // args are common enough to motivate this.
361
900
362
900
  unsigned ScalarBits = Ty->getScalarSizeInBits();
363
900
364
900
  // There are three cases of division and remainder: Dividing with a register
365
900
  // needs a divide instruction. A divisor which is a power of two constant
366
900
  // can be implemented with a sequence of shifts. Any other constant needs a
367
900
  // multiply and shifts.
368
900
  const unsigned DivInstrCost = 20;
369
900
  const unsigned DivMulSeqCost = 10;
370
900
  const unsigned SDivPow2Cost = 4;
371
900
372
900
  bool SignedDivRem =
373
900
      Opcode == Instruction::SDiv || Opcode == Instruction::SRem;
374
900
  bool UnsignedDivRem =
375
900
      Opcode == Instruction::UDiv || Opcode == Instruction::URem;
376
900
377
900
  // Check for a constant divisor.
378
900
  bool DivRemConst = false;
379
900
  bool DivRemConstPow2 = false;
380
900
  if ((SignedDivRem || UnsignedDivRem) && Args.size() == 2) {
381
192
    if (const Constant *C = dyn_cast<Constant>(Args[1])) {
382
114
      const ConstantInt *CVal =
383
114
          (C->getType()->isVectorTy()
384
114
               ? dyn_cast_or_null<const ConstantInt>(C->getSplatValue())
385
114
               : dyn_cast<const ConstantInt>(C));
386
114
      if (CVal != nullptr &&
387
114
          (CVal->getValue().isPowerOf2() || (-CVal->getValue()).isPowerOf2()))
388
60
        DivRemConstPow2 = true;
389
54
      else
390
54
        DivRemConst = true;
391
114
    }
392
192
  }
393
900
394
900
  if (Ty->isVectorTy()) {
395
303
    assert(ST->hasVector() &&
396
303
           "getArithmeticInstrCost() called with vector type.");
397
303
    unsigned VF = Ty->getVectorNumElements();
398
303
    unsigned NumVectors = getNumVectorRegs(Ty);
399
303
400
303
    // These vector operations are custom handled, but are still supported
401
303
    // with one instruction per vector, regardless of element size.
402
303
    if (Opcode == Instruction::Shl || Opcode == Instruction::LShr ||
403
303
        Opcode == Instruction::AShr) {
404
49
      return NumVectors;
405
49
    }
406
254
407
254
    if (DivRemConstPow2)
408
36
      return (NumVectors * (SignedDivRem ? SDivPow2Cost : 1));
409
218
    if (DivRemConst)
410
28
      return VF * DivMulSeqCost + getScalarizationOverhead(Ty, Args);
411
190
    if ((SignedDivRem || UnsignedDivRem) && VF > 4)
412
12
      // Temporary hack: disable high vectorization factors with integer
413
12
      // division/remainder, which will get scalarized and handled with
414
12
      // GR128 registers. The mischeduler is not clever enough to avoid
415
12
      // spilling yet.
416
12
      return 1000;
417
178
418
178
    // These FP operations are supported with a single vector instruction for
419
178
    // double (base implementation assumes float generally costs 2). For
420
178
    // FP128, the scalar cost is 1, and there is no overhead since the values
421
178
    // are already in scalar registers.
422
178
    if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
423
178
        Opcode == Instruction::FMul || Opcode == Instruction::FDiv) {
424
65
      switch (ScalarBits) {
425
65
      case 32: {
426
32
        // The vector enhancements facility 1 provides v4f32 instructions.
427
32
        if (ST->hasVectorEnhancements1())
428
16
          return NumVectors;
429
16
        // Return the cost of multiple scalar invocation plus the cost of
430
16
        // inserting and extracting the values.
431
16
        unsigned ScalarCost =
432
16
            getArithmeticInstrCost(Opcode, Ty->getScalarType());
433
16
        unsigned Cost = (VF * ScalarCost) + getScalarizationOverhead(Ty, Args);
434
16
        // FIXME: VF 2 for these FP operations are currently just as
435
16
        // expensive as for VF 4.
436
16
        if (VF == 2)
437
4
          Cost *= 2;
438
16
        return Cost;
439
16
      }
440
33
      case 64:
441
33
      case 128:
442
33
        return NumVectors;
443
33
      default:
444
0
        break;
445
113
      }
446
113
    }
447
113
448
113
    // There is no native support for FRem.
449
113
    if (Opcode == Instruction::FRem) {
450
0
      unsigned Cost = (VF * LIBCALL_COST) + getScalarizationOverhead(Ty, Args);
451
0
      // FIXME: VF 2 for float is currently just as expensive as for VF 4.
452
0
      if (VF == 2 && ScalarBits == 32)
453
0
        Cost *= 2;
454
0
      return Cost;
455
0
    }
456
597
  }
457
597
  else {  // Scalar:
458
597
    // These FP operations are supported with a dedicated instruction for
459
597
    // float, double and fp128 (base implementation assumes float generally
460
597
    // costs 2).
461
597
    if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
462
597
        Opcode == Instruction::FMul || Opcode == Instruction::FDiv)
463
40
      return 1;
464
557
465
557
    // There is no native support for FRem.
466
557
    if (Opcode == Instruction::FRem)
467
0
      return LIBCALL_COST;
468
557
469
557
    // Give discount for some combined logical operations if supported.
470
557
    if (Args.size() == 2 && ST->hasMiscellaneousExtensions3()) {
471
20
      if (Opcode == Instruction::Xor) {
472
18
        for (const Value *A : Args) {
473
18
          if (const Instruction *I = dyn_cast<Instruction>(A))
474
12
            if (I->hasOneUse() &&
475
12
                (I->getOpcode() == Instruction::And ||
476
12
                 I->getOpcode() == Instruction::Or ||
477
12
                 I->getOpcode() == Instruction::Xor))
478
6
              return 0;
479
18
        }
480
12
      }
481
8
      else if (Opcode == Instruction::Or || Opcode == Instruction::And) {
482
16
        for (const Value *A : Args) {
483
16
          if (const Instruction *I = dyn_cast<Instruction>(A))
484
8
            if (I->hasOneUse() && I->getOpcode() == Instruction::Xor)
485
4
              return 0;
486
16
        }
487
8
      }
488
20
    }
489
557
490
557
    // Or requires one instruction, although it has custom handling for i64.
491
557
    if (Opcode == Instruction::Or)
492
94
      return 1;
493
453
494
453
    if (Opcode == Instruction::Xor && ScalarBits == 1) {
495
72
      if (ST->hasLoadStoreOnCond2())
496
36
        return 5; // 2 * (li 0; loc 1); xor
497
36
      return 7; // 2 * ipm sequences ; xor ; shift ; compare
498
36
    }
499
381
500
381
    if (DivRemConstPow2)
501
24
      return (SignedDivRem ? SDivPow2Cost : 1);
502
357
    if (DivRemConst)
503
26
      return DivMulSeqCost;
504
331
    if (SignedDivRem || UnsignedDivRem)
505
66
      return DivInstrCost;
506
378
  }
507
378
508
378
  // Fallback to the default implementation.
509
378
  return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
510
378
                                       Opd1PropInfo, Opd2PropInfo, Args);
511
378
}
512
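Two worked examples of the divisor handling above: an sdiv of <2 x i64> by the constant 4 takes the DivRemConstPow2 path and costs NumVectors * SDivPow2Cost = 1 * 4 = 4, while dividing the same vector by a non-power-of-two constant such as 7 costs VF * DivMulSeqCost = 2 * 10 = 20 plus the scalarization overhead; a scalar division by a non-constant value is charged DivInstrCost = 20.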
513
int SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
514
40
                                   Type *SubTp) {
515
40
  assert (Tp->isVectorTy());
516
40
  assert (ST->hasVector() && "getShuffleCost() called.");
517
40
  unsigned NumVectors = getNumVectorRegs(Tp);
518
40
519
40
  // TODO: Since fp32 is expanded, the shuffle cost should always be 0.
520
40
521
40
  // FP128 values are always in scalar registers, so there is no work
522
40
  // involved with a shuffle, except for broadcast. In that case register
523
40
  // moves are done with a single instruction per element.
524
40
  if (Tp->getScalarType()->isFP128Ty())
525
0
    return (Kind == TargetTransformInfo::SK_Broadcast ? NumVectors - 1 : 0);
526
40
527
40
  switch (Kind) {
528
40
  case  TargetTransformInfo::SK_ExtractSubvector:
529
0
    // ExtractSubvector Index indicates start offset.
530
0
531
0
    // Extracting a subvector from first index is a noop.
532
0
    return (Index == 0 ? 0 : NumVectors);
533
40
534
40
  case TargetTransformInfo::SK_Broadcast:
535
10
    // Loop vectorizer calls here to figure out the extra cost of
536
10
    // broadcasting a loaded value to all elements of a vector. Since vlrep
537
10
    // loads and replicates with a single instruction, adjust the returned
538
10
    // value.
539
10
    return NumVectors - 1;
540
40
541
40
  default:
542
30
543
30
    // SystemZ supports single instruction permutation / replication.
544
30
    return NumVectors;
545
0
  }
546
0
547
0
  return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
548
0
}
549
550
// Return the log2 difference of the element sizes of the two vector types.
551
449
static unsigned getElSizeLog2Diff(Type *Ty0, Type *Ty1) {
552
449
  unsigned Bits0 = Ty0->getScalarSizeInBits();
553
449
  unsigned Bits1 = Ty1->getScalarSizeInBits();
554
449
555
449
  if (Bits1 >  Bits0)
556
159
    return (Log2_32(Bits1) - Log2_32(Bits0));
557
290
558
290
  return (Log2_32(Bits0) - Log2_32(Bits1));
559
290
}
560
561
// Return the number of instructions needed to truncate SrcTy to DstTy.
562
unsigned SystemZTTIImpl::
563
164
getVectorTruncCost(Type *SrcTy, Type *DstTy) {
564
164
  assert (SrcTy->isVectorTy() && DstTy->isVectorTy());
565
164
  assert (SrcTy->getPrimitiveSizeInBits() > DstTy->getPrimitiveSizeInBits() &&
566
164
          "Packing must reduce size of vector type.");
567
164
  assert (SrcTy->getVectorNumElements() == DstTy->getVectorNumElements() &&
568
164
          "Packing should not change number of elements.");
569
164
570
164
  // TODO: Since fp32 is expanded, the extract cost should always be 0.
571
164
572
164
  unsigned NumParts = getNumVectorRegs(SrcTy);
573
164
  if (NumParts <= 2)
574
104
    // Up to 2 vector registers can be truncated efficiently with pack or
575
104
    // permute. The latter requires an immediate mask to be loaded, which
576
104
    // typically gets hoisted out of a loop.  TODO: return a good value for
577
104
    // BB-VECTORIZER that includes the immediate loads, which we do not want
578
104
    // to count for the loop vectorizer.
579
104
    return 1;
580
60
581
60
  unsigned Cost = 0;
582
60
  unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
583
60
  unsigned VF = SrcTy->getVectorNumElements();
584
169
  for (unsigned P = 0; P < Log2Diff; ++P) {
585
109
    if (NumParts > 1)
586
102
      NumParts /= 2;
587
109
    Cost += NumParts;
588
109
  }
589
60
590
60
  // Currently, a general mix of permutes and pack instructions is output by
591
60
  // isel, which follows the cost computation above except for this case, which
592
60
  // is one instruction less:
593
60
  if (VF == 8 && SrcTy->getScalarSizeInBits() == 64 &&
594
60
      DstTy->getScalarSizeInBits() == 8)
595
7
    Cost--;
596
60
597
60
  return Cost;
598
60
}
599
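As a worked example of the loop above: truncating <8 x i64> to <8 x i8> gives NumParts = 4 and Log2Diff = 3, the three iterations add 2 + 1 + 1 = 4, and the special case at the end subtracts one, for a final cost of 3.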
600
// Return the cost of converting a vector bitmask produced by a compare
601
// (SrcTy), to the type of the select or extend instruction (DstTy).
602
unsigned SystemZTTIImpl::
603
340
getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy) {
604
340
  assert (SrcTy->isVectorTy() && DstTy->isVectorTy() &&
605
340
          "Should only be called with vector types.");
606
340
607
340
  unsigned PackCost = 0;
608
340
  unsigned SrcScalarBits = SrcTy->getScalarSizeInBits();
609
340
  unsigned DstScalarBits = DstTy->getScalarSizeInBits();
610
340
  unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
611
340
  if (SrcScalarBits > DstScalarBits)
612
140
    // The bitmask will be truncated.
613
140
    PackCost = getVectorTruncCost(SrcTy, DstTy);
614
200
  else if (SrcScalarBits < DstScalarBits) {
615
110
    unsigned DstNumParts = getNumVectorRegs(DstTy);
616
110
    // Each vector select needs its part of the bitmask unpacked.
617
110
    PackCost = Log2Diff * DstNumParts;
618
110
    // Extra cost for moving part of mask before unpacking.
619
110
    PackCost += DstNumParts - 1;
620
110
  }
621
340
622
340
  return PackCost;
623
340
}
624
625
// Return the type of the compared operands. This is needed to compute the
626
// cost for a Select / ZExt or SExt instruction.
627
388
static Type *getCmpOpsType(const Instruction *I, unsigned VF = 1) {
628
388
  Type *OpTy = nullptr;
629
388
  if (CmpInst *CI = dyn_cast<CmpInst>(I->getOperand(0)))
630
388
    OpTy = CI->getOperand(0)->getType();
631
0
  else if (Instruction *LogicI = dyn_cast<Instruction>(I->getOperand(0)))
632
0
    if (LogicI->getNumOperands() == 2)
633
0
      if (CmpInst *CI0 = dyn_cast<CmpInst>(LogicI->getOperand(0)))
634
0
        if (isa<CmpInst>(LogicI->getOperand(1)))
635
0
          OpTy = CI0->getOperand(0)->getType();
636
388
637
388
  if (OpTy != nullptr) {
638
388
    if (VF == 1) {
639
48
      assert (!OpTy->isVectorTy() && "Expected scalar type");
640
48
      return OpTy;
641
48
    }
642
340
    // Return the potentially vectorized type based on 'I' and 'VF'.  'I' may
643
340
    // be either scalar or already vectorized with a same or lesser VF.
644
340
    Type *ElTy = OpTy->getScalarType();
645
340
    return VectorType::get(ElTy, VF);
646
340
  }
647
0
648
0
  return nullptr;
649
0
}
650
651
// Get the cost of converting a boolean vector to a vector with same width
652
// and element size as Dst, plus the cost of zero extending if needed.
653
unsigned SystemZTTIImpl::
654
getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst,
655
196
                              const Instruction *I) {
656
196
  assert (Dst->isVectorTy());
657
196
  unsigned VF = Dst->getVectorNumElements();
658
196
  unsigned Cost = 0;
659
196
  // If we know the widths of the compared operands, get any cost of
660
196
  // converting it to match Dst. Otherwise assume same widths.
661
196
  Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
662
196
  if (CmpOpTy != nullptr)
663
196
    Cost = getVectorBitmaskConversionCost(CmpOpTy, Dst);
664
196
  if (Opcode == Instruction::ZExt || Opcode == Instruction::UIToFP)
665
98
    // One 'vn' per dst vector with an immediate mask.
666
98
    Cost += getNumVectorRegs(Dst);
667
196
  return Cost;
668
196
}
669
670
int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
671
1.33k
                                     const Instruction *I) {
672
1.33k
  unsigned DstScalarBits = Dst->getScalarSizeInBits();
673
1.33k
  unsigned SrcScalarBits = Src->getScalarSizeInBits();
674
1.33k
675
1.33k
  if (Src->isVectorTy()) {
676
663
    assert (ST->hasVector() && "getCastInstrCost() called with vector type.");
677
663
    assert (Dst->isVectorTy());
678
663
    unsigned VF = Src->getVectorNumElements();
679
663
    unsigned NumDstVectors = getNumVectorRegs(Dst);
680
663
    unsigned NumSrcVectors = getNumVectorRegs(Src);
681
663
682
663
    if (Opcode == Instruction::Trunc) {
683
26
      if (Src->getScalarSizeInBits() == Dst->getScalarSizeInBits())
684
2
        return 0; // Check for NOOP conversions.
685
24
      return getVectorTruncCost(Src, Dst);
686
24
    }
687
637
688
637
    if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
689
241
      if (SrcScalarBits >= 8) {
690
49
        // ZExt/SExt will be handled with one unpack per doubling of width.
691
49
        unsigned NumUnpacks = getElSizeLog2Diff(Src, Dst);
692
49
693
49
        // For types that span multiple vector registers, some additional
694
49
        // instructions are used to setup the unpacking.
695
49
        unsigned NumSrcVectorOps =
696
49
          (NumUnpacks > 1 ? (NumDstVectors - NumSrcVectors)
697
49
                          : (NumDstVectors / 2));
698
49
699
49
        return (NumUnpacks * NumDstVectors) + NumSrcVectorOps;
700
49
      }
701
192
      else if (SrcScalarBits == 1)
702
192
        return getBoolVecToIntConversionCost(Opcode, Dst, I);
703
396
    }
704
396
705
396
    if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP ||
706
396
        Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) {
707
356
      // TODO: Fix base implementation which could simplify things a bit here
708
356
      // (seems to miss on differentiating on scalar/vector types).
709
356
710
356
      // Only 64 bit vector conversions are natively supported before arch13.
711
356
      if (DstScalarBits == 64 || ST->hasVectorEnhancements2()) {
712
234
        if (SrcScalarBits == DstScalarBits)
713
48
          return NumDstVectors;
714
186
715
186
        if (SrcScalarBits == 1)
716
4
          return getBoolVecToIntConversionCost(Opcode, Dst, I) + NumDstVectors;
717
304
      }
718
304
719
304
      // Return the cost of multiple scalar invocation plus the cost of
720
304
      // inserting and extracting the values. Base implementation does not
721
304
      // realize float->int gets scalarized.
722
304
      unsigned ScalarCost = getCastInstrCost(Opcode, Dst->getScalarType(),
723
304
                                             Src->getScalarType());
724
304
      unsigned TotCost = VF * ScalarCost;
725
304
      bool NeedsInserts = true, NeedsExtracts = true;
726
304
      // FP128 registers do not get inserted or extracted.
727
304
      if (DstScalarBits == 128 &&
728
304
          (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP))
729
48
        NeedsInserts = false;
730
304
      if (SrcScalarBits == 128 &&
731
304
          (Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI))
732
48
        NeedsExtracts = false;
733
304
734
304
      TotCost += getScalarizationOverhead(Src, false, NeedsExtracts);
735
304
      TotCost += getScalarizationOverhead(Dst, NeedsInserts, false);
736
304
737
304
      // FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
738
304
      if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32)
739
4
        TotCost *= 2;
740
304
741
304
      return TotCost;
742
304
    }
743
40
744
40
    if (Opcode == Instruction::FPTrunc) {
745
20
      if (SrcScalarBits == 128)  // fp128 -> double/float + inserts of elements.
746
12
        return VF /*ldxbr/lexbr*/ + getScalarizationOverhead(Dst, true, false);
747
8
      else // double -> float
748
8
        return VF / 2 /*vledb*/ + std::max(1U, VF / 4 /*vperm*/);
749
20
    }
750
20
751
20
    if (Opcode == Instruction::FPExt) {
752
20
      if (SrcScalarBits == 32 && DstScalarBits == 64) {
753
8
        // float -> double is very rare and currently unoptimized. Instead of
754
8
        // using vldeb, which can do two at a time, all conversions are
755
8
        // scalarized.
756
8
        return VF * 2;
757
8
      }
758
12
      // -> fp128.  VF * lxdb/lxeb + extraction of elements.
759
12
      return VF + getScalarizationOverhead(Src, false, true);
760
12
    }
761
675
  }
762
675
  else { // Scalar
763
675
    assert (!Dst->isVectorTy());
764
675
765
675
    if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP) {
766
226
      if (SrcScalarBits >= 32 ||
767
226
          (I != nullptr && isa<LoadInst>(I->getOperand(0))))
768
112
        return 1;
769
114
      return SrcScalarBits > 1 ? 2 /*i8/i16 extend*/ : 5 /*branch seq.*/;
770
114
    }
771
449
772
449
    if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) &&
773
449
        Src->isIntegerTy(1)) {
774
98
      if (ST->hasLoadStoreOnCond2())
775
50
        return 2; // li 0; loc 1
776
48
777
48
      // This should be extension of a compare i1 result, which is done with
778
48
      // ipm and a varying sequence of instructions.
779
48
      unsigned Cost = 0;
780
48
      if (Opcode == Instruction::SExt)
781
24
        Cost = (DstScalarBits < 64 ? 3 : 4);
782
48
      if (Opcode == Instruction::ZExt)
783
24
        Cost = 3;
784
48
      Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr);
785
48
      if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy())
786
16
        // If operands of an fp-type were compared, this costs +1.
787
16
        Cost++;
788
48
      return Cost;
789
48
    }
790
449
  }
791
351
792
351
  return BaseT::getCastInstrCost(Opcode, Dst, Src, I);
793
351
}
794
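Two worked examples of the vector paths above: a zext of <4 x i8> to <4 x i32> needs NumUnpacks = log2(32) - log2(8) = 2 and one destination vector, so it costs 2 * 1 + 0 = 2; a sitofp of <2 x i64> to <2 x double> hits the native 64-bit conversion case and costs NumDstVectors = 1.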
795
// Scalar i8 / i16 operations will typically be made after first extending
796
// the operands to i32.
797
196
static unsigned getOperandsExtensionCost(const Instruction *I) {
798
196
  unsigned ExtCost = 0;
799
196
  for (Value *Op : I->operands())
800
392
    // A load of i8 or i16 sign/zero extends to i32.
801
392
    if (!isa<LoadInst>(Op) && !isa<ConstantInt>(Op))
802
382
      ExtCost++;
803
196
804
196
  return ExtCost;
805
196
}
806
807
int SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
808
1.34k
                                       Type *CondTy, const Instruction *I) {
809
1.34k
  if (ValTy->isVectorTy()) {
810
485
    assert (ST->hasVector() && "getCmpSelInstrCost() called with vector type.");
811
485
    unsigned VF = ValTy->getVectorNumElements();
812
485
813
485
    // Called with a compare instruction.
814
485
    if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
815
341
      unsigned PredicateExtraCost = 0;
816
341
      if (I != nullptr) {
817
341
        // Some predicates cost one or two extra instructions.
818
341
        switch (cast<CmpInst>(I)->getPredicate()) {
819
341
        case CmpInst::Predicate::ICMP_NE:
820
0
        case CmpInst::Predicate::ICMP_UGE:
821
0
        case CmpInst::Predicate::ICMP_ULE:
822
0
        case CmpInst::Predicate::ICMP_SGE:
823
0
        case CmpInst::Predicate::ICMP_SLE:
824
0
          PredicateExtraCost = 1;
825
0
          break;
826
0
        case CmpInst::Predicate::FCMP_ONE:
827
0
        case CmpInst::Predicate::FCMP_ORD:
828
0
        case CmpInst::Predicate::FCMP_UEQ:
829
0
        case CmpInst::Predicate::FCMP_UNO:
830
0
          PredicateExtraCost = 2;
831
0
          break;
832
341
        default:
833
341
          break;
834
341
        }
835
341
      }
836
341
837
341
      // Float is handled with 2*vmr[lh]f + 2*vldeb + vfchdb for each pair of
838
341
      // floats.  FIXME: <2 x float> generates same code as <4 x float>.
839
341
      unsigned CmpCostPerVector = (ValTy->getScalarType()->isFloatTy() ? 10 : 1);
840
341
      unsigned NumVecs_cmp = getNumVectorRegs(ValTy);
841
341
842
341
      unsigned Cost = (NumVecs_cmp * (CmpCostPerVector + PredicateExtraCost));
843
341
      return Cost;
844
341
    }
845
144
    else { // Called with a select instruction.
846
144
      assert (Opcode == Instruction::Select);
847
144
848
144
      // We can figure out the extra cost of packing / unpacking if the
849
144
      // instruction was passed and the compare instruction is found.
850
144
      unsigned PackCost = 0;
851
144
      Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
852
144
      if (CmpOpTy != nullptr)
853
144
        PackCost =
854
144
          getVectorBitmaskConversionCost(CmpOpTy, ValTy);
855
144
856
144
      return getNumVectorRegs(ValTy) /*vsel*/ + PackCost;
857
144
    }
858
855
  }
859
855
  else { // Scalar
860
855
    switch (Opcode) {
861
855
    case Instruction::ICmp: {
862
413
      // A loaded value compared with 0 with multiple users becomes Load and
863
413
      // Test. The load is then not foldable, so return 0 cost for the ICmp.
864
413
      unsigned ScalarBits = ValTy->getScalarSizeInBits();
865
413
      if (I != nullptr && ScalarBits >= 32)
866
216
        if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
867
12
          if (const ConstantInt *C = dyn_cast<ConstantInt>(I->getOperand(1)))
868
2
            if (!Ld->hasOneUse() && Ld->getParent() == I->getParent() &&
869
2
                C->getZExtValue() == 0)
870
2
              return 0;
871
411
872
411
      unsigned Cost = 1;
873
411
      if (ValTy->isIntegerTy() && ValTy->getScalarSizeInBits() <= 16)
874
196
        Cost += (I != nullptr ? getOperandsExtensionCost(I) : 2);
875
411
      return Cost;
876
411
    }
877
411
    case Instruction::Select:
878
254
      if (ValTy->isFloatingPointTy())
879
84
        return 4; // No load on condition for FP - costs a conditional jump.
880
170
      return 1; // Load On Condition / Select Register.
881
855
    }
882
855
  }
883
188
884
188
  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, nullptr);
885
188
}
886
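For instance, assuming a predicate with no extra cost, an fcmp on <4 x float> is charged 1 * (10 + 0) = 10 because float compares are emulated by unpacking to double, while the same compare on <2 x double> costs 1; on the scalar side a select on a floating-point value costs 4 (conditional jump) and an integer select costs 1 (load/select on condition).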
887
int SystemZTTIImpl::
888
4.36k
getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
889
4.36k
  // vlvgp will insert two grs into a vector register, so only count half the
890
4.36k
  // number of instructions.
891
4.36k
  if (Opcode == Instruction::InsertElement && Val->isIntOrIntVectorTy(64))
892
224
    return ((Index % 2 == 0) ? 1 : 0);
893
4.13k
894
4.13k
  if (Opcode == Instruction::ExtractElement) {
895
2.12k
    int Cost = ((getScalarSizeInBits(Val) == 1) ? 2 /*+test-under-mask*/ : 1);
896
2.12k
897
2.12k
    // Give a slight penalty for moving out of vector pipeline to FXU unit.
898
2.12k
    if (Index == 0 && Val->isIntOrIntVectorTy())
899
219
      Cost += 1;
900
2.12k
901
2.12k
    return Cost;
902
2.12k
  }
903
2.00k
904
2.00k
  return BaseT::getVectorInstrCost(Opcode, Val, Index);
905
2.00k
}
906
907
// Check if a load may be folded as a memory operand in its user.
908
bool SystemZTTIImpl::
909
379
isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue) {
910
379
  if (!Ld->hasOneUse())
911
49
    return false;
912
330
  FoldedValue = Ld;
913
330
  const Instruction *UserI = cast<Instruction>(*Ld->user_begin());
914
330
  unsigned LoadedBits = getScalarSizeInBits(Ld->getType());
915
330
  unsigned TruncBits = 0;
916
330
  unsigned SExtBits = 0;
917
330
  unsigned ZExtBits = 0;
918
330
  if (UserI->hasOneUse()) {
919
129
    unsigned UserBits = UserI->getType()->getScalarSizeInBits();
920
129
    if (isa<TruncInst>(UserI))
921
30
      TruncBits = UserBits;
922
99
    else if (isa<SExtInst>(UserI))
923
30
      SExtBits = UserBits;
924
69
    else if (isa<ZExtInst>(UserI))
925
10
      ZExtBits = UserBits;
926
129
  }
927
330
  if (TruncBits || SExtBits || ZExtBits) {
928
70
    FoldedValue = UserI;
929
70
    UserI = cast<Instruction>(*UserI->user_begin());
930
70
    // Load (single use) -> trunc/extend (single use) -> UserI
931
70
  }
932
330
  if ((UserI->getOpcode() == Instruction::Sub ||
933
330
       UserI->getOpcode() == Instruction::SDiv ||
934
330
       UserI->getOpcode() == Instruction::UDiv) &&
935
330
      UserI->getOperand(1) != FoldedValue)
936
56
    return false; // Not commutative, only RHS foldable.
937
274
  // LoadOrTruncBits holds the number of effectively loaded bits, but 0 if an
938
274
  // extension was made of the load.
939
274
  unsigned LoadOrTruncBits =
940
274
      ((SExtBits || ZExtBits) ? 0 : (TruncBits ? TruncBits : LoadedBits));
941
274
  switch (UserI->getOpcode()) {
942
274
  case Instruction::Add: // SE: 16->32, 16/32->64, z14:16->64. ZE: 32->64
943
101
  case Instruction::Sub:
944
101
  case Instruction::ICmp:
945
101
    if (LoadedBits == 32 && ZExtBits == 64)
946
6
      return true;
947
95
    LLVM_FALLTHROUGH;
948
137
  case Instruction::Mul: // SE: 16->32, 32->64, z14:16->64
949
137
    if (UserI->getOpcode() != Instruction::ICmp) {
950
110
      if (LoadedBits == 16 &&
951
110
          (SExtBits == 32 ||
952
40
           (SExtBits == 64 && ST->hasMiscellaneousExtensions2())))
953
9
        return true;
954
101
      if (LoadOrTruncBits == 16)
955
32
        return true;
956
96
    }
957
96
    LLVM_FALLTHROUGH;
958
112
  case Instruction::SDiv:// SE: 32->64
959
112
    if (LoadedBits == 32 && SExtBits == 64)
960
10
      return true;
961
102
    LLVM_FALLTHROUGH;
962
202
  case Instruction::UDiv:
963
202
  case Instruction::And:
964
202
  case Instruction::Or:
965
202
  case Instruction::Xor:
966
202
    // This also makes sense for float operations, but disabled for now due
967
202
    // to regressions.
968
202
    // case Instruction::FCmp:
969
202
    // case Instruction::FAdd:
970
202
    // case Instruction::FSub:
971
202
    // case Instruction::FMul:
972
202
    // case Instruction::FDiv:
973
202
974
202
    // All possible extensions of memory checked above.
975
202
976
202
    // Comparison between memory and immediate.
977
202
    if (UserI->getOpcode() == Instruction::ICmp)
978
25
      if (ConstantInt *CI = dyn_cast<ConstantInt>(UserI->getOperand(1)))
979
2
        if (isUInt<16>(CI->getZExtValue()))
980
2
          return true;
981
200
    return (LoadOrTruncBits == 32 || LoadOrTruncBits == 64);
982
200
    break;
983
15
  }
984
15
  return false;
985
15
}
986
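In practice this means a single-use i32 or i64 load feeding an add, and, or, xor or compare folds into the memory form of that instruction (and is later charged 0 by getMemoryOpCost), whereas the same load feeding the first operand of a sub, sdiv or udiv does not fold, because those operations are not commutative and only their right-hand side has a memory form.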
987
133
static bool isBswapIntrinsicCall(const Value *V) {
988
133
  if (const Instruction *I = dyn_cast<Instruction>(V))
989
126
    if (auto *CI = dyn_cast<CallInst>(I))
990
36
      if (auto *F = CI->getCalledFunction())
991
36
        if (F->getIntrinsicID() == Intrinsic::bswap)
992
36
          return true;
993
97
  return false;
994
97
}
995
996
int SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
997
                                    unsigned Alignment, unsigned AddressSpace,
998
434
                                    const Instruction *I) {
999
434
  assert(!Src->isVoidTy() && "Invalid type");
1000
434
1001
434
  if (!Src->isVectorTy() && Opcode == Instruction::Load && I != nullptr) {
1002
295
    // Store the load or its truncated or extended value in FoldedValue.
1003
295
    const Instruction *FoldedValue = nullptr;
1004
295
    if (isFoldableLoad(cast<LoadInst>(I), FoldedValue)) {
1005
184
      const Instruction *UserI = cast<Instruction>(*FoldedValue->user_begin());
1006
184
      assert (UserI->getNumOperands() == 2 && "Expected a binop.");
1007
184
1008
184
      // UserI can't fold two loads, so in that case return 0 cost only
1009
184
      // half of the time.
1010
454
      for (unsigned i = 0; i < 2; ++i) {
1011
340
        if (UserI->getOperand(i) == FoldedValue)
1012
142
          continue;
1013
198
1014
198
        if (Instruction *OtherOp = dyn_cast<Instruction>(UserI->getOperand(i))){
1015
84
          LoadInst *OtherLoad = dyn_cast<LoadInst>(OtherOp);
1016
84
          if (!OtherLoad &&
1017
84
              (isa<TruncInst>(OtherOp) || isa<SExtInst>(OtherOp) ||
1018
0
               isa<ZExtInst>(OtherOp)))
1019
0
            OtherLoad = dyn_cast<LoadInst>(OtherOp->getOperand(0));
1020
84
          if (OtherLoad && isFoldableLoad(OtherLoad, FoldedValue/*dummy*/))
1021
70
            return i == 0; // Both operands foldable.
1022
84
        }
1023
198
      }
1024
184
1025
184
      
      return 0; // Only I is foldable in user.
1026
250
    }
1027
295
  }
1028
250
1029
250
  unsigned NumOps =
1030
250
    (Src->isVectorTy() ? getNumVectorRegs(Src) : getNumberOfParts(Src));
1031
250
1032
250
  // Store/Load reversed saves one instruction.
1033
250
  if (((!Src->isVectorTy() && NumOps == 1) || ST->hasVectorEnhancements2()) &&
1034
250
      I != nullptr) {
1035
182
    if (Opcode == Instruction::Load && I->hasOneUse()) {
1036
68
      const Instruction *LdUser = cast<Instruction>(*I->user_begin());
1037
68
      // In case of load -> bswap -> store, return normal cost for the load.
1038
68
      if (isBswapIntrinsicCall(LdUser) &&
1039
68
          (!LdUser->hasOneUse() || !isa<StoreInst>(*LdUser->user_begin())))
1040
9
        return 0;
1041
114
    }
1042
114
    else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) {
1043
65
      const Value *StoredVal = SI->getValueOperand();
1044
65
      if (StoredVal->hasOneUse() && isBswapIntrinsicCall(StoredVal))
1045
18
        return 0;
1046
223
    }
1047
182
  }
1048
223
1049
223
  if (Src->getScalarSizeInBits() == 128)
1050
2
    // 128 bit scalars are held in a pair of two 64 bit registers.
1051
2
    NumOps *= 2;
1052
223
1053
223
  return  NumOps;
1054
223
}
1055
1056
// The generic implementation of getInterleavedMemoryOpCost() is based on
1057
// adding costs of the memory operations plus all the extracts and inserts
1058
// needed for using / defining the vector operands. The SystemZ version does
1059
// roughly the same but bases the computations on vector permutations
1060
// instead.
1061
int SystemZTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
1062
                                               unsigned Factor,
1063
                                               ArrayRef<unsigned> Indices,
1064
                                               unsigned Alignment,
1065
                                               unsigned AddressSpace,
1066
                                               bool UseMaskForCond,
1067
0
                                               bool UseMaskForGaps) {
1068
0
  if (UseMaskForCond || UseMaskForGaps)
1069
0
    return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1070
0
                                             Alignment, AddressSpace,
1071
0
                                             UseMaskForCond, UseMaskForGaps);
1072
0
  assert(isa<VectorType>(VecTy) &&
1073
0
         "Expect a vector type for interleaved memory op");
1074
0
1075
0
  // Return the ceiling of dividing A by B.
1076
0
  auto ceil = [](unsigned A, unsigned B) { return (A + B - 1) / B; };
1077
0
1078
0
  unsigned NumElts = VecTy->getVectorNumElements();
1079
0
  assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
1080
0
  unsigned VF = NumElts / Factor;
1081
0
  unsigned NumEltsPerVecReg = (128U / getScalarSizeInBits(VecTy));
1082
0
  unsigned NumVectorMemOps = getNumVectorRegs(VecTy);
1083
0
  unsigned NumPermutes = 0;
1084
0
1085
0
  if (Opcode == Instruction::Load) {
1086
0
    // Loading interleave groups may have gaps, which may mean fewer
1087
0
    // loads. Find out how many vectors will be loaded in total, and in how
1088
0
    // many of them each value will be in.
1089
0
    BitVector UsedInsts(NumVectorMemOps, false);
1090
0
    std::vector<BitVector> ValueVecs(Factor, BitVector(NumVectorMemOps, false));
1091
0
    for (unsigned Index : Indices)
1092
0
      for (unsigned Elt = 0; Elt < VF; ++Elt) {
1093
0
        unsigned Vec = (Index + Elt * Factor) / NumEltsPerVecReg;
1094
0
        UsedInsts.set(Vec);
1095
0
        ValueVecs[Index].set(Vec);
1096
0
      }
1097
0
    NumVectorMemOps = UsedInsts.count();
1098
0
1099
0
    for (unsigned Index : Indices) {
1100
0
      // Estimate that each loaded source vector containing this Index
1101
0
      // requires one operation, except that vperm can handle two input
1102
0
      // registers first time for each dst vector.
1103
0
      unsigned NumSrcVecs = ValueVecs[Index].count();
1104
0
      unsigned NumDstVecs = ceil(VF * getScalarSizeInBits(VecTy), 128U);
1105
0
      assert (NumSrcVecs >= NumDstVecs && "Expected at least as many sources");
1106
0
      NumPermutes += std::max(1U, NumSrcVecs - NumDstVecs);
1107
0
    }
1108
0
  } else {
1109
0
    // Estimate the permutes for each stored vector as the smaller of the
1110
0
    // number of elements and the number of source vectors. Subtract one per
1111
0
    // dst vector for vperm (S.A.).
1112
0
    unsigned NumSrcVecs = std::min(NumEltsPerVecReg, Factor);
1113
0
    unsigned NumDstVecs = NumVectorMemOps;
1114
0
    assert (NumSrcVecs > 1 && "Expected at least two source vectors.");
1115
0
    NumPermutes += (NumDstVecs * NumSrcVecs) - NumDstVecs;
1116
0
  }
1117
0
1118
0
  // Cost of load/store operations and the permutations needed.
1119
0
  return NumVectorMemOps + NumPermutes;
1120
0
}
1121
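A small worked example of the load branch above: a two-way interleave group of i32 loaded as <8 x i32> (Factor = 2, both indices used) has NumEltsPerVecReg = 4 and touches both 128-bit memory ops; each extracted field spans two source registers and one destination register, adding one vperm per field, so the total cost is 2 + 2 = 4.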
1122
94
static int getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy) {
1123
94
  if (RetTy->isVectorTy() && ID == Intrinsic::bswap)
1124
36
    return getNumVectorRegs(RetTy); // VPERM
1125
58
  return -1;
1126
58
}
1127
1128
int SystemZTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
1129
                                          ArrayRef<Value *> Args,
1130
65
                                          FastMathFlags FMF, unsigned VF) {
1131
65
  int Cost = getVectorIntrinsicInstrCost(ID, RetTy);
1132
65
  if (Cost != -1)
1133
36
    return Cost;
1134
29
  return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF);
1135
29
}
1136
1137
int SystemZTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
1138
                                          ArrayRef<Type *> Tys,
1139
                                          FastMathFlags FMF,
1140
29
                                          unsigned ScalarizationCostPassed) {
1141
29
  int Cost = getVectorIntrinsicInstrCost(ID, RetTy);
1142
29
  if (Cost != -1)
1143
0
    return Cost;
1144
29
  return BaseT::getIntrinsicInstrCost(ID, RetTy, Tys,
1145
29
                                      FMF, ScalarizationCostPassed);
1146
29
}