/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | //===-- NVPTXTargetTransformInfo.cpp - NVPTX specific TTI -----------------===// |
2 | | // |
3 | | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | | // See https://llvm.org/LICENSE.txt for license information. |
5 | | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | | // |
7 | | //===----------------------------------------------------------------------===// |
8 | | |
9 | | #include "NVPTXTargetTransformInfo.h" |
10 | | #include "NVPTXUtilities.h" |
11 | | #include "llvm/Analysis/LoopInfo.h" |
12 | | #include "llvm/Analysis/TargetTransformInfo.h" |
13 | | #include "llvm/Analysis/ValueTracking.h" |
14 | | #include "llvm/CodeGen/BasicTTIImpl.h" |
15 | | #include "llvm/CodeGen/CostTable.h" |
16 | | #include "llvm/CodeGen/TargetLowering.h" |
17 | | #include "llvm/Support/Debug.h" |
18 | | using namespace llvm; |
19 | | |
20 | | #define DEBUG_TYPE "NVPTXtti" |
21 | | |
22 | | // Whether the given intrinsic reads threadIdx.x/y/z. |
23 | 16 | static bool readsThreadIndex(const IntrinsicInst *II) { |
24 | 16 | switch (II->getIntrinsicID()) { |
25 | 16 | default: return false2 ; |
26 | 16 | case Intrinsic::nvvm_read_ptx_sreg_tid_x: |
27 | 14 | case Intrinsic::nvvm_read_ptx_sreg_tid_y: |
28 | 14 | case Intrinsic::nvvm_read_ptx_sreg_tid_z: |
29 | 14 | return true; |
30 | 16 | } |
31 | 16 | } |
32 | | |
33 | 2 | static bool readsLaneId(const IntrinsicInst *II) { |
34 | 2 | return II->getIntrinsicID() == Intrinsic::nvvm_read_ptx_sreg_laneid; |
35 | 2 | } |
36 | | |
37 | | // Whether the given intrinsic is an atomic instruction in PTX. |
38 | | static bool isNVVMAtomic(const IntrinsicInst *II) { |
39 | | switch (II->getIntrinsicID()) { |
40 | | default: return false; |
41 | | case Intrinsic::nvvm_atomic_load_inc_32: |
42 | | case Intrinsic::nvvm_atomic_load_dec_32: |
43 | | |
44 | | case Intrinsic::nvvm_atomic_add_gen_f_cta: |
45 | | case Intrinsic::nvvm_atomic_add_gen_f_sys: |
46 | | case Intrinsic::nvvm_atomic_add_gen_i_cta: |
47 | | case Intrinsic::nvvm_atomic_add_gen_i_sys: |
48 | | case Intrinsic::nvvm_atomic_and_gen_i_cta: |
49 | | case Intrinsic::nvvm_atomic_and_gen_i_sys: |
50 | | case Intrinsic::nvvm_atomic_cas_gen_i_cta: |
51 | | case Intrinsic::nvvm_atomic_cas_gen_i_sys: |
52 | | case Intrinsic::nvvm_atomic_dec_gen_i_cta: |
53 | | case Intrinsic::nvvm_atomic_dec_gen_i_sys: |
54 | | case Intrinsic::nvvm_atomic_inc_gen_i_cta: |
55 | | case Intrinsic::nvvm_atomic_inc_gen_i_sys: |
56 | | case Intrinsic::nvvm_atomic_max_gen_i_cta: |
57 | | case Intrinsic::nvvm_atomic_max_gen_i_sys: |
58 | | case Intrinsic::nvvm_atomic_min_gen_i_cta: |
59 | | case Intrinsic::nvvm_atomic_min_gen_i_sys: |
60 | | case Intrinsic::nvvm_atomic_or_gen_i_cta: |
61 | | case Intrinsic::nvvm_atomic_or_gen_i_sys: |
62 | | case Intrinsic::nvvm_atomic_exch_gen_i_cta: |
63 | | case Intrinsic::nvvm_atomic_exch_gen_i_sys: |
64 | | case Intrinsic::nvvm_atomic_xor_gen_i_cta: |
65 | | case Intrinsic::nvvm_atomic_xor_gen_i_sys: |
66 | | return true; |
67 | | } |
68 | | } |
69 | | |
70 | 208 | bool NVPTXTTIImpl::isSourceOfDivergence(const Value *V) { |
71 | 208 | // Without inter-procedural analysis, we conservatively assume that arguments |
72 | 208 | // to __device__ functions are divergent. |
73 | 208 | if (const Argument *Arg = dyn_cast<Argument>(V)) |
74 | 30 | return !isKernelFunction(*Arg->getParent()); |
75 | 178 | |
76 | 178 | if (const Instruction *I = dyn_cast<Instruction>(V)) { |
77 | 178 | // Without pointer analysis, we conservatively assume values loaded from |
78 | 178 | // generic or local address space are divergent. |
79 | 178 | if (const LoadInst *LI = dyn_cast<LoadInst>(I)) { |
80 | 0 | unsigned AS = LI->getPointerAddressSpace(); |
81 | 0 | return AS == ADDRESS_SPACE_GENERIC || AS == ADDRESS_SPACE_LOCAL; |
82 | 0 | } |
83 | 178 | // Atomic instructions may cause divergence. Atomic instructions are |
84 | 178 | // executed sequentially across all threads in a warp. Therefore, an earlier |
85 | 178 | // executed thread may see different memory inputs than a later executed |
86 | 178 | // thread. For example, suppose *a = 0 initially. |
87 | 178 | // |
88 | 178 | // atom.global.add.s32 d, [a], 1 |
89 | 178 | // |
90 | 178 | // returns 0 for the first thread that enters the critical region, and 1 for |
91 | 178 | // the second thread. |
92 | 178 | if (I->isAtomic()) |
93 | 0 | return true; |
94 | 178 | if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) { |
95 | 16 | // Instructions that read threadIdx are obviously divergent. |
96 | 16 | if (readsThreadIndex(II) || readsLaneId(II)2 ) |
97 | 16 | return true; |
98 | 0 | // Handle the NVPTX atomic instrinsics that cannot be represented as an |
99 | 0 | // atomic IR instruction. |
100 | 0 | if (isNVVMAtomic(II)) |
101 | 0 | return true; |
102 | 162 | } |
103 | 162 | // Conservatively consider the return value of function calls as divergent. |
104 | 162 | // We could analyze callees with bodies more precisely using |
105 | 162 | // inter-procedural analysis. |
106 | 162 | if (isa<CallInst>(I)) |
107 | 0 | return true; |
108 | 162 | } |
109 | 162 | |
110 | 162 | return false; |
111 | 162 | } |
112 | | |
113 | | int NVPTXTTIImpl::getArithmeticInstrCost( |
114 | | unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info, |
115 | | TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo, |
116 | 13 | TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args) { |
117 | 13 | // Legalize the type. |
118 | 13 | std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); |
119 | 13 | |
120 | 13 | int ISD = TLI->InstructionOpcodeToISD(Opcode); |
121 | 13 | |
122 | 13 | switch (ISD) { |
123 | 13 | default: |
124 | 10 | return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, |
125 | 10 | Opd1PropInfo, Opd2PropInfo); |
126 | 13 | case ISD::ADD: |
127 | 3 | case ISD::MUL: |
128 | 3 | case ISD::XOR: |
129 | 3 | case ISD::OR: |
130 | 3 | case ISD::AND: |
131 | 3 | // The machine code (SASS) simulates an i64 with two i32. Therefore, we |
132 | 3 | // estimate that arithmetic operations on i64 are twice as expensive as |
133 | 3 | // those on types that can fit into one machine register. |
134 | 3 | if (LT.second.SimpleTy == MVT::i64) |
135 | 0 | return 2 * LT.first; |
136 | 3 | // Delegate other cases to the basic TTI. |
137 | 3 | return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, |
138 | 3 | Opd1PropInfo, Opd2PropInfo); |
139 | 13 | } |
140 | 13 | } |
141 | | |
142 | | void NVPTXTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, |
143 | 2 | TTI::UnrollingPreferences &UP) { |
144 | 2 | BaseT::getUnrollingPreferences(L, SE, UP); |
145 | 2 | |
146 | 2 | // Enable partial unrolling and runtime unrolling, but reduce the |
147 | 2 | // threshold. This partially unrolls small loops which are often |
148 | 2 | // unrolled by the PTX to SASS compiler and unrolling earlier can be |
149 | 2 | // beneficial. |
150 | 2 | UP.Partial = UP.Runtime = true; |
151 | 2 | UP.PartialThreshold = UP.Threshold / 4; |
152 | 2 | } |