/Users/buildslave/jenkins/sharedspace/clang-stage2-coverage-R@2/llvm/lib/CodeGen/StackColoring.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | //===-- StackColoring.cpp -------------------------------------------------===// |
2 | | // |
3 | | // The LLVM Compiler Infrastructure |
4 | | // |
5 | | // This file is distributed under the University of Illinois Open Source |
6 | | // License. See LICENSE.TXT for details. |
7 | | // |
8 | | //===----------------------------------------------------------------------===// |
9 | | // |
10 | | // This pass implements the stack-coloring optimization that looks for |
11 | | // lifetime markers machine instructions (LIFESTART_BEGIN and LIFESTART_END), |
12 | | // which represent the possible lifetime of stack slots. It attempts to |
13 | | // merge disjoint stack slots and reduce the used stack space. |
14 | | // NOTE: This pass is not StackSlotColoring, which optimizes spill slots. |
15 | | // |
16 | | // TODO: In the future we plan to improve stack coloring in the following ways: |
17 | | // 1. Allow merging multiple small slots into a single larger slot at different |
18 | | // offsets. |
19 | | // 2. Merge this pass with StackSlotColoring and allow merging of allocas with |
20 | | // spill slots. |
21 | | // |
22 | | //===----------------------------------------------------------------------===// |
23 | | |
24 | | #include "llvm/ADT/BitVector.h" |
25 | | #include "llvm/ADT/DepthFirstIterator.h" |
26 | | #include "llvm/ADT/SetVector.h" |
27 | | #include "llvm/ADT/SmallPtrSet.h" |
28 | | #include "llvm/ADT/Statistic.h" |
29 | | #include "llvm/Analysis/ValueTracking.h" |
30 | | #include "llvm/CodeGen/LiveInterval.h" |
31 | | #include "llvm/CodeGen/MachineBasicBlock.h" |
32 | | #include "llvm/CodeGen/MachineFrameInfo.h" |
33 | | #include "llvm/CodeGen/MachineFunctionPass.h" |
34 | | #include "llvm/CodeGen/MachineLoopInfo.h" |
35 | | #include "llvm/CodeGen/MachineMemOperand.h" |
36 | | #include "llvm/CodeGen/MachineModuleInfo.h" |
37 | | #include "llvm/CodeGen/MachineRegisterInfo.h" |
38 | | #include "llvm/CodeGen/Passes.h" |
39 | | #include "llvm/CodeGen/PseudoSourceValue.h" |
40 | | #include "llvm/CodeGen/SelectionDAGNodes.h" |
41 | | #include "llvm/CodeGen/SlotIndexes.h" |
42 | | #include "llvm/CodeGen/StackProtector.h" |
43 | | #include "llvm/CodeGen/WinEHFuncInfo.h" |
44 | | #include "llvm/IR/DebugInfo.h" |
45 | | #include "llvm/IR/Function.h" |
46 | | #include "llvm/IR/Instructions.h" |
47 | | #include "llvm/IR/IntrinsicInst.h" |
48 | | #include "llvm/IR/Module.h" |
49 | | #include "llvm/Support/CommandLine.h" |
50 | | #include "llvm/Support/Debug.h" |
51 | | #include "llvm/Support/raw_ostream.h" |
52 | | #include "llvm/Target/TargetInstrInfo.h" |
53 | | #include "llvm/Target/TargetRegisterInfo.h" |
54 | | |
55 | | using namespace llvm; |
56 | | |
57 | | #define DEBUG_TYPE "stack-coloring" |
58 | | |
59 | | static cl::opt<bool> |
60 | | DisableColoring("no-stack-coloring", |
61 | | cl::init(false), cl::Hidden, |
62 | | cl::desc("Disable stack coloring")); |
63 | | |
64 | | /// The user may write code that uses allocas outside of the declared lifetime |
65 | | /// zone. This can happen when the user returns a reference to a local |
66 | | /// data-structure. We can detect these cases and decide not to optimize the |
67 | | /// code. If this flag is enabled, we try to save the user. This option |
68 | | /// is treated as overriding LifetimeStartOnFirstUse below. |
69 | | static cl::opt<bool> |
70 | | ProtectFromEscapedAllocas("protect-from-escaped-allocas", |
71 | | cl::init(false), cl::Hidden, |
72 | | cl::desc("Do not optimize lifetime zones that " |
73 | | "are broken")); |
74 | | |
75 | | /// Enable enhanced dataflow scheme for lifetime analysis (treat first |
76 | | /// use of stack slot as start of slot lifetime, as opposed to looking |
77 | | /// for LIFETIME_START marker). See "Implementation notes" below for |
78 | | /// more info. |
79 | | static cl::opt<bool> |
80 | | LifetimeStartOnFirstUse("stackcoloring-lifetime-start-on-first-use", |
81 | | cl::init(true), cl::Hidden, |
82 | | cl::desc("Treat stack lifetimes as starting on first use, not on START marker.")); |
83 | | |
84 | | |
85 | | STATISTIC(NumMarkerSeen, "Number of lifetime markers found."); |
86 | | STATISTIC(StackSpaceSaved, "Number of bytes saved due to merging slots."); |
87 | | STATISTIC(StackSlotMerged, "Number of stack slot merged."); |
88 | | STATISTIC(EscapedAllocas, "Number of allocas that escaped the lifetime region"); |
89 | | |
90 | | //===----------------------------------------------------------------------===// |
91 | | // StackColoring Pass |
92 | | //===----------------------------------------------------------------------===// |
93 | | // |
94 | | // Stack Coloring reduces stack usage by merging stack slots when they |
95 | | // can't be used together. For example, consider the following C program: |
96 | | // |
97 | | // void bar(char *, int); |
98 | | // void foo(bool var) { |
99 | | // A: { |
100 | | // char z[4096]; |
101 | | // bar(z, 0); |
102 | | // } |
103 | | // |
104 | | // char *p; |
105 | | // char x[4096]; |
106 | | // char y[4096]; |
107 | | // if (var) { |
108 | | // p = x; |
109 | | // } else { |
110 | | // bar(y, 1); |
111 | | // p = y + 1024; |
112 | | // } |
113 | | // B: |
114 | | // bar(p, 2); |
115 | | // } |
116 | | // |
117 | | // Naively-compiled, this program would use 12k of stack space. However, the |
118 | | // stack slot corresponding to `z` is always destroyed before either of the |
119 | | // stack slots for `x` or `y` are used, and then `x` is only used if `var` |
120 | | // is true, while `y` is only used if `var` is false. So in no time are 2 |
121 | | // of the stack slots used together, and therefore we can merge them, |
122 | | // compiling the function using only a single 4k alloca: |
123 | | // |
124 | | // void foo(bool var) { // equivalent |
125 | | // char x[4096]; |
126 | | // char *p; |
127 | | // bar(x, 0); |
128 | | // if (var) { |
129 | | // p = x; |
130 | | // } else { |
131 | | // bar(x, 1); |
132 | | // p = x + 1024; |
133 | | // } |
134 | | // bar(p, 2); |
135 | | // } |
136 | | // |
137 | | // This is an important optimization if we want stack space to be under |
138 | | // control in large functions, both open-coded ones and ones created by |
139 | | // inlining. |
140 | | // |
141 | | // Implementation Notes: |
142 | | // --------------------- |
143 | | // |
144 | | // An important part of the above reasoning is that `z` can't be accessed |
145 | | // while the latter 2 calls to `bar` are running. This is justified because |
146 | | // `z`'s lifetime is over after we exit from block `A:`, so any further |
147 | | // accesses to it would be UB. The way we represent this information |
148 | | // in LLVM is by having frontends delimit blocks with `lifetime.start` |
149 | | // and `lifetime.end` intrinsics. |
150 | | // |
151 | | // The effect of these intrinsics seems to be as follows (maybe I should |
152 | | // specify this in the reference?): |
153 | | // |
154 | | // L1) at start, each stack-slot is marked as *out-of-scope*, unless no |
155 | | // lifetime intrinsic refers to that stack slot, in which case |
156 | | // it is marked as *in-scope*. |
157 | | // L2) on a `lifetime.start`, a stack slot is marked as *in-scope* and |
158 | | // the stack slot is overwritten with `undef`. |
159 | | // L3) on a `lifetime.end`, a stack slot is marked as *out-of-scope*. |
160 | | // L4) on function exit, all stack slots are marked as *out-of-scope*. |
161 | | // L5) `lifetime.end` is a no-op when called on a slot that is already |
162 | | // *out-of-scope*. |
163 | | // L6) memory accesses to *out-of-scope* stack slots are UB. |
164 | | // L7) when a stack-slot is marked as *out-of-scope*, all pointers to it |
165 | | // are invalidated, unless the slot is "degenerate". This is used to |
166 | | // justify not marking slots as in-use until the pointer to them is |
167 | | // used, but feels a bit hacky in the presence of things like LICM. See |
168 | | // the "Degenerate Slots" section for more details. |
169 | | // |
170 | | // Now, let's ground stack coloring on these rules. We'll define a slot |
171 | | // as *in-use* at a (dynamic) point in execution if it either can be |
172 | | // written to at that point, or if it has a live and non-undef content |
173 | | // at that point. |
174 | | // |
175 | | // Obviously, slots that are never *in-use* together can be merged, and |
176 | | // in our example `foo`, the slots for `x`, `y` and `z` are never |
177 | | // in-use together (of course, sometimes slots that *are* in-use together |
178 | | // might still be mergable, but we don't care about that here). |
179 | | // |
180 | | // In this implementation, we successively merge pairs of slots that are |
181 | | // not *in-use* together. We could be smarter - for example, we could merge |
182 | | // a single large slot with 2 small slots, or we could construct the |
183 | | // interference graph and run a "smart" graph coloring algorithm, but with |
184 | | // that aside, how do we find out whether a pair of slots might be *in-use* |
185 | | // together? |
186 | | // |
187 | | // From our rules, we see that *out-of-scope* slots are never *in-use*, |
188 | | // and from (L7) we see that "non-degenerate" slots remain non-*in-use* |
189 | | // until their address is taken. Therefore, we can approximate slot activity |
190 | | // using dataflow. |
191 | | // |
192 | | // A subtle point: naively, we might try to figure out which pairs of |
193 | | // stack-slots interfere by propagating `S in-use` through the CFG for every |
194 | | // stack-slot `S`, and having `S` and `T` interfere if there is a CFG point in |
195 | | // which they are both *in-use*. |
196 | | // |
197 | | // That is sound, but overly conservative in some cases: in our (artificial) |
198 | | // example `foo`, either `x` or `y` might be in use at the label `B:`, but |
199 | | // as `x` is only in use if we came in from the `var` edge and `y` only |
200 | | // if we came from the `!var` edge, they still can't be in use together. |
201 | | // See PR32488 for an important real-life case. |
202 | | // |
203 | | // If we wanted to find all points of interference precisely, we could |
204 | | // propagate `S in-use` and `S&T in-use` predicates through the CFG. That |
205 | | // would be precise, but requires propagating `O(n^2)` dataflow facts. |
206 | | // |
207 | | // However, we aren't interested in the *set* of points of interference |
208 | | // between 2 stack slots, only *whether* there *is* such a point. So we |
209 | | // can rely on a little trick: for `S` and `T` to be in-use together, |
210 | | // one of them needs to become in-use while the other is in-use (or |
211 | | // they might both become in use simultaneously). We can check this |
212 | | // by also keeping track of the points at which a stack slot might *start* |
213 | | // being in-use. |
214 | | // |
215 | | // Exact first use: |
216 | | // ---------------- |
217 | | // |
218 | | // Consider the following motivating example: |
219 | | // |
220 | | // int foo() { |
221 | | // char b1[1024], b2[1024]; |
222 | | // if (...) { |
223 | | // char b3[1024]; |
224 | | // <uses of b1, b3>; |
225 | | // return x; |
226 | | // } else { |
227 | | // char b4[1024], b5[1024]; |
228 | | // <uses of b2, b4, b5>; |
229 | | // return y; |
230 | | // } |
231 | | // } |
232 | | // |
233 | | // In the code above, "b3" and "b4" are declared in distinct lexical |
234 | | // scopes, meaning that it is easy to prove that they can share the |
235 | | // same stack slot. Variables "b1" and "b2" are declared in the same |
236 | | // scope, meaning that from a lexical point of view, their lifetimes |
237 | | // overlap. From a control flow pointer of view, however, the two |
238 | | // variables are accessed in disjoint regions of the CFG, thus it |
239 | | // should be possible for them to share the same stack slot. An ideal |
240 | | // stack allocation for the function above would look like: |
241 | | // |
242 | | // slot 0: b1, b2 |
243 | | // slot 1: b3, b4 |
244 | | // slot 2: b5 |
245 | | // |
246 | | // Achieving this allocation is tricky, however, due to the way |
247 | | // lifetime markers are inserted. Here is a simplified view of the |
248 | | // control flow graph for the code above: |
249 | | // |
250 | | // +------ block 0 -------+ |
251 | | // 0| LIFETIME_START b1, b2 | |
252 | | // 1| <test 'if' condition> | |
253 | | // +-----------------------+ |
254 | | // ./ \. |
255 | | // +------ block 1 -------+ +------ block 2 -------+ |
256 | | // 2| LIFETIME_START b3 | 5| LIFETIME_START b4, b5 | |
257 | | // 3| <uses of b1, b3> | 6| <uses of b2, b4, b5> | |
258 | | // 4| LIFETIME_END b3 | 7| LIFETIME_END b4, b5 | |
259 | | // +-----------------------+ +-----------------------+ |
260 | | // \. /. |
261 | | // +------ block 3 -------+ |
262 | | // 8| <cleanupcode> | |
263 | | // 9| LIFETIME_END b1, b2 | |
264 | | // 10| return | |
265 | | // +-----------------------+ |
266 | | // |
267 | | // If we create live intervals for the variables above strictly based |
268 | | // on the lifetime markers, we'll get the set of intervals on the |
269 | | // left. If we ignore the lifetime start markers and instead treat a |
270 | | // variable's lifetime as beginning with the first reference to the |
271 | | // var, then we get the intervals on the right. |
272 | | // |
273 | | // LIFETIME_START First Use |
274 | | // b1: [0,9] [3,4] [8,9] |
275 | | // b2: [0,9] [6,9] |
276 | | // b3: [2,4] [3,4] |
277 | | // b4: [5,7] [6,7] |
278 | | // b5: [5,7] [6,7] |
279 | | // |
280 | | // For the intervals on the left, the best we can do is overlap two |
281 | | // variables (b3 and b4, for example); this gives us a stack size of |
282 | | // 4*1024 bytes, not ideal. When treating first-use as the start of a |
283 | | // lifetime, we can additionally overlap b1 and b5, giving us a 3*1024 |
284 | | // byte stack (better). |
285 | | // |
286 | | // Degenerate Slots: |
287 | | // ----------------- |
288 | | // |
289 | | // Relying entirely on first-use of stack slots is problematic, |
290 | | // however, due to the fact that optimizations can sometimes migrate |
291 | | // uses of a variable outside of its lifetime start/end region. Here |
292 | | // is an example: |
293 | | // |
294 | | // int bar() { |
295 | | // char b1[1024], b2[1024]; |
296 | | // if (...) { |
297 | | // <uses of b2> |
298 | | // return y; |
299 | | // } else { |
300 | | // <uses of b1> |
301 | | // while (...) { |
302 | | // char b3[1024]; |
303 | | // <uses of b3> |
304 | | // } |
305 | | // } |
306 | | // } |
307 | | // |
308 | | // Before optimization, the control flow graph for the code above |
309 | | // might look like the following: |
310 | | // |
311 | | // +------ block 0 -------+ |
312 | | // 0| LIFETIME_START b1, b2 | |
313 | | // 1| <test 'if' condition> | |
314 | | // +-----------------------+ |
315 | | // ./ \. |
316 | | // +------ block 1 -------+ +------- block 2 -------+ |
317 | | // 2| <uses of b2> | 3| <uses of b1> | |
318 | | // +-----------------------+ +-----------------------+ |
319 | | // | | |
320 | | // | +------- block 3 -------+ <-\. |
321 | | // | 4| <while condition> | | |
322 | | // | +-----------------------+ | |
323 | | // | / | | |
324 | | // | / +------- block 4 -------+ |
325 | | // \ / 5| LIFETIME_START b3 | | |
326 | | // \ / 6| <uses of b3> | | |
327 | | // \ / 7| LIFETIME_END b3 | | |
328 | | // \ | +------------------------+ | |
329 | | // \ | \ / |
330 | | // +------ block 5 -----+ \--------------- |
331 | | // 8| <cleanupcode> | |
332 | | // 9| LIFETIME_END b1, b2 | |
333 | | // 10| return | |
334 | | // +---------------------+ |
335 | | // |
336 | | // During optimization, however, it can happen that an instruction |
337 | | // computing an address in "b3" (for example, a loop-invariant GEP) is |
338 | | // hoisted up out of the loop from block 4 to block 2. [Note that |
339 | | // this is not an actual load from the stack, only an instruction that |
340 | | // computes the address to be loaded]. If this happens, there is now a |
341 | | // path leading from the first use of b3 to the return instruction |
342 | | // that does not encounter the b3 LIFETIME_END, hence b3's lifetime is |
343 | | // now larger than if we were computing live intervals strictly based |
344 | | // on lifetime markers. In the example above, this lengthened lifetime |
345 | | // would mean that it would appear illegal to overlap b3 with b2. |
346 | | // |
347 | | // To deal with this such cases, the code in ::collectMarkers() below |
348 | | // tries to identify "degenerate" slots -- those slots where on a single |
349 | | // forward pass through the CFG we encounter a first reference to slot |
350 | | // K before we hit the slot K lifetime start marker. For such slots, |
351 | | // we fall back on using the lifetime start marker as the beginning of |
352 | | // the variable's lifetime. NB: with this implementation, slots can |
353 | | // appear degenerate in cases where there is unstructured control flow: |
354 | | // |
355 | | // if (q) goto mid; |
356 | | // if (x > 9) { |
357 | | // int b[100]; |
358 | | // memcpy(&b[0], ...); |
359 | | // mid: b[k] = ...; |
360 | | // abc(&b); |
361 | | // } |
362 | | // |
363 | | // If in RPO ordering chosen to walk the CFG we happen to visit the b[k] |
364 | | // before visiting the memcpy block (which will contain the lifetime start |
365 | | // for "b" then it will appear that 'b' has a degenerate lifetime. |
366 | | // |
367 | | |
368 | | namespace { |
369 | | /// StackColoring - A machine pass for merging disjoint stack allocations, |
370 | | /// marked by the LIFETIME_START and LIFETIME_END pseudo instructions. |
371 | | class StackColoring : public MachineFunctionPass { |
372 | | MachineFrameInfo *MFI; |
373 | | MachineFunction *MF; |
374 | | |
375 | | /// A class representing liveness information for a single basic block. |
376 | | /// Each bit in the BitVector represents the liveness property |
377 | | /// for a different stack slot. |
378 | | struct BlockLifetimeInfo { |
379 | | /// Which slots BEGINs in each basic block. |
380 | | BitVector Begin; |
381 | | /// Which slots ENDs in each basic block. |
382 | | BitVector End; |
383 | | /// Which slots are marked as LIVE_IN, coming into each basic block. |
384 | | BitVector LiveIn; |
385 | | /// Which slots are marked as LIVE_OUT, coming out of each basic block. |
386 | | BitVector LiveOut; |
387 | | }; |
388 | | |
389 | | /// Maps active slots (per bit) for each basic block. |
390 | | typedef DenseMap<const MachineBasicBlock*, BlockLifetimeInfo> LivenessMap; |
391 | | LivenessMap BlockLiveness; |
392 | | |
393 | | /// Maps serial numbers to basic blocks. |
394 | | DenseMap<const MachineBasicBlock*, int> BasicBlocks; |
395 | | /// Maps basic blocks to a serial number. |
396 | | SmallVector<const MachineBasicBlock*, 8> BasicBlockNumbering; |
397 | | |
398 | | /// Maps slots to their use interval. Outside of this interval, slots |
399 | | /// values are either dead or `undef` and they will not be written to. |
400 | | SmallVector<std::unique_ptr<LiveInterval>, 16> Intervals; |
401 | | /// Maps slots to the points where they can become in-use. |
402 | | SmallVector<SmallVector<SlotIndex, 4>, 16> LiveStarts; |
403 | | /// VNInfo is used for the construction of LiveIntervals. |
404 | | VNInfo::Allocator VNInfoAllocator; |
405 | | /// SlotIndex analysis object. |
406 | | SlotIndexes *Indexes; |
407 | | /// The stack protector object. |
408 | | StackProtector *SP; |
409 | | |
410 | | /// The list of lifetime markers found. These markers are to be removed |
411 | | /// once the coloring is done. |
412 | | SmallVector<MachineInstr*, 8> Markers; |
413 | | |
414 | | /// Record the FI slots for which we have seen some sort of |
415 | | /// lifetime marker (either start or end). |
416 | | BitVector InterestingSlots; |
417 | | |
418 | | /// FI slots that need to be handled conservatively (for these |
419 | | /// slots lifetime-start-on-first-use is disabled). |
420 | | BitVector ConservativeSlots; |
421 | | |
422 | | /// Number of iterations taken during data flow analysis. |
423 | | unsigned NumIterations; |
424 | | |
425 | | public: |
426 | | static char ID; |
427 | 32.0k | StackColoring() : MachineFunctionPass(ID) { |
428 | 32.0k | initializeStackColoringPass(*PassRegistry::getPassRegistry()); |
429 | 32.0k | } |
430 | | void getAnalysisUsage(AnalysisUsage &AU) const override; |
431 | | bool runOnMachineFunction(MachineFunction &MF) override; |
432 | | |
433 | | private: |
434 | | /// Debug. |
435 | | void dump() const; |
436 | | void dumpIntervals() const; |
437 | | void dumpBB(MachineBasicBlock *MBB) const; |
438 | | void dumpBV(const char *tag, const BitVector &BV) const; |
439 | | |
440 | | /// Removes all of the lifetime marker instructions from the function. |
441 | | /// \returns true if any markers were removed. |
442 | | bool removeAllMarkers(); |
443 | | |
444 | | /// Scan the machine function and find all of the lifetime markers. |
445 | | /// Record the findings in the BEGIN and END vectors. |
446 | | /// \returns the number of markers found. |
447 | | unsigned collectMarkers(unsigned NumSlot); |
448 | | |
449 | | /// Perform the dataflow calculation and calculate the lifetime for each of |
450 | | /// the slots, based on the BEGIN/END vectors. Set the LifetimeLIVE_IN and |
451 | | /// LifetimeLIVE_OUT maps that represent which stack slots are live coming |
452 | | /// in and out blocks. |
453 | | void calculateLocalLiveness(); |
454 | | |
455 | | /// Returns TRUE if we're using the first-use-begins-lifetime method for |
456 | | /// this slot (if FALSE, then the start marker is treated as start of lifetime). |
457 | 1.21M | bool applyFirstUse(int Slot) { |
458 | 1.21M | if (!LifetimeStartOnFirstUse || 1.21M ProtectFromEscapedAllocas1.21M ) |
459 | 110 | return false; |
460 | 1.21M | if (1.21M ConservativeSlots.test(Slot)1.21M ) |
461 | 396k | return false; |
462 | 822k | return true; |
463 | 822k | } |
464 | | |
465 | | /// Examines the specified instruction and returns TRUE if the instruction |
466 | | /// represents the start or end of an interesting lifetime. The slot or slots |
467 | | /// starting or ending are added to the vector "slots" and "isStart" is set |
468 | | /// accordingly. |
469 | | /// \returns True if inst contains a lifetime start or end |
470 | | bool isLifetimeStartOrEnd(const MachineInstr &MI, |
471 | | SmallVector<int, 4> &slots, |
472 | | bool &isStart); |
473 | | |
474 | | /// Construct the LiveIntervals for the slots. |
475 | | void calculateLiveIntervals(unsigned NumSlots); |
476 | | |
477 | | /// Go over the machine function and change instructions which use stack |
478 | | /// slots to use the joint slots. |
479 | | void remapInstructions(DenseMap<int, int> &SlotRemap); |
480 | | |
481 | | /// The input program may contain instructions which are not inside lifetime |
482 | | /// markers. This can happen due to a bug in the compiler or due to a bug in |
483 | | /// user code (for example, returning a reference to a local variable). |
484 | | /// This procedure checks all of the instructions in the function and |
485 | | /// invalidates lifetime ranges which do not contain all of the instructions |
486 | | /// which access that frame slot. |
487 | | void removeInvalidSlotRanges(); |
488 | | |
489 | | /// Map entries which point to other entries to their destination. |
490 | | /// A->B->C becomes A->C. |
491 | | void expungeSlotMap(DenseMap<int, int> &SlotRemap, unsigned NumSlots); |
492 | | |
493 | | /// Used in collectMarkers |
494 | | typedef DenseMap<const MachineBasicBlock*, BitVector> BlockBitVecMap; |
495 | | }; |
496 | | } // end anonymous namespace |
497 | | |
498 | | char StackColoring::ID = 0; |
499 | | char &llvm::StackColoringID = StackColoring::ID; |
500 | | |
501 | 36.7k | INITIALIZE_PASS_BEGIN36.7k (StackColoring, DEBUG_TYPE,
|
502 | 36.7k | "Merge disjoint stack slots", false, false) |
503 | 36.7k | INITIALIZE_PASS_DEPENDENCY(SlotIndexes) |
504 | 36.7k | INITIALIZE_PASS_DEPENDENCY(StackProtector) |
505 | 36.7k | INITIALIZE_PASS_END(StackColoring, DEBUG_TYPE, |
506 | | "Merge disjoint stack slots", false, false) |
507 | | |
508 | 32.0k | void StackColoring::getAnalysisUsage(AnalysisUsage &AU) const { |
509 | 32.0k | AU.addRequired<SlotIndexes>(); |
510 | 32.0k | AU.addRequired<StackProtector>(); |
511 | 32.0k | MachineFunctionPass::getAnalysisUsage(AU); |
512 | 32.0k | } |
513 | | |
514 | | #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) |
515 | | LLVM_DUMP_METHOD void StackColoring::dumpBV(const char *tag, |
516 | | const BitVector &BV) const { |
517 | | dbgs() << tag << " : { "; |
518 | | for (unsigned I = 0, E = BV.size(); I != E; ++I) |
519 | | dbgs() << BV.test(I) << " "; |
520 | | dbgs() << "}\n"; |
521 | | } |
522 | | |
523 | | LLVM_DUMP_METHOD void StackColoring::dumpBB(MachineBasicBlock *MBB) const { |
524 | | LivenessMap::const_iterator BI = BlockLiveness.find(MBB); |
525 | | assert(BI != BlockLiveness.end() && "Block not found"); |
526 | | const BlockLifetimeInfo &BlockInfo = BI->second; |
527 | | |
528 | | dumpBV("BEGIN", BlockInfo.Begin); |
529 | | dumpBV("END", BlockInfo.End); |
530 | | dumpBV("LIVE_IN", BlockInfo.LiveIn); |
531 | | dumpBV("LIVE_OUT", BlockInfo.LiveOut); |
532 | | } |
533 | | |
534 | | LLVM_DUMP_METHOD void StackColoring::dump() const { |
535 | | for (MachineBasicBlock *MBB : depth_first(MF)) { |
536 | | dbgs() << "Inspecting block #" << MBB->getNumber() << " [" |
537 | | << MBB->getName() << "]\n"; |
538 | | dumpBB(MBB); |
539 | | } |
540 | | } |
541 | | |
542 | | LLVM_DUMP_METHOD void StackColoring::dumpIntervals() const { |
543 | | for (unsigned I = 0, E = Intervals.size(); I != E; ++I) { |
544 | | dbgs() << "Interval[" << I << "]:\n"; |
545 | | Intervals[I]->dump(); |
546 | | } |
547 | | } |
548 | | #endif |
549 | | |
550 | | static inline int getStartOrEndSlot(const MachineInstr &MI) |
551 | 458k | { |
552 | 458k | assert((MI.getOpcode() == TargetOpcode::LIFETIME_START || |
553 | 458k | MI.getOpcode() == TargetOpcode::LIFETIME_END) && |
554 | 458k | "Expected LIFETIME_START or LIFETIME_END op"); |
555 | 458k | const MachineOperand &MO = MI.getOperand(0); |
556 | 458k | int Slot = MO.getIndex(); |
557 | 458k | if (Slot >= 0) |
558 | 458k | return Slot; |
559 | 2 | return -1; |
560 | 2 | } |
561 | | |
562 | | // |
563 | | // At the moment the only way to end a variable lifetime is with |
564 | | // a VARIABLE_LIFETIME op (which can't contain a start). If things |
565 | | // change and the IR allows for a single inst that both begins |
566 | | // and ends lifetime(s), this interface will need to be reworked. |
567 | | // |
568 | | bool StackColoring::isLifetimeStartOrEnd(const MachineInstr &MI, |
569 | | SmallVector<int, 4> &slots, |
570 | | bool &isStart) |
571 | 22.8M | { |
572 | 22.8M | if (MI.getOpcode() == TargetOpcode::LIFETIME_START || |
573 | 22.8M | MI.getOpcode() == TargetOpcode::LIFETIME_END22.7M ) { |
574 | 296k | int Slot = getStartOrEndSlot(MI); |
575 | 296k | if (Slot < 0) |
576 | 1 | return false; |
577 | 296k | if (296k !InterestingSlots.test(Slot)296k ) |
578 | 0 | return false; |
579 | 296k | slots.push_back(Slot); |
580 | 296k | if (MI.getOpcode() == TargetOpcode::LIFETIME_END296k ) { |
581 | 168k | isStart = false; |
582 | 168k | return true; |
583 | 168k | } |
584 | 127k | if (127k ! applyFirstUse(Slot)127k ) { |
585 | 37.2k | isStart = true; |
586 | 37.2k | return true; |
587 | 37.2k | } |
588 | 22.5M | } else if (22.5M LifetimeStartOnFirstUse && 22.5M !ProtectFromEscapedAllocas22.5M ) { |
589 | 22.5M | if (! MI.isDebugValue()22.5M ) { |
590 | 22.5M | bool found = false; |
591 | 70.9M | for (const MachineOperand &MO : MI.operands()) { |
592 | 70.9M | if (!MO.isFI()) |
593 | 69.8M | continue; |
594 | 1.13M | int Slot = MO.getIndex(); |
595 | 1.13M | if (Slot<0) |
596 | 2.73k | continue; |
597 | 1.13M | if (1.13M InterestingSlots.test(Slot) && 1.13M applyFirstUse(Slot)1.09M ) { |
598 | 732k | slots.push_back(Slot); |
599 | 732k | found = true; |
600 | 732k | } |
601 | 70.9M | } |
602 | 22.5M | if (found22.5M ) { |
603 | 732k | isStart = true; |
604 | 732k | return true; |
605 | 732k | } |
606 | 21.8M | } |
607 | 22.5M | } |
608 | 21.8M | return false; |
609 | 21.8M | } |
610 | | |
611 | | unsigned StackColoring::collectMarkers(unsigned NumSlot) |
612 | 50.0k | { |
613 | 50.0k | unsigned MarkersFound = 0; |
614 | 50.0k | BlockBitVecMap SeenStartMap; |
615 | 50.0k | InterestingSlots.clear(); |
616 | 50.0k | InterestingSlots.resize(NumSlot); |
617 | 50.0k | ConservativeSlots.clear(); |
618 | 50.0k | ConservativeSlots.resize(NumSlot); |
619 | 50.0k | |
620 | 50.0k | // number of start and end lifetime ops for each slot |
621 | 50.0k | SmallVector<int, 8> NumStartLifetimes(NumSlot, 0); |
622 | 50.0k | SmallVector<int, 8> NumEndLifetimes(NumSlot, 0); |
623 | 50.0k | |
624 | 50.0k | // Step 1: collect markers and populate the "InterestingSlots" |
625 | 50.0k | // and "ConservativeSlots" sets. |
626 | 1.22M | for (MachineBasicBlock *MBB : depth_first(MF)) { |
627 | 1.22M | |
628 | 1.22M | // Compute the set of slots for which we've seen a START marker but have |
629 | 1.22M | // not yet seen an END marker at this point in the walk (e.g. on entry |
630 | 1.22M | // to this bb). |
631 | 1.22M | BitVector BetweenStartEnd; |
632 | 1.22M | BetweenStartEnd.resize(NumSlot); |
633 | 1.22M | for (MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(), |
634 | 3.11M | PE = MBB->pred_end(); PI != PE3.11M ; ++PI1.88M ) { |
635 | 1.88M | BlockBitVecMap::const_iterator I = SeenStartMap.find(*PI); |
636 | 1.88M | if (I != SeenStartMap.end()1.88M ) { |
637 | 1.30M | BetweenStartEnd |= I->second; |
638 | 1.30M | } |
639 | 1.88M | } |
640 | 1.22M | |
641 | 1.22M | // Walk the instructions in the block to look for start/end ops. |
642 | 12.3M | for (MachineInstr &MI : *MBB) { |
643 | 12.3M | if (MI.getOpcode() == TargetOpcode::LIFETIME_START || |
644 | 12.3M | MI.getOpcode() == TargetOpcode::LIFETIME_END12.2M ) { |
645 | 162k | int Slot = getStartOrEndSlot(MI); |
646 | 162k | if (Slot < 0) |
647 | 1 | continue; |
648 | 162k | InterestingSlots.set(Slot); |
649 | 162k | if (MI.getOpcode() == TargetOpcode::LIFETIME_START162k ) { |
650 | 69.2k | BetweenStartEnd.set(Slot); |
651 | 69.2k | NumStartLifetimes[Slot] += 1; |
652 | 162k | } else { |
653 | 92.9k | BetweenStartEnd.reset(Slot); |
654 | 92.9k | NumEndLifetimes[Slot] += 1; |
655 | 92.9k | } |
656 | 162k | const AllocaInst *Allocation = MFI->getObjectAllocation(Slot); |
657 | 162k | if (Allocation162k ) { |
658 | 162k | DEBUG(dbgs() << "Found a lifetime "); |
659 | 162k | DEBUG(dbgs() << (MI.getOpcode() == TargetOpcode::LIFETIME_START |
660 | 162k | ? "start" |
661 | 162k | : "end")); |
662 | 162k | DEBUG(dbgs() << " marker for slot #" << Slot); |
663 | 162k | DEBUG(dbgs() << " with allocation: " << Allocation->getName() |
664 | 162k | << "\n"); |
665 | 162k | } |
666 | 162k | Markers.push_back(&MI); |
667 | 162k | MarkersFound += 1; |
668 | 12.3M | } else { |
669 | 38.7M | for (const MachineOperand &MO : MI.operands()) { |
670 | 38.7M | if (!MO.isFI()) |
671 | 38.0M | continue; |
672 | 637k | int Slot = MO.getIndex(); |
673 | 637k | if (Slot < 0) |
674 | 7.88k | continue; |
675 | 629k | if (629k ! BetweenStartEnd.test(Slot)629k ) { |
676 | 67.8k | ConservativeSlots.set(Slot); |
677 | 67.8k | } |
678 | 38.7M | } |
679 | 12.1M | } |
680 | 12.3M | } |
681 | 1.22M | BitVector &SeenStart = SeenStartMap[MBB]; |
682 | 1.22M | SeenStart |= BetweenStartEnd; |
683 | 1.22M | } |
684 | 50.0k | if (!MarkersFound50.0k ) { |
685 | 8.46k | return 0; |
686 | 8.46k | } |
687 | 41.6k | |
688 | 41.6k | // PR27903: slots with multiple start or end lifetime ops are not |
689 | 41.6k | // safe to enable for "lifetime-start-on-first-use". |
690 | 116k | for (unsigned slot = 0; 41.6k slot < NumSlot116k ; ++slot74.6k ) |
691 | 74.6k | if (74.6k NumStartLifetimes[slot] > 1 || 74.6k NumEndLifetimes[slot] > 174.0k ) |
692 | 20.4k | ConservativeSlots.set(slot); |
693 | 41.6k | DEBUG(dumpBV("Conservative slots", ConservativeSlots)); |
694 | 41.6k | |
695 | 41.6k | // Step 2: compute begin/end sets for each block |
696 | 41.6k | |
697 | 41.6k | // NOTE: We use a depth-first iteration to ensure that we obtain a |
698 | 41.6k | // deterministic numbering. |
699 | 1.19M | for (MachineBasicBlock *MBB : depth_first(MF)) { |
700 | 1.19M | |
701 | 1.19M | // Assign a serial number to this basic block. |
702 | 1.19M | BasicBlocks[MBB] = BasicBlockNumbering.size(); |
703 | 1.19M | BasicBlockNumbering.push_back(MBB); |
704 | 1.19M | |
705 | 1.19M | // Keep a reference to avoid repeated lookups. |
706 | 1.19M | BlockLifetimeInfo &BlockInfo = BlockLiveness[MBB]; |
707 | 1.19M | |
708 | 1.19M | BlockInfo.Begin.resize(NumSlot); |
709 | 1.19M | BlockInfo.End.resize(NumSlot); |
710 | 1.19M | |
711 | 1.19M | SmallVector<int, 4> slots; |
712 | 11.8M | for (MachineInstr &MI : *MBB) { |
713 | 11.8M | bool isStart = false; |
714 | 11.8M | slots.clear(); |
715 | 11.8M | if (isLifetimeStartOrEnd(MI, slots, isStart)11.8M ) { |
716 | 487k | if (!isStart487k ) { |
717 | 92.9k | assert(slots.size() == 1 && "unexpected: MI ends multiple slots"); |
718 | 92.9k | int Slot = slots[0]; |
719 | 92.9k | if (BlockInfo.Begin.test(Slot)92.9k ) { |
720 | 17.6k | BlockInfo.Begin.reset(Slot); |
721 | 17.6k | } |
722 | 92.9k | BlockInfo.End.set(Slot); |
723 | 487k | } else { |
724 | 394k | for (auto Slot : slots) { |
725 | 394k | DEBUG(dbgs() << "Found a use of slot #" << Slot); |
726 | 394k | DEBUG(dbgs() << " at BB#" << MBB->getNumber() << " index "); |
727 | 394k | DEBUG(Indexes->getInstructionIndex(MI).print(dbgs())); |
728 | 394k | const AllocaInst *Allocation = MFI->getObjectAllocation(Slot); |
729 | 394k | if (Allocation394k ) { |
730 | 394k | DEBUG(dbgs() << " with allocation: "<< Allocation->getName()); |
731 | 394k | } |
732 | 394k | DEBUG(dbgs() << "\n"); |
733 | 394k | if (BlockInfo.End.test(Slot)394k ) { |
734 | 302 | BlockInfo.End.reset(Slot); |
735 | 302 | } |
736 | 394k | BlockInfo.Begin.set(Slot); |
737 | 394k | } |
738 | 394k | } |
739 | 487k | } |
740 | 11.8M | } |
741 | 1.19M | } |
742 | 50.0k | |
743 | 50.0k | // Update statistics. |
744 | 50.0k | NumMarkerSeen += MarkersFound; |
745 | 50.0k | return MarkersFound; |
746 | 50.0k | } |
747 | | |
748 | | void StackColoring::calculateLocalLiveness() |
749 | 33.7k | { |
750 | 33.7k | unsigned NumIters = 0; |
751 | 33.7k | bool changed = true; |
752 | 99.9k | while (changed99.9k ) { |
753 | 66.2k | changed = false; |
754 | 66.2k | ++NumIters; |
755 | 66.2k | |
756 | 2.56M | for (const MachineBasicBlock *BB : BasicBlockNumbering) { |
757 | 2.56M | |
758 | 2.56M | // Use an iterator to avoid repeated lookups. |
759 | 2.56M | LivenessMap::iterator BI = BlockLiveness.find(BB); |
760 | 2.56M | assert(BI != BlockLiveness.end() && "Block not found"); |
761 | 2.56M | BlockLifetimeInfo &BlockInfo = BI->second; |
762 | 2.56M | |
763 | 2.56M | // Compute LiveIn by unioning together the LiveOut sets of all preds. |
764 | 2.56M | BitVector LocalLiveIn; |
765 | 2.56M | for (MachineBasicBlock::const_pred_iterator PI = BB->pred_begin(), |
766 | 6.57M | PE = BB->pred_end(); PI != PE6.57M ; ++PI4.01M ) { |
767 | 4.01M | LivenessMap::const_iterator I = BlockLiveness.find(*PI); |
768 | 4.01M | assert(I != BlockLiveness.end() && "Predecessor not found"); |
769 | 4.01M | LocalLiveIn |= I->second.LiveOut; |
770 | 4.01M | } |
771 | 2.56M | |
772 | 2.56M | // Compute LiveOut by subtracting out lifetimes that end in this |
773 | 2.56M | // block, then adding in lifetimes that begin in this block. If |
774 | 2.56M | // we have both BEGIN and END markers in the same basic block |
775 | 2.56M | // then we know that the BEGIN marker comes after the END, |
776 | 2.56M | // because we already handle the case where the BEGIN comes |
777 | 2.56M | // before the END when collecting the markers (and building the |
778 | 2.56M | // BEGIN/END vectors). |
779 | 2.56M | BitVector LocalLiveOut = LocalLiveIn; |
780 | 2.56M | LocalLiveOut.reset(BlockInfo.End); |
781 | 2.56M | LocalLiveOut |= BlockInfo.Begin; |
782 | 2.56M | |
783 | 2.56M | // Update block LiveIn set, noting whether it has changed. |
784 | 2.56M | if (LocalLiveIn.test(BlockInfo.LiveIn)2.56M ) { |
785 | 793k | changed = true; |
786 | 793k | BlockInfo.LiveIn |= LocalLiveIn; |
787 | 793k | } |
788 | 2.56M | |
789 | 2.56M | // Update block LiveOut set, noting whether it has changed. |
790 | 2.56M | if (LocalLiveOut.test(BlockInfo.LiveOut)2.56M ) { |
791 | 782k | changed = true; |
792 | 782k | BlockInfo.LiveOut |= LocalLiveOut; |
793 | 782k | } |
794 | 2.56M | } |
795 | 66.2k | }// while changed. |
796 | 33.7k | |
797 | 33.7k | NumIterations = NumIters; |
798 | 33.7k | } |
799 | | |
800 | 33.7k | void StackColoring::calculateLiveIntervals(unsigned NumSlots) { |
801 | 33.7k | SmallVector<SlotIndex, 16> Starts; |
802 | 33.7k | SmallVector<bool, 16> DefinitelyInUse; |
803 | 33.7k | |
804 | 33.7k | // For each block, find which slots are active within this block |
805 | 33.7k | // and update the live intervals. |
806 | 1.10M | for (const MachineBasicBlock &MBB : *MF) { |
807 | 1.10M | Starts.clear(); |
808 | 1.10M | Starts.resize(NumSlots); |
809 | 1.10M | DefinitelyInUse.clear(); |
810 | 1.10M | DefinitelyInUse.resize(NumSlots); |
811 | 1.10M | |
812 | 1.10M | // Start the interval of the slots that we previously found to be 'in-use'. |
813 | 1.10M | BlockLifetimeInfo &MBBLiveness = BlockLiveness[&MBB]; |
814 | 2.35M | for (int pos = MBBLiveness.LiveIn.find_first(); pos != -1; |
815 | 1.24M | pos = MBBLiveness.LiveIn.find_next(pos)1.24M ) { |
816 | 1.24M | Starts[pos] = Indexes->getMBBStartIdx(&MBB); |
817 | 1.24M | } |
818 | 1.10M | |
819 | 1.10M | // Create the interval for the basic blocks containing lifetime begin/end. |
820 | 11.0M | for (const MachineInstr &MI : MBB) { |
821 | 11.0M | |
822 | 11.0M | SmallVector<int, 4> slots; |
823 | 11.0M | bool IsStart = false; |
824 | 11.0M | if (!isLifetimeStartOrEnd(MI, slots, IsStart)) |
825 | 10.5M | continue; |
826 | 450k | SlotIndex ThisIndex = Indexes->getInstructionIndex(MI); |
827 | 450k | for (auto Slot : slots) { |
828 | 450k | if (IsStart450k ) { |
829 | 375k | // If a slot is already definitely in use, we don't have to emit |
830 | 375k | // a new start marker because there is already a pre-existing |
831 | 375k | // one. |
832 | 375k | if (!DefinitelyInUse[Slot]375k ) { |
833 | 263k | LiveStarts[Slot].push_back(ThisIndex); |
834 | 263k | DefinitelyInUse[Slot] = true; |
835 | 263k | } |
836 | 375k | if (!Starts[Slot].isValid()) |
837 | 53.7k | Starts[Slot] = ThisIndex; |
838 | 450k | } else { |
839 | 75.8k | if (Starts[Slot].isValid()75.8k ) { |
840 | 75.7k | VNInfo *VNI = Intervals[Slot]->getValNumInfo(0); |
841 | 75.7k | Intervals[Slot]->addSegment( |
842 | 75.7k | LiveInterval::Segment(Starts[Slot], ThisIndex, VNI)); |
843 | 75.7k | Starts[Slot] = SlotIndex(); // Invalidate the start index |
844 | 75.7k | DefinitelyInUse[Slot] = false; |
845 | 75.7k | } |
846 | 75.8k | } |
847 | 450k | } |
848 | 11.0M | } |
849 | 1.10M | |
850 | 1.10M | // Finish up started segments |
851 | 4.89M | for (unsigned i = 0; i < NumSlots4.89M ; ++i3.78M ) { |
852 | 3.78M | if (!Starts[i].isValid()) |
853 | 2.55M | continue; |
854 | 1.22M | |
855 | 1.22M | SlotIndex EndIdx = Indexes->getMBBEndIdx(&MBB); |
856 | 1.22M | VNInfo *VNI = Intervals[i]->getValNumInfo(0); |
857 | 1.22M | Intervals[i]->addSegment(LiveInterval::Segment(Starts[i], EndIdx, VNI)); |
858 | 1.22M | } |
859 | 1.10M | } |
860 | 33.7k | } |
861 | | |
862 | 50.0k | bool StackColoring::removeAllMarkers() { |
863 | 50.0k | unsigned Count = 0; |
864 | 162k | for (MachineInstr *MI : Markers) { |
865 | 162k | MI->eraseFromParent(); |
866 | 162k | Count++; |
867 | 162k | } |
868 | 50.0k | Markers.clear(); |
869 | 50.0k | |
870 | 50.0k | DEBUG(dbgs()<<"Removed "<<Count<<" markers.\n"); |
871 | 50.0k | return Count; |
872 | 50.0k | } |
873 | | |
874 | 33.7k | void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) { |
875 | 33.7k | unsigned FixedInstr = 0; |
876 | 33.7k | unsigned FixedMemOp = 0; |
877 | 33.7k | unsigned FixedDbg = 0; |
878 | 33.7k | |
879 | 33.7k | // Remap debug information that refers to stack slots. |
880 | 5 | for (auto &VI : MF->getVariableDbgInfo()) { |
881 | 5 | if (!VI.Var) |
882 | 0 | continue; |
883 | 5 | if (5 SlotRemap.count(VI.Slot)5 ) { |
884 | 0 | DEBUG(dbgs() << "Remapping debug info for [" |
885 | 0 | << cast<DILocalVariable>(VI.Var)->getName() << "].\n"); |
886 | 0 | VI.Slot = SlotRemap[VI.Slot]; |
887 | 0 | FixedDbg++; |
888 | 0 | } |
889 | 5 | } |
890 | 33.7k | |
891 | 33.7k | // Keep a list of *allocas* which need to be remapped. |
892 | 33.7k | DenseMap<const AllocaInst*, const AllocaInst*> Allocas; |
893 | 33.7k | |
894 | 33.7k | // Keep a list of allocas which has been affected by the remap. |
895 | 33.7k | SmallPtrSet<const AllocaInst*, 32> MergedAllocas; |
896 | 33.7k | |
897 | 8.36k | for (const std::pair<int, int> &SI : SlotRemap) { |
898 | 8.36k | const AllocaInst *From = MFI->getObjectAllocation(SI.first); |
899 | 8.36k | const AllocaInst *To = MFI->getObjectAllocation(SI.second); |
900 | 8.36k | assert(To && From && "Invalid allocation object"); |
901 | 8.36k | Allocas[From] = To; |
902 | 8.36k | |
903 | 8.36k | // AA might be used later for instruction scheduling, and we need it to be |
904 | 8.36k | // able to deduce the correct aliasing releationships between pointers |
905 | 8.36k | // derived from the alloca being remapped and the target of that remapping. |
906 | 8.36k | // The only safe way, without directly informing AA about the remapping |
907 | 8.36k | // somehow, is to directly update the IR to reflect the change being made |
908 | 8.36k | // here. |
909 | 8.36k | Instruction *Inst = const_cast<AllocaInst *>(To); |
910 | 8.36k | if (From->getType() != To->getType()8.36k ) { |
911 | 7.07k | BitCastInst *Cast = new BitCastInst(Inst, From->getType()); |
912 | 7.07k | Cast->insertAfter(Inst); |
913 | 7.07k | Inst = Cast; |
914 | 7.07k | } |
915 | 8.36k | |
916 | 8.36k | // We keep both slots to maintain AliasAnalysis metadata later. |
917 | 8.36k | MergedAllocas.insert(From); |
918 | 8.36k | MergedAllocas.insert(To); |
919 | 8.36k | |
920 | 8.36k | // Allow the stack protector to adjust its value map to account for the |
921 | 8.36k | // upcoming replacement. |
922 | 8.36k | SP->adjustForColoring(From, To); |
923 | 8.36k | |
924 | 8.36k | // The new alloca might not be valid in a llvm.dbg.declare for this |
925 | 8.36k | // variable, so undef out the use to make the verifier happy. |
926 | 8.36k | AllocaInst *FromAI = const_cast<AllocaInst *>(From); |
927 | 8.36k | if (FromAI->isUsedByMetadata()) |
928 | 0 | ValueAsMetadata::handleRAUW(FromAI, UndefValue::get(FromAI->getType())); |
929 | 231k | for (auto &Use : FromAI->uses()) { |
930 | 231k | if (BitCastInst *BCI = dyn_cast<BitCastInst>(Use.get())) |
931 | 0 | if (0 BCI->isUsedByMetadata()0 ) |
932 | 0 | ValueAsMetadata::handleRAUW(BCI, UndefValue::get(BCI->getType())); |
933 | 231k | } |
934 | 8.36k | |
935 | 8.36k | // Note that this will not replace uses in MMOs (which we'll update below), |
936 | 8.36k | // or anywhere else (which is why we won't delete the original |
937 | 8.36k | // instruction). |
938 | 8.36k | FromAI->replaceAllUsesWith(Inst); |
939 | 8.36k | } |
940 | 33.7k | |
941 | 33.7k | // Remap all instructions to the new stack slots. |
942 | 33.7k | for (MachineBasicBlock &BB : *MF) |
943 | 1.10M | for (MachineInstr &I : BB) 1.10M { |
944 | 11.0M | // Skip lifetime markers. We'll remove them soon. |
945 | 11.0M | if (I.getOpcode() == TargetOpcode::LIFETIME_START || |
946 | 10.9M | I.getOpcode() == TargetOpcode::LIFETIME_END) |
947 | 133k | continue; |
948 | 10.8M | |
949 | 10.8M | // Update the MachineMemOperand to use the new alloca. |
950 | 10.8M | for (MachineMemOperand *MMO : I.memoperands()) 10.8M { |
951 | 1.31M | // We've replaced IR-level uses of the remapped allocas, so we only |
952 | 1.31M | // need to replace direct uses here. |
953 | 1.31M | const AllocaInst *AI = dyn_cast_or_null<AllocaInst>(MMO->getValue()); |
954 | 1.31M | if (!AI) |
955 | 1.27M | continue; |
956 | 41.8k | |
957 | 41.8k | if (41.8k !Allocas.count(AI)41.8k ) |
958 | 39.5k | continue; |
959 | 2.28k | |
960 | 2.28k | MMO->setValue(Allocas[AI]); |
961 | 2.28k | FixedMemOp++; |
962 | 2.28k | } |
963 | 10.8M | |
964 | 10.8M | // Update all of the machine instruction operands. |
965 | 34.2M | for (MachineOperand &MO : I.operands()) { |
966 | 34.2M | if (!MO.isFI()) |
967 | 33.6M | continue; |
968 | 553k | int FromSlot = MO.getIndex(); |
969 | 553k | |
970 | 553k | // Don't touch arguments. |
971 | 553k | if (FromSlot<0) |
972 | 1.00k | continue; |
973 | 552k | |
974 | 552k | // Only look at mapped slots. |
975 | 552k | if (552k !SlotRemap.count(FromSlot)552k ) |
976 | 413k | continue; |
977 | 139k | |
978 | 139k | // In a debug build, check that the instruction that we are modifying is |
979 | 139k | // inside the expected live range. If the instruction is not inside |
980 | 139k | // the calculated range then it means that the alloca usage moved |
981 | 139k | // outside of the lifetime markers, or that the user has a bug. |
982 | 139k | // NOTE: Alloca address calculations which happen outside the lifetime |
983 | 139k | // zone are are okay, despite the fact that we don't have a good way |
984 | 139k | // for validating all of the usages of the calculation. |
985 | | #ifndef NDEBUG |
986 | | bool TouchesMemory = I.mayLoad() || I.mayStore(); |
987 | | // If we *don't* protect the user from escaped allocas, don't bother |
988 | | // validating the instructions. |
989 | | if (!I.isDebugValue() && TouchesMemory && ProtectFromEscapedAllocas) { |
990 | | SlotIndex Index = Indexes->getInstructionIndex(I); |
991 | | const LiveInterval *Interval = &*Intervals[FromSlot]; |
992 | | assert(Interval->find(Index) != Interval->end() && |
993 | | "Found instruction usage outside of live range."); |
994 | | } |
995 | | #endif |
996 | | |
997 | 139k | // Fix the machine instructions. |
998 | 139k | int ToSlot = SlotRemap[FromSlot]; |
999 | 139k | MO.setIndex(ToSlot); |
1000 | 139k | FixedInstr++; |
1001 | 139k | } |
1002 | 10.8M | |
1003 | 10.8M | // We adjust AliasAnalysis information for merged stack slots. |
1004 | 10.8M | MachineSDNode::mmo_iterator NewMemOps = |
1005 | 10.8M | MF->allocateMemRefsArray(I.getNumMemOperands()); |
1006 | 10.8M | unsigned MemOpIdx = 0; |
1007 | 10.8M | bool ReplaceMemOps = false; |
1008 | 1.31M | for (MachineMemOperand *MMO : I.memoperands()) { |
1009 | 1.31M | // If this memory location can be a slot remapped here, |
1010 | 1.31M | // we remove AA information. |
1011 | 1.31M | bool MayHaveConflictingAAMD = false; |
1012 | 1.31M | if (MMO->getAAInfo()1.31M ) { |
1013 | 1.05M | if (const Value *MMOV1.05M = MMO->getValue()) { |
1014 | 1.05M | SmallVector<Value *, 4> Objs; |
1015 | 1.05M | getUnderlyingObjectsForCodeGen(MMOV, Objs, MF->getDataLayout()); |
1016 | 1.05M | |
1017 | 1.05M | if (Objs.empty()) |
1018 | 561k | MayHaveConflictingAAMD = true; |
1019 | 1.05M | else |
1020 | 489k | for (Value *V : Objs) 489k { |
1021 | 492k | // If this memory location comes from a known stack slot |
1022 | 492k | // that is not remapped, we continue checking. |
1023 | 492k | // Otherwise, we need to invalidate AA infomation. |
1024 | 492k | const AllocaInst *AI = dyn_cast_or_null<AllocaInst>(V); |
1025 | 492k | if (AI && 492k MergedAllocas.count(AI)346k ) { |
1026 | 89.2k | MayHaveConflictingAAMD = true; |
1027 | 89.2k | break; |
1028 | 89.2k | } |
1029 | 1.31M | } |
1030 | 1.05M | } |
1031 | 1.05M | } |
1032 | 1.31M | if (MayHaveConflictingAAMD1.31M ) { |
1033 | 650k | NewMemOps[MemOpIdx++] = MF->getMachineMemOperand(MMO, AAMDNodes()); |
1034 | 650k | ReplaceMemOps = true; |
1035 | 650k | } |
1036 | 1.31M | else |
1037 | 668k | NewMemOps[MemOpIdx++] = MMO; |
1038 | 1.31M | } |
1039 | 10.8M | |
1040 | 10.8M | // If any memory operand is updated, set memory references of |
1041 | 10.8M | // this instruction. |
1042 | 10.8M | if (ReplaceMemOps) |
1043 | 650k | I.setMemRefs(std::make_pair(NewMemOps, I.getNumMemOperands())); |
1044 | 1.10M | } |
1045 | 33.7k | |
1046 | 33.7k | // Update the location of C++ catch objects for the MSVC personality routine. |
1047 | 33.7k | if (WinEHFuncInfo *EHInfo = MF->getWinEHFuncInfo()) |
1048 | 1 | for (WinEHTryBlockMapEntry &TBME : EHInfo->TryBlockMap) |
1049 | 1 | for (WinEHHandlerType &H : TBME.HandlerArray) |
1050 | 1 | if (1 H.CatchObj.FrameIndex != INT_MAX && |
1051 | 0 | SlotRemap.count(H.CatchObj.FrameIndex)) |
1052 | 0 | H.CatchObj.FrameIndex = SlotRemap[H.CatchObj.FrameIndex]; |
1053 | 33.7k | |
1054 | 33.7k | DEBUG(dbgs()<<"Fixed "<<FixedMemOp<<" machine memory operands.\n"); |
1055 | 33.7k | DEBUG(dbgs()<<"Fixed "<<FixedDbg<<" debug locations.\n"); |
1056 | 33.7k | DEBUG(dbgs()<<"Fixed "<<FixedInstr<<" machine instructions.\n"); |
1057 | 33.7k | } |
1058 | | |
1059 | 0 | void StackColoring::removeInvalidSlotRanges() { |
1060 | 0 | for (MachineBasicBlock &BB : *MF) |
1061 | 0 | for (MachineInstr &I : BB) 0 { |
1062 | 0 | if (I.getOpcode() == TargetOpcode::LIFETIME_START || |
1063 | 0 | I.getOpcode() == TargetOpcode::LIFETIME_END0 || I.isDebugValue()0 ) |
1064 | 0 | continue; |
1065 | 0 |
|
1066 | 0 | // Some intervals are suspicious! In some cases we find address |
1067 | 0 | // calculations outside of the lifetime zone, but not actual memory |
1068 | 0 | // read or write. Memory accesses outside of the lifetime zone are a clear |
1069 | 0 | // violation, but address calculations are okay. This can happen when |
1070 | 0 | // GEPs are hoisted outside of the lifetime zone. |
1071 | 0 | // So, in here we only check instructions which can read or write memory. |
1072 | 0 | if (0 !I.mayLoad() && 0 !I.mayStore()0 ) |
1073 | 0 | continue; |
1074 | 0 |
|
1075 | 0 | // Check all of the machine operands. |
1076 | 0 | for (const MachineOperand &MO : I.operands()) 0 { |
1077 | 0 | if (!MO.isFI()) |
1078 | 0 | continue; |
1079 | 0 |
|
1080 | 0 | int Slot = MO.getIndex(); |
1081 | 0 |
|
1082 | 0 | if (Slot<0) |
1083 | 0 | continue; |
1084 | 0 |
|
1085 | 0 | if (0 Intervals[Slot]->empty()0 ) |
1086 | 0 | continue; |
1087 | 0 |
|
1088 | 0 | // Check that the used slot is inside the calculated lifetime range. |
1089 | 0 | // If it is not, warn about it and invalidate the range. |
1090 | 0 | LiveInterval *Interval = &*Intervals[Slot]; |
1091 | 0 | SlotIndex Index = Indexes->getInstructionIndex(I); |
1092 | 0 | if (Interval->find(Index) == Interval->end()0 ) { |
1093 | 0 | Interval->clear(); |
1094 | 0 | DEBUG(dbgs()<<"Invalidating range #"<<Slot<<"\n"); |
1095 | 0 | EscapedAllocas++; |
1096 | 0 | } |
1097 | 0 | } |
1098 | 0 | } |
1099 | 0 | } |
1100 | | |
1101 | | void StackColoring::expungeSlotMap(DenseMap<int, int> &SlotRemap, |
1102 | 33.7k | unsigned NumSlots) { |
1103 | 33.7k | // Expunge slot remap map. |
1104 | 97.2k | for (unsigned i=0; i < NumSlots97.2k ; ++i63.5k ) { |
1105 | 63.5k | // If we are remapping i |
1106 | 63.5k | if (SlotRemap.count(i)63.5k ) { |
1107 | 8.36k | int Target = SlotRemap[i]; |
1108 | 8.36k | // As long as our target is mapped to something else, follow it. |
1109 | 8.36k | while (SlotRemap.count(Target)8.36k ) { |
1110 | 0 | Target = SlotRemap[Target]; |
1111 | 0 | SlotRemap[i] = Target; |
1112 | 0 | } |
1113 | 8.36k | } |
1114 | 63.5k | } |
1115 | 33.7k | } |
1116 | | |
1117 | 588k | bool StackColoring::runOnMachineFunction(MachineFunction &Func) { |
1118 | 588k | DEBUG(dbgs() << "********** Stack Coloring **********\n" |
1119 | 588k | << "********** Function: " |
1120 | 588k | << ((const Value*)Func.getFunction())->getName() << '\n'); |
1121 | 588k | MF = &Func; |
1122 | 588k | MFI = &MF->getFrameInfo(); |
1123 | 588k | Indexes = &getAnalysis<SlotIndexes>(); |
1124 | 588k | SP = &getAnalysis<StackProtector>(); |
1125 | 588k | BlockLiveness.clear(); |
1126 | 588k | BasicBlocks.clear(); |
1127 | 588k | BasicBlockNumbering.clear(); |
1128 | 588k | Markers.clear(); |
1129 | 588k | Intervals.clear(); |
1130 | 588k | LiveStarts.clear(); |
1131 | 588k | VNInfoAllocator.Reset(); |
1132 | 588k | |
1133 | 588k | unsigned NumSlots = MFI->getObjectIndexEnd(); |
1134 | 588k | |
1135 | 588k | // If there are no stack slots then there are no markers to remove. |
1136 | 588k | if (!NumSlots) |
1137 | 538k | return false; |
1138 | 50.0k | |
1139 | 50.0k | SmallVector<int, 8> SortedSlots; |
1140 | 50.0k | SortedSlots.reserve(NumSlots); |
1141 | 50.0k | Intervals.reserve(NumSlots); |
1142 | 50.0k | LiveStarts.resize(NumSlots); |
1143 | 50.0k | |
1144 | 50.0k | unsigned NumMarkers = collectMarkers(NumSlots); |
1145 | 50.0k | |
1146 | 50.0k | unsigned TotalSize = 0; |
1147 | 50.0k | DEBUG(dbgs()<<"Found "<<NumMarkers<<" markers and "<<NumSlots<<" slots\n"); |
1148 | 50.0k | DEBUG(dbgs()<<"Slot structure:\n"); |
1149 | 50.0k | |
1150 | 142k | for (int i=0; i < MFI->getObjectIndexEnd()142k ; ++i92.1k ) { |
1151 | 92.1k | DEBUG(dbgs()<<"Slot #"<<i<<" - "<<MFI->getObjectSize(i)<<" bytes.\n"); |
1152 | 92.1k | TotalSize += MFI->getObjectSize(i); |
1153 | 92.1k | } |
1154 | 50.0k | |
1155 | 50.0k | DEBUG(dbgs()<<"Total Stack size: "<<TotalSize<<" bytes\n\n"); |
1156 | 50.0k | |
1157 | 50.0k | // Don't continue because there are not enough lifetime markers, or the |
1158 | 50.0k | // stack is too small, or we are told not to optimize the slots. |
1159 | 50.0k | if (NumMarkers < 2 || 50.0k TotalSize < 1641.5k || DisableColoring33.7k || |
1160 | 50.0k | skipFunction(*Func.getFunction())33.7k ) { |
1161 | 16.3k | DEBUG(dbgs()<<"Will not try to merge slots.\n"); |
1162 | 16.3k | return removeAllMarkers(); |
1163 | 16.3k | } |
1164 | 33.7k | |
1165 | 97.2k | for (unsigned i=0; 33.7k i < NumSlots97.2k ; ++i63.5k ) { |
1166 | 63.5k | std::unique_ptr<LiveInterval> LI(new LiveInterval(i, 0)); |
1167 | 63.5k | LI->getNextValue(Indexes->getZeroIndex(), VNInfoAllocator); |
1168 | 63.5k | Intervals.push_back(std::move(LI)); |
1169 | 63.5k | SortedSlots.push_back(i); |
1170 | 63.5k | } |
1171 | 33.7k | |
1172 | 33.7k | // Calculate the liveness of each block. |
1173 | 33.7k | calculateLocalLiveness(); |
1174 | 33.7k | DEBUG(dbgs() << "Dataflow iterations: " << NumIterations << "\n"); |
1175 | 33.7k | DEBUG(dump()); |
1176 | 33.7k | |
1177 | 33.7k | // Propagate the liveness information. |
1178 | 33.7k | calculateLiveIntervals(NumSlots); |
1179 | 33.7k | DEBUG(dumpIntervals()); |
1180 | 33.7k | |
1181 | 33.7k | // Search for allocas which are used outside of the declared lifetime |
1182 | 33.7k | // markers. |
1183 | 33.7k | if (ProtectFromEscapedAllocas) |
1184 | 0 | removeInvalidSlotRanges(); |
1185 | 33.7k | |
1186 | 33.7k | // Maps old slots to new slots. |
1187 | 33.7k | DenseMap<int, int> SlotRemap; |
1188 | 33.7k | unsigned RemovedSlots = 0; |
1189 | 33.7k | unsigned ReducedSize = 0; |
1190 | 33.7k | |
1191 | 33.7k | // Do not bother looking at empty intervals. |
1192 | 97.2k | for (unsigned I = 0; I < NumSlots97.2k ; ++I63.5k ) { |
1193 | 63.5k | if (Intervals[SortedSlots[I]]->empty()) |
1194 | 6.34k | SortedSlots[I] = -1; |
1195 | 63.5k | } |
1196 | 33.7k | |
1197 | 33.7k | // This is a simple greedy algorithm for merging allocas. First, sort the |
1198 | 33.7k | // slots, placing the largest slots first. Next, perform an n^2 scan and look |
1199 | 33.7k | // for disjoint slots. When you find disjoint slots, merge the samller one |
1200 | 33.7k | // into the bigger one and update the live interval. Remove the small alloca |
1201 | 33.7k | // and continue. |
1202 | 33.7k | |
1203 | 33.7k | // Sort the slots according to their size. Place unused slots at the end. |
1204 | 33.7k | // Use stable sort to guarantee deterministic code generation. |
1205 | 33.7k | std::stable_sort(SortedSlots.begin(), SortedSlots.end(), |
1206 | 63.6k | [this](int LHS, int RHS) { |
1207 | 63.6k | // We use -1 to denote a uninteresting slot. Place these slots at the end. |
1208 | 63.6k | if (LHS == -163.6k ) return false505 ; |
1209 | 63.1k | if (63.1k RHS == -163.1k ) return true13.9k ; |
1210 | 49.2k | // Sort according to size. |
1211 | 49.2k | return MFI->getObjectSize(LHS) > MFI->getObjectSize(RHS); |
1212 | 49.2k | }); |
1213 | 33.7k | |
1214 | 33.7k | for (auto &s : LiveStarts) |
1215 | 63.5k | std::sort(s.begin(), s.end()); |
1216 | 33.7k | |
1217 | 33.7k | bool Changed = true; |
1218 | 71.0k | while (Changed71.0k ) { |
1219 | 37.3k | Changed = false; |
1220 | 120k | for (unsigned I = 0; I < NumSlots120k ; ++I83.2k ) { |
1221 | 83.2k | if (SortedSlots[I] == -1) |
1222 | 23.4k | continue; |
1223 | 59.7k | |
1224 | 146k | for (unsigned J=I+1; 59.7k J < NumSlots146k ; ++J87.0k ) { |
1225 | 87.0k | if (SortedSlots[J] == -1) |
1226 | 31.8k | continue; |
1227 | 55.1k | |
1228 | 55.1k | int FirstSlot = SortedSlots[I]; |
1229 | 55.1k | int SecondSlot = SortedSlots[J]; |
1230 | 55.1k | LiveInterval *First = &*Intervals[FirstSlot]; |
1231 | 55.1k | LiveInterval *Second = &*Intervals[SecondSlot]; |
1232 | 55.1k | auto &FirstS = LiveStarts[FirstSlot]; |
1233 | 55.1k | auto &SecondS = LiveStarts[SecondSlot]; |
1234 | 55.1k | assert (!First->empty() && !Second->empty() && "Found an empty range"); |
1235 | 55.1k | |
1236 | 55.1k | // Merge disjoint slots. This is a little bit tricky - see the |
1237 | 55.1k | // Implementation Notes section for an explanation. |
1238 | 55.1k | if (!First->isLiveAtIndexes(SecondS) && |
1239 | 55.1k | !Second->isLiveAtIndexes(FirstS)16.2k ) { |
1240 | 8.36k | Changed = true; |
1241 | 8.36k | First->MergeSegmentsInAsValue(*Second, First->getValNumInfo(0)); |
1242 | 8.36k | |
1243 | 8.36k | int OldSize = FirstS.size(); |
1244 | 8.36k | FirstS.append(SecondS.begin(), SecondS.end()); |
1245 | 8.36k | auto Mid = FirstS.begin() + OldSize; |
1246 | 8.36k | std::inplace_merge(FirstS.begin(), Mid, FirstS.end()); |
1247 | 8.36k | |
1248 | 8.36k | SlotRemap[SecondSlot] = FirstSlot; |
1249 | 8.36k | SortedSlots[J] = -1; |
1250 | 8.36k | DEBUG(dbgs()<<"Merging #"<<FirstSlot<<" and slots #"<< |
1251 | 8.36k | SecondSlot<<" together.\n"); |
1252 | 8.36k | unsigned MaxAlignment = std::max(MFI->getObjectAlignment(FirstSlot), |
1253 | 8.36k | MFI->getObjectAlignment(SecondSlot)); |
1254 | 8.36k | |
1255 | 8.36k | assert(MFI->getObjectSize(FirstSlot) >= |
1256 | 8.36k | MFI->getObjectSize(SecondSlot) && |
1257 | 8.36k | "Merging a small object into a larger one"); |
1258 | 8.36k | |
1259 | 8.36k | RemovedSlots+=1; |
1260 | 8.36k | ReducedSize += MFI->getObjectSize(SecondSlot); |
1261 | 8.36k | MFI->setObjectAlignment(FirstSlot, MaxAlignment); |
1262 | 8.36k | MFI->RemoveStackObject(SecondSlot); |
1263 | 8.36k | } |
1264 | 87.0k | } |
1265 | 83.2k | } |
1266 | 37.3k | }// While changed. |
1267 | 33.7k | |
1268 | 33.7k | // Record statistics. |
1269 | 33.7k | StackSpaceSaved += ReducedSize; |
1270 | 33.7k | StackSlotMerged += RemovedSlots; |
1271 | 33.7k | DEBUG(dbgs()<<"Merge "<<RemovedSlots<<" slots. Saved "<< |
1272 | 588k | ReducedSize<<" bytes\n"); |
1273 | 588k | |
1274 | 588k | // Scan the entire function and update all machine operands that use frame |
1275 | 588k | // indices to use the remapped frame index. |
1276 | 588k | expungeSlotMap(SlotRemap, NumSlots); |
1277 | 588k | remapInstructions(SlotRemap); |
1278 | 588k | |
1279 | 588k | return removeAllMarkers(); |
1280 | 588k | } |