/Users/buildslave/jenkins/workspace/coverage/llvm-project/clang/lib/Tooling/Transformer/SourceCode.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | //===--- SourceCode.cpp - Source code manipulation routines -----*- C++ -*-===// |
2 | | // |
3 | | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | | // See https://llvm.org/LICENSE.txt for license information. |
5 | | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | | // |
7 | | //===----------------------------------------------------------------------===// |
8 | | // |
9 | | // This file provides functions that simplify extraction of source code. |
10 | | // |
11 | | //===----------------------------------------------------------------------===// |
12 | | #include "clang/Tooling/Transformer/SourceCode.h" |
13 | | #include "clang/AST/ASTContext.h" |
14 | | #include "clang/AST/Attr.h" |
15 | | #include "clang/AST/Comment.h" |
16 | | #include "clang/AST/Decl.h" |
17 | | #include "clang/AST/DeclCXX.h" |
18 | | #include "clang/AST/DeclTemplate.h" |
19 | | #include "clang/AST/Expr.h" |
20 | | #include "clang/Basic/SourceManager.h" |
21 | | #include "clang/Lex/Lexer.h" |
22 | | #include "llvm/Support/Errc.h" |
23 | | #include "llvm/Support/Error.h" |
24 | | #include <set> |
25 | | |
26 | | using namespace clang; |
27 | | |
28 | | using llvm::errc; |
29 | | using llvm::StringError; |
30 | | |
31 | | StringRef clang::tooling::getText(CharSourceRange Range, |
32 | 200 | const ASTContext &Context) { |
33 | 200 | return Lexer::getSourceText(Range, Context.getSourceManager(), |
34 | 200 | Context.getLangOpts()); |
35 | 200 | } |
36 | | |
37 | | CharSourceRange clang::tooling::maybeExtendRange(CharSourceRange Range, |
38 | | tok::TokenKind Next, |
39 | 83 | ASTContext &Context) { |
40 | 83 | CharSourceRange R = Lexer::getAsCharRange(Range, Context.getSourceManager(), |
41 | 83 | Context.getLangOpts()); |
42 | 83 | if (R.isInvalid()) |
43 | 1 | return Range; |
44 | 82 | Token Tok; |
45 | 82 | bool Err = |
46 | 82 | Lexer::getRawToken(R.getEnd(), Tok, Context.getSourceManager(), |
47 | 82 | Context.getLangOpts(), /*IgnoreWhiteSpace=*/true); |
48 | 82 | if (Err || !Tok.is(Next)) |
49 | 24 | return Range; |
50 | 58 | return CharSourceRange::getTokenRange(Range.getBegin(), Tok.getLocation()); |
51 | 82 | } |
52 | | |
53 | | static llvm::Error validateRange(const CharSourceRange &Range, |
54 | | const SourceManager &SM, |
55 | 192 | bool AllowSystemHeaders) { |
56 | 192 | if (Range.isInvalid()) |
57 | 10 | return llvm::make_error<StringError>(errc::invalid_argument, |
58 | 10 | "Invalid range"); |
59 | | |
60 | 182 | if (Range.getBegin().isMacroID() || Range.getEnd().isMacroID()178 ) |
61 | 4 | return llvm::make_error<StringError>( |
62 | 4 | errc::invalid_argument, "Range starts or ends in a macro expansion"); |
63 | | |
64 | 178 | if (!AllowSystemHeaders) { |
65 | 178 | if (SM.isInSystemHeader(Range.getBegin()) || |
66 | 178 | SM.isInSystemHeader(Range.getEnd())) |
67 | 0 | return llvm::make_error<StringError>(errc::invalid_argument, |
68 | 0 | "Range is in system header"); |
69 | 178 | } |
70 | | |
71 | 178 | std::pair<FileID, unsigned> BeginInfo = SM.getDecomposedLoc(Range.getBegin()); |
72 | 178 | std::pair<FileID, unsigned> EndInfo = SM.getDecomposedLoc(Range.getEnd()); |
73 | 178 | if (BeginInfo.first != EndInfo.first) |
74 | 0 | return llvm::make_error<StringError>( |
75 | 0 | errc::invalid_argument, "Range begins and ends in different files"); |
76 | | |
77 | 178 | if (BeginInfo.second > EndInfo.second) |
78 | 1 | return llvm::make_error<StringError>(errc::invalid_argument, |
79 | 1 | "Range's begin is past its end"); |
80 | | |
81 | 177 | return llvm::Error::success(); |
82 | 178 | } |
83 | | |
84 | | llvm::Error clang::tooling::validateEditRange(const CharSourceRange &Range, |
85 | 192 | const SourceManager &SM) { |
86 | 192 | return validateRange(Range, SM, /*AllowSystemHeaders=*/false); |
87 | 192 | } |
88 | | |
89 | | static bool spelledInMacroDefinition(SourceLocation Loc, |
90 | 11 | const SourceManager &SM) { |
91 | 16 | while (Loc.isMacroID()) { |
92 | 9 | const auto &Expansion = SM.getSLocEntry(SM.getFileID(Loc)).getExpansion(); |
93 | 9 | if (Expansion.isMacroArgExpansion()) { |
94 | | // Check the spelling location of the macro arg, in case the arg itself is |
95 | | // in a macro expansion. |
96 | 5 | Loc = Expansion.getSpellingLoc(); |
97 | 5 | } else { |
98 | 4 | return true; |
99 | 4 | } |
100 | 9 | } |
101 | 7 | return false; |
102 | 11 | } |
103 | | |
104 | | static CharSourceRange getRange(const CharSourceRange &EditRange, |
105 | | const SourceManager &SM, |
106 | | const LangOptions &LangOpts, |
107 | 142 | bool IncludeMacroExpansion) { |
108 | 142 | CharSourceRange Range; |
109 | 142 | if (IncludeMacroExpansion) { |
110 | 135 | Range = Lexer::makeFileCharRange(EditRange, SM, LangOpts); |
111 | 135 | } else { |
112 | 7 | if (spelledInMacroDefinition(EditRange.getBegin(), SM) || |
113 | 7 | spelledInMacroDefinition(EditRange.getEnd(), SM)4 ) |
114 | 4 | return {}; |
115 | | |
116 | 3 | auto B = SM.getSpellingLoc(EditRange.getBegin()); |
117 | 3 | auto E = SM.getSpellingLoc(EditRange.getEnd()); |
118 | 3 | if (EditRange.isTokenRange()) |
119 | 3 | E = Lexer::getLocForEndOfToken(E, 0, SM, LangOpts); |
120 | 3 | Range = CharSourceRange::getCharRange(B, E); |
121 | 3 | } |
122 | 138 | return Range; |
123 | 142 | } |
124 | | |
125 | | std::optional<CharSourceRange> clang::tooling::getFileRangeForEdit( |
126 | | const CharSourceRange &EditRange, const SourceManager &SM, |
127 | 142 | const LangOptions &LangOpts, bool IncludeMacroExpansion) { |
128 | 142 | CharSourceRange Range = |
129 | 142 | getRange(EditRange, SM, LangOpts, IncludeMacroExpansion); |
130 | 142 | bool IsInvalid = llvm::errorToBool(validateEditRange(Range, SM)); |
131 | 142 | if (IsInvalid) |
132 | 7 | return std::nullopt; |
133 | 135 | return Range; |
134 | 142 | } |
135 | | |
136 | | std::optional<CharSourceRange> clang::tooling::getFileRange( |
137 | | const CharSourceRange &EditRange, const SourceManager &SM, |
138 | 0 | const LangOptions &LangOpts, bool IncludeMacroExpansion) { |
139 | 0 | CharSourceRange Range = |
140 | 0 | getRange(EditRange, SM, LangOpts, IncludeMacroExpansion); |
141 | 0 | bool IsInvalid = |
142 | 0 | llvm::errorToBool(validateRange(Range, SM, /*AllowSystemHeaders=*/true)); |
143 | 0 | if (IsInvalid) |
144 | 0 | return std::nullopt; |
145 | 0 | return Range; |
146 | 0 | } |
147 | | |
148 | 15 | static bool startsWithNewline(const SourceManager &SM, const Token &Tok) { |
149 | 15 | return isVerticalWhitespace(SM.getCharacterData(Tok.getLocation())[0]); |
150 | 15 | } |
151 | | |
152 | | static bool contains(const std::set<tok::TokenKind> &Terminators, |
153 | 81 | const Token &Tok) { |
154 | 81 | return Terminators.count(Tok.getKind()) > 0; |
155 | 81 | } |
156 | | |
// Returns the exclusive, *file* end location of the entity whose last token is
// at location 'EntityLast'. That is, it returns the location one past the last
// relevant character.
//
// Associated tokens include comments, horizontal whitespace and 'Terminators'
// -- optional tokens, which, if any are found, will be included; if
// 'Terminators' is empty, we will not include any extra tokens beyond comments
// and horizontal whitespace.
static SourceLocation
getEntityEndLoc(const SourceManager &SM, SourceLocation EntityLast,
                const std::set<tok::TokenKind> &Terminators,
                const LangOptions &LangOpts) {
  assert(EntityLast.isValid() && "Invalid end location found.");

  // We remember the last location of a non-horizontal-whitespace token we have
  // lexed; this is the location up to which we will want to delete.
  // FIXME: Support using the spelling loc here for cases where we want to
  // analyze the macro text.

  CharSourceRange ExpansionRange = SM.getExpansionRange(EntityLast);
  // FIXME: Should check isTokenRange(), for the (rare) case that
  // `ExpansionRange` is a character range.
  //
  // Build a raw lexer positioned at the (expansion) end of the entity, with
  // visibility through to the end of that file's buffer.
  std::unique_ptr<Lexer> Lexer = [&]() {
    bool Invalid = false;
    auto FileOffset = SM.getDecomposedLoc(ExpansionRange.getEnd());
    llvm::StringRef File = SM.getBufferData(FileOffset.first, &Invalid);
    assert(!Invalid && "Cannot get file/offset");
    return std::make_unique<clang::Lexer>(
        SM.getLocForStartOfFile(FileOffset.first), LangOpts, File.begin(),
        File.data() + FileOffset.second, File.end());
  }();

  // Tell Lexer to return whitespace as pseudo-tokens (kind is tok::unknown).
  Lexer->SetKeepWhitespaceMode(true);

  // Generally, the code we want to include looks like this ([] are optional),
  // If Terminators is empty:
  //   [ <comment> ] [ <newline> ]
  // Otherwise:
  //   ... <terminator> [ <comment> ] [ <newline> ]

  Token Tok;
  bool Terminated = false;

  // First, lex to the current token (which is the last token of the range that
  // is definitely associated with the decl).  Then, we process the first token
  // separately from the rest based on conditions that hold specifically for
  // that first token.
  //
  // We do not search for a terminator if none is required or we've already
  // encountered it. Otherwise, if the original `EntityLast` location was in a
  // macro expansion, we don't have visibility into the text, so we assume we've
  // already terminated. However, we note this assumption with
  // `TerminatedByMacro`, because we'll want to handle it somewhat differently
  // for the terminators semicolon and comma. These terminators can be safely
  // associated with the entity when they appear after the macro -- extra
  // semicolons have no effect on the program and a well-formed program won't
  // have multiple commas in a row, so we're guaranteed that there is only one.
  //
  // FIXME: This handling of macros is more conservative than necessary. When
  // the end of the expansion coincides with the end of the node, we can still
  // safely analyze the code. But, it is more complicated, because we need to
  // start by lexing the spelling loc for the first token and then switch to the
  // expansion loc.
  bool TerminatedByMacro = false;
  Lexer->LexFromRawLexer(Tok);
  if (Terminators.empty() || contains(Terminators, Tok))
    Terminated = true;
  else if (EntityLast.isMacroID()) {
    Terminated = true;
    TerminatedByMacro = true;
  }

  // We save the most recent candidate for the exclusive end location.
  SourceLocation End = Tok.getEndLoc();

  // Phase 1: scan forward until we find (or give up on) a terminator token.
  while (!Terminated) {
    // Lex the next token we want to possibly expand the range with.
    Lexer->LexFromRawLexer(Tok);

    switch (Tok.getKind()) {
    case tok::eof:
    // Unexpected separators.
    case tok::l_brace:
    case tok::r_brace:
    case tok::comma:
      return End;
    // Whitespace pseudo-tokens.
    case tok::unknown:
      if (startsWithNewline(SM, Tok))
        // Include at least until the end of the line.
        End = Tok.getEndLoc();
      break;
    default:
      if (contains(Terminators, Tok))
        Terminated = true;
      End = Tok.getEndLoc();
      break;
    }
  }

  // Phase 2: after termination, absorb trailing comments, a deferred
  // macro terminator, and at most one trailing newline.
  do {
    // Lex the next token we want to possibly expand the range with.
    Lexer->LexFromRawLexer(Tok);

    switch (Tok.getKind()) {
    case tok::unknown:
      if (startsWithNewline(SM, Tok))
        // We're done, but include this newline.
        return Tok.getEndLoc();
      break;
    case tok::comment:
      // Include any comments we find on the way.
      End = Tok.getEndLoc();
      break;
    case tok::semi:
    case tok::comma:
      if (TerminatedByMacro && contains(Terminators, Tok)) {
        End = Tok.getEndLoc();
        // We've found a real terminator.
        TerminatedByMacro = false;
        break;
      }
      // Found an unrelated token; stop and don't include it.
      return End;
    default:
      // Found an unrelated token; stop and don't include it.
      return End;
    }
  } while (true);
}
288 | | |
289 | | // Returns the expected terminator tokens for the given declaration. |
290 | | // |
291 | | // If we do not know the correct terminator token, returns an empty set. |
292 | | // |
293 | | // There are cases where we have more than one possible terminator (for example, |
294 | | // we find either a comma or a semicolon after a VarDecl). |
295 | 42 | static std::set<tok::TokenKind> getTerminators(const Decl &D) { |
296 | 42 | if (llvm::isa<RecordDecl>(D) || llvm::isa<UsingDecl>(D)36 ) |
297 | 6 | return {tok::semi}; |
298 | | |
299 | 36 | if (llvm::isa<FunctionDecl>(D) || llvm::isa<LinkageSpecDecl>(D)31 ) |
300 | 5 | return {tok::r_brace, tok::semi}; |
301 | | |
302 | 31 | if (llvm::isa<VarDecl>(D) || llvm::isa<FieldDecl>(D)0 ) |
303 | 31 | return {tok::comma, tok::semi}; |
304 | | |
305 | 0 | return {}; |
306 | 31 | } |
307 | | |
308 | | // Starting from `Loc`, skips whitespace up to, and including, a single |
309 | | // newline. Returns the (exclusive) end of any skipped whitespace (that is, the |
310 | | // location immediately after the whitespace). |
311 | | static SourceLocation skipWhitespaceAndNewline(const SourceManager &SM, |
312 | | SourceLocation Loc, |
313 | 14 | const LangOptions &LangOpts) { |
314 | 14 | const char *LocChars = SM.getCharacterData(Loc); |
315 | 14 | int i = 0; |
316 | 16 | while (isHorizontalWhitespace(LocChars[i])) |
317 | 2 | ++i; |
318 | 14 | if (isVerticalWhitespace(LocChars[i])) |
319 | 14 | ++i; |
320 | 14 | return Loc.getLocWithOffset(i); |
321 | 14 | } |
322 | | |
323 | | // Is `Loc` separated from any following decl by something meaningful (e.g. an |
324 | | // empty line, a comment), ignoring horizontal whitespace? Since this is a |
325 | | // heuristic, we return false when in doubt. `Loc` cannot be the first location |
326 | | // in the file. |
327 | | static bool atOrBeforeSeparation(const SourceManager &SM, SourceLocation Loc, |
328 | 26 | const LangOptions &LangOpts) { |
329 | | // If the preceding character is a newline, we'll check for an empty line as a |
330 | | // separator. However, we can't identify an empty line using tokens, so we |
331 | | // analyse the characters. If we try to use tokens, we'll just end up with a |
332 | | // whitespace token, whose characters we'd have to analyse anyhow. |
333 | 26 | bool Invalid = false; |
334 | 26 | const char *LocChars = |
335 | 26 | SM.getCharacterData(Loc.getLocWithOffset(-1), &Invalid); |
336 | 26 | assert(!Invalid && |
337 | 26 | "Loc must be a valid character and not the first of the source file."); |
338 | 26 | if (isVerticalWhitespace(LocChars[0])) { |
339 | 43 | for (int i = 1; isWhitespace(LocChars[i]); ++i26 ) |
340 | 28 | if (isVerticalWhitespace(LocChars[i])) |
341 | 2 | return true; |
342 | 17 | } |
343 | | // We didn't find an empty line, so lex the next token, skipping past any |
344 | | // whitespace we just scanned. |
345 | 24 | Token Tok; |
346 | 24 | bool Failed = Lexer::getRawToken(Loc, Tok, SM, LangOpts, |
347 | 24 | /*IgnoreWhiteSpace=*/true); |
348 | 24 | if (Failed) |
349 | | // Any text that confuses the lexer seems fair to consider a separation. |
350 | 0 | return true; |
351 | | |
352 | 24 | switch (Tok.getKind()) { |
353 | 0 | case tok::comment: |
354 | 1 | case tok::l_brace: |
355 | 2 | case tok::r_brace: |
356 | 11 | case tok::eof: |
357 | 11 | return true; |
358 | 13 | default: |
359 | 13 | return false; |
360 | 24 | } |
361 | 24 | } |
362 | | |
// Computes the full source range "associated" with `Decl`: the decl itself
// plus any leading template<> header, trailing terminator/comments/newline,
// an unambiguously-attached preceding comment, and leading attributes.
CharSourceRange tooling::getAssociatedRange(const Decl &Decl,
                                            ASTContext &Context) {
  const SourceManager &SM = Context.getSourceManager();
  const LangOptions &LangOpts = Context.getLangOpts();
  CharSourceRange Range = CharSourceRange::getTokenRange(Decl.getSourceRange());

  // First, expand to the start of the template<> declaration if necessary.
  if (const auto *Record = llvm::dyn_cast<CXXRecordDecl>(&Decl)) {
    if (const auto *T = Record->getDescribedClassTemplate())
      if (SM.isBeforeInTranslationUnit(T->getBeginLoc(), Range.getBegin()))
        Range.setBegin(T->getBeginLoc());
  } else if (const auto *F = llvm::dyn_cast<FunctionDecl>(&Decl)) {
    if (const auto *T = F->getDescribedFunctionTemplate())
      if (SM.isBeforeInTranslationUnit(T->getBeginLoc(), Range.getBegin()))
        Range.setBegin(T->getBeginLoc());
  }

  // Next, expand the end location past trailing comments to include a
  // potential newline at the end of the decl's line.
  Range.setEnd(
      getEntityEndLoc(SM, Decl.getEndLoc(), getTerminators(Decl), LangOpts));
  // From here on, the range is a character range, not a token range.
  Range.setTokenRange(false);

  // Expand to include preceeding associated comments. We ignore any comments
  // that are not preceeding the decl, since we've already skipped trailing
  // comments with getEntityEndLoc.
  if (const RawComment *Comment =
          Decl.getASTContext().getRawCommentForDeclNoCache(&Decl))
    // Only include a preceding comment if:
    // * it is *not* separate from the declaration (not including any newline
    //   that immediately follows the comment),
    // * the decl *is* separate from any following entity (so, there are no
    //   other entities the comment could refer to), and
    // * it is not a IfThisThenThat lint check.
    if (SM.isBeforeInTranslationUnit(Comment->getBeginLoc(),
                                     Range.getBegin()) &&
        !atOrBeforeSeparation(
            SM, skipWhitespaceAndNewline(SM, Comment->getEndLoc(), LangOpts),
            LangOpts) &&
        atOrBeforeSeparation(SM, Range.getEnd(), LangOpts)) {
      const StringRef CommentText = Comment->getRawText(SM);
      // Never claim IfThisThenThat lint markers for the decl.
      if (!CommentText.contains("LINT.IfChange") &&
          !CommentText.contains("LINT.ThenChange"))
        Range.setBegin(Comment->getBeginLoc());
    }
  // Add leading attributes.
  for (auto *Attr : Decl.attrs()) {
    // Only attributes spelled before the current range start matter here.
    if (Attr->getLocation().isInvalid() ||
        !SM.isBeforeInTranslationUnit(Attr->getLocation(), Range.getBegin()))
      continue;
    Range.setBegin(Attr->getLocation());

    // Extend to the left '[[' or '__attribute((' if we saw the attribute,
    // unless it is not a valid location.
    bool Invalid;
    StringRef Source =
        SM.getBufferData(SM.getFileID(Range.getBegin()), &Invalid);
    if (Invalid)
      continue;
    llvm::StringRef BeforeAttr =
        Source.substr(0, SM.getFileOffset(Range.getBegin()));
    llvm::StringRef BeforeAttrStripped = BeforeAttr.rtrim();

    for (llvm::StringRef Prefix : {"[[", "__attribute__(("}) {
      // Handle whitespace between attribute prefix and attribute value.
      if (BeforeAttrStripped.endswith(Prefix)) {
        // Move start to start position of prefix, which is
        // length(BeforeAttr) - length(BeforeAttrStripped) + length(Prefix)
        // positions to the left.
        Range.setBegin(Range.getBegin().getLocWithOffset(static_cast<int>(
            -BeforeAttr.size() + BeforeAttrStripped.size() - Prefix.size())));
        break;
        // If we didn't see '[[' or '__attribute' it's probably coming from a
        // macro expansion which is already handled by makeFileCharRange(),
        // below.
      }
    }
  }

  // Range.getEnd() is already fully un-expanded by getEntityEndLoc. But,
  // Range.getBegin() may be inside an expansion.
  return Lexer::makeFileCharRange(Range, SM, LangOpts);
}