/Users/buildslave/jenkins/workspace/coverage/llvm-project/clang/lib/Lex/DependencyDirectivesScanner.cpp
Line | Count | Source (jump to first uncovered line) |
//===- DependencyDirectivesScanner.cpp ------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This is the interface for scanning header and source files to get the
/// minimum necessary preprocessor directives for evaluating includes. It
/// reduces the source down to #define, #include, #import, @import, and any
/// conditional preprocessor logic that contains one of those.
///
//===----------------------------------------------------------------------===//
#include "clang/Lex/DependencyDirectivesScanner.h"
#include "clang/Basic/CharInfo.h"
#include "clang/Basic/Diagnostic.h"
#include "clang/Lex/LexDiagnostic.h"
#include "clang/Lex/Lexer.h"
#include "clang/Lex/Pragma.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringSwitch.h"
#include <optional>

using namespace clang;
using namespace clang::dependency_directives_scan;
using namespace llvm;
32 | | |
33 | | namespace { |
34 | | |
35 | | struct DirectiveWithTokens { |
36 | | DirectiveKind Kind; |
37 | | unsigned NumTokens; |
38 | | |
39 | | DirectiveWithTokens(DirectiveKind Kind, unsigned NumTokens) |
40 | 57.0k | : Kind(Kind), NumTokens(NumTokens) {} |
41 | | }; |
42 | | |
43 | | /// Does an efficient "scan" of the sources to detect the presence of |
44 | | /// preprocessor (or module import) directives and collects the raw lexed tokens |
45 | | /// for those directives so that the \p Lexer can "replay" them when the file is |
46 | | /// included. |
47 | | /// |
48 | | /// Note that the behavior of the raw lexer is affected by the language mode, |
49 | | /// while at this point we want to do a scan and collect tokens once, |
50 | | /// irrespective of the language mode that the file will get included in. To |
51 | | /// compensate for that the \p Lexer, while "replaying", will adjust a token |
52 | | /// where appropriate, when it could affect the preprocessor's state. |
53 | | /// For example in a directive like |
54 | | /// |
55 | | /// \code |
56 | | /// #if __has_cpp_attribute(clang::fallthrough) |
57 | | /// \endcode |
58 | | /// |
59 | | /// The preprocessor needs to see '::' as 'tok::coloncolon' instead of 2 |
60 | | /// 'tok::colon'. The \p Lexer will adjust if it sees consecutive 'tok::colon' |
61 | | /// while in C++ mode. |
62 | | struct Scanner { |
63 | | Scanner(StringRef Input, |
64 | | SmallVectorImpl<dependency_directives_scan::Token> &Tokens, |
65 | | DiagnosticsEngine *Diags, SourceLocation InputSourceLoc) |
66 | 1.32k | : Input(Input), Tokens(Tokens), Diags(Diags), |
67 | 1.32k | InputSourceLoc(InputSourceLoc), LangOpts(getLangOptsForDepScanning()), |
68 | 1.32k | TheLexer(InputSourceLoc, LangOpts, Input.begin(), Input.begin(), |
69 | 1.32k | Input.end()) {} |
70 | | |
71 | 1.32k | static LangOptions getLangOptsForDepScanning() { |
72 | 1.32k | LangOptions LangOpts; |
73 | | // Set the lexer to use 'tok::at' for '@', instead of 'tok::unknown'. |
74 | 1.32k | LangOpts.ObjC = true; |
75 | 1.32k | LangOpts.LineComment = true; |
76 | | // FIXME: we do not enable C11 or C++11, so we are missing u/u8/U"" and |
77 | | // R"()" literals. |
78 | 1.32k | return LangOpts; |
79 | 1.32k | } |
80 | | |
81 | | /// Lex the provided source and emit the directive tokens. |
82 | | /// |
83 | | /// \returns True on error. |
84 | | bool scan(SmallVectorImpl<Directive> &Directives); |
85 | | |
86 | | private: |
87 | | /// Lexes next token and advances \p First and the \p Lexer. |
88 | | [[nodiscard]] dependency_directives_scan::Token & |
89 | | lexToken(const char *&First, const char *const End); |
90 | | |
91 | | dependency_directives_scan::Token &lexIncludeFilename(const char *&First, |
92 | | const char *const End); |
93 | | |
94 | | void skipLine(const char *&First, const char *const End); |
95 | | void skipDirective(StringRef Name, const char *&First, const char *const End); |
96 | | |
97 | | /// Returns the spelling of a string literal or identifier after performing |
98 | | /// any processing needed to handle \c clang::Token::NeedsCleaning. |
99 | | StringRef cleanStringIfNeeded(const dependency_directives_scan::Token &Tok); |
100 | | |
101 | | /// Lexes next token and if it is identifier returns its string, otherwise |
102 | | /// it skips the current line and returns \p std::nullopt. |
103 | | /// |
104 | | /// In any case (whatever the token kind) \p First and the \p Lexer will |
105 | | /// advance beyond the token. |
106 | | [[nodiscard]] std::optional<StringRef> |
107 | | tryLexIdentifierOrSkipLine(const char *&First, const char *const End); |
108 | | |
109 | | /// Used when it is certain that next token is an identifier. |
110 | | [[nodiscard]] StringRef lexIdentifier(const char *&First, |
111 | | const char *const End); |
112 | | |
113 | | /// Lexes next token and returns true iff it is an identifier that matches \p |
114 | | /// Id, otherwise it skips the current line and returns false. |
115 | | /// |
116 | | /// In any case (whatever the token kind) \p First and the \p Lexer will |
117 | | /// advance beyond the token. |
118 | | [[nodiscard]] bool isNextIdentifierOrSkipLine(StringRef Id, |
119 | | const char *&First, |
120 | | const char *const End); |
121 | | |
122 | | /// Lexes next token and returns true iff it matches the kind \p K. |
123 | | /// Otherwise it skips the current line and returns false. |
124 | | /// |
125 | | /// In any case (whatever the token kind) \p First and the \p Lexer will |
126 | | /// advance beyond the token. |
127 | | [[nodiscard]] bool isNextTokenOrSkipLine(tok::TokenKind K, const char *&First, |
128 | | const char *const End); |
129 | | |
130 | | /// Lexes next token and if it is string literal, returns its string. |
131 | | /// Otherwise, it skips the current line and returns \p std::nullopt. |
132 | | /// |
133 | | /// In any case (whatever the token kind) \p First and the \p Lexer will |
134 | | /// advance beyond the token. |
135 | | [[nodiscard]] std::optional<StringRef> |
136 | | tryLexStringLiteralOrSkipLine(const char *&First, const char *const End); |
137 | | |
138 | | [[nodiscard]] bool scanImpl(const char *First, const char *const End); |
139 | | [[nodiscard]] bool lexPPLine(const char *&First, const char *const End); |
140 | | [[nodiscard]] bool lexAt(const char *&First, const char *const End); |
141 | | [[nodiscard]] bool lexModule(const char *&First, const char *const End); |
142 | | [[nodiscard]] bool lexDefine(const char *HashLoc, const char *&First, |
143 | | const char *const End); |
144 | | [[nodiscard]] bool lexPragma(const char *&First, const char *const End); |
145 | | [[nodiscard]] bool lex_Pragma(const char *&First, const char *const End); |
146 | | [[nodiscard]] bool lexEndif(const char *&First, const char *const End); |
147 | | [[nodiscard]] bool lexDefault(DirectiveKind Kind, const char *&First, |
148 | | const char *const End); |
149 | | [[nodiscard]] bool lexModuleDirectiveBody(DirectiveKind Kind, |
150 | | const char *&First, |
151 | | const char *const End); |
152 | | void lexPPDirectiveBody(const char *&First, const char *const End); |
153 | | |
154 | 57.0k | DirectiveWithTokens &pushDirective(DirectiveKind Kind) { |
155 | 57.0k | Tokens.append(CurDirToks); |
156 | 57.0k | DirsWithToks.emplace_back(Kind, CurDirToks.size()); |
157 | 57.0k | CurDirToks.clear(); |
158 | 57.0k | return DirsWithToks.back(); |
159 | 57.0k | } |
160 | 3.60k | void popDirective() { |
161 | 3.60k | Tokens.pop_back_n(DirsWithToks.pop_back_val().NumTokens); |
162 | 3.60k | } |
163 | 24.5k | DirectiveKind topDirective() const { |
164 | 24.5k | return DirsWithToks.empty() ? pp_none28 : DirsWithToks.back().Kind24.4k ; |
165 | 24.5k | } |
166 | | |
167 | 76.7k | unsigned getOffsetAt(const char *CurPtr) const { |
168 | 76.7k | return CurPtr - Input.data(); |
169 | 76.7k | } |
170 | | |
171 | | /// Reports a diagnostic if the diagnostic engine is provided. Always returns |
172 | | /// true at the end. |
173 | | bool reportError(const char *CurPtr, unsigned Err); |
174 | | |
175 | | StringMap<char> SplitIds; |
176 | | StringRef Input; |
177 | | SmallVectorImpl<dependency_directives_scan::Token> &Tokens; |
178 | | DiagnosticsEngine *Diags; |
179 | | SourceLocation InputSourceLoc; |
180 | | |
181 | | const char *LastTokenPtr = nullptr; |
182 | | /// Keeps track of the tokens for the currently lexed directive. Once a |
183 | | /// directive is fully lexed and "committed" then the tokens get appended to |
184 | | /// \p Tokens and \p CurDirToks is cleared for the next directive. |
185 | | SmallVector<dependency_directives_scan::Token, 32> CurDirToks; |
186 | | /// The directives that were lexed along with the number of tokens that each |
187 | | /// directive contains. The tokens of all the directives are kept in \p Tokens |
188 | | /// vector, in the same order as the directives order in \p DirsWithToks. |
189 | | SmallVector<DirectiveWithTokens, 64> DirsWithToks; |
190 | | LangOptions LangOpts; |
191 | | Lexer TheLexer; |
192 | | }; |
193 | | |
194 | | } // end anonymous namespace |
195 | | |
196 | 3 | bool Scanner::reportError(const char *CurPtr, unsigned Err) { |
197 | 3 | if (!Diags) |
198 | 1 | return true; |
199 | 2 | assert(CurPtr >= Input.data() && "invalid buffer ptr"); |
200 | 2 | Diags->Report(InputSourceLoc.getLocWithOffset(getOffsetAt(CurPtr)), Err); |
201 | 2 | return true; |
202 | 2 | } |
203 | | |
204 | 159k | static void skipOverSpaces(const char *&First, const char *const End) { |
205 | 362k | while (First != End && isHorizontalWhitespace(*First)362k ) |
206 | 203k | ++First; |
207 | 159k | } |
208 | | |
209 | | [[nodiscard]] static bool isRawStringLiteral(const char *First, |
210 | 2.28k | const char *Current) { |
211 | 2.28k | assert(First <= Current); |
212 | | |
213 | | // Check if we can even back up. |
214 | 2.28k | if (*Current != '"' || First == Current2.27k ) |
215 | 15 | return false; |
216 | | |
217 | | // Check for an "R". |
218 | 2.26k | --Current; |
219 | 2.26k | if (*Current != 'R') |
220 | 2.26k | return false; |
221 | 3 | if (First == Current || !isAsciiIdentifierContinue(*--Current)2 ) |
222 | 3 | return true; |
223 | | |
224 | | // Check for a prefix of "u", "U", or "L". |
225 | 0 | if (*Current == 'u' || *Current == 'U' || *Current == 'L') |
226 | 0 | return First == Current || !isAsciiIdentifierContinue(*--Current); |
227 | | |
228 | | // Check for a prefix of "u8". |
229 | 0 | if (*Current != '8' || First == Current || *Current-- != 'u') |
230 | 0 | return false; |
231 | 0 | return First == Current || !isAsciiIdentifierContinue(*--Current); |
232 | 0 | } |
233 | | |
234 | 3 | static void skipRawString(const char *&First, const char *const End) { |
235 | 3 | assert(First[0] == '"'); |
236 | 3 | assert(First[-1] == 'R'); |
237 | | |
238 | 3 | const char *Last = ++First; |
239 | 6 | while (Last != End && *Last != '(') |
240 | 3 | ++Last; |
241 | 3 | if (Last == End) { |
242 | 0 | First = Last; // Hit the end... just give up. |
243 | 0 | return; |
244 | 0 | } |
245 | | |
246 | 3 | StringRef Terminator(First, Last - First); |
247 | 3 | for (;;) { |
248 | | // Move First to just past the next ")". |
249 | 3 | First = Last; |
250 | 46 | while (First != End && *First != ')') |
251 | 43 | ++First; |
252 | 3 | if (First == End) |
253 | 0 | return; |
254 | 3 | ++First; |
255 | | |
256 | | // Look ahead for the terminator sequence. |
257 | 3 | Last = First; |
258 | 6 | while (Last != End && size_t(Last - First) < Terminator.size() && |
259 | 6 | Terminator[Last - First] == *Last3 ) |
260 | 3 | ++Last; |
261 | | |
262 | | // Check if we hit it (or the end of the file). |
263 | 3 | if (Last == End) { |
264 | 0 | First = Last; |
265 | 0 | return; |
266 | 0 | } |
267 | 3 | if (size_t(Last - First) < Terminator.size()) |
268 | 0 | continue; |
269 | 3 | if (*Last != '"') |
270 | 0 | continue; |
271 | 3 | First = Last + 1; |
272 | 3 | return; |
273 | 3 | } |
274 | 3 | } |
275 | | |
276 | | // Returns the length of EOL, either 0 (no end-of-line), 1 (\n) or 2 (\r\n) |
277 | 254k | static unsigned isEOL(const char *First, const char *const End) { |
278 | 254k | if (First == End) |
279 | 0 | return 0; |
280 | 254k | if (End - First > 1 && isVerticalWhitespace(First[0])254k && |
281 | 254k | isVerticalWhitespace(First[1])95.6k && First[0] != First[1]13.8k ) |
282 | 10 | return 2; |
283 | 254k | return !!isVerticalWhitespace(First[0]); |
284 | 254k | } |
285 | | |
286 | 2.27k | static void skipString(const char *&First, const char *const End) { |
287 | 2.27k | assert(*First == '\'' || *First == '"' || *First == '<'); |
288 | 2.27k | const char Terminator = *First == '<' ? '>'0 : *First; |
289 | 49.2k | for (++First; First != End && *First != Terminator; ++First46.9k ) { |
290 | | // String and character literals don't extend past the end of the line. |
291 | 46.9k | if (isVerticalWhitespace(*First)) |
292 | 0 | return; |
293 | 46.9k | if (*First != '\\') |
294 | 46.9k | continue; |
295 | | // Skip past backslash to the next character. This ensures that the |
296 | | // character right after it is skipped as well, which matters if it's |
297 | | // the terminator. |
298 | 2 | if (++First == End) |
299 | 0 | return; |
300 | 2 | if (!isWhitespace(*First)) |
301 | 2 | continue; |
302 | | // Whitespace after the backslash might indicate a line continuation. |
303 | 0 | const char *FirstAfterBackslashPastSpace = First; |
304 | 0 | skipOverSpaces(FirstAfterBackslashPastSpace, End); |
305 | 0 | if (unsigned NLSize = isEOL(FirstAfterBackslashPastSpace, End)) { |
306 | | // Advance the character pointer to the next line for the next |
307 | | // iteration. |
308 | 0 | First = FirstAfterBackslashPastSpace + NLSize - 1; |
309 | 0 | } |
310 | 0 | } |
311 | 2.27k | if (First != End) |
312 | 2.27k | ++First; // Finish off the string. |
313 | 2.27k | } |
314 | | |
315 | | // Returns the length of the skipped newline |
316 | 92.0k | static unsigned skipNewline(const char *&First, const char *End) { |
317 | 92.0k | if (First == End) |
318 | 0 | return 0; |
319 | 92.0k | assert(isVerticalWhitespace(*First)); |
320 | 92.0k | unsigned Len = isEOL(First, End); |
321 | 92.0k | assert(Len && "expected newline"); |
322 | 92.0k | First += Len; |
323 | 92.0k | return Len; |
324 | 92.0k | } |
325 | | |
/// \returns true if the character immediately before the just-skipped newline
/// (of length \p EOLLen, with \p First already past it) was a backslash,
/// i.e. the line ended in a line-continuation.
static bool wasLineContinuation(const char *First, unsigned EOLLen) {
  return *(First - (int)EOLLen - 1) == '\\';
}
329 | | |
330 | 3.98k | static void skipToNewlineRaw(const char *&First, const char *const End) { |
331 | 4.24k | for (;;) { |
332 | 4.24k | if (First == End) |
333 | 0 | return; |
334 | | |
335 | 4.24k | unsigned Len = isEOL(First, End); |
336 | 4.24k | if (Len) |
337 | 110 | return; |
338 | | |
339 | 161k | do 4.13k { |
340 | 161k | if (++First == End) |
341 | 0 | return; |
342 | 161k | Len = isEOL(First, End); |
343 | 161k | } while (!Len); |
344 | | |
345 | 4.13k | if (First[-1] != '\\') |
346 | 3.82k | return; |
347 | | |
348 | 309 | First += Len; |
349 | | // Keep skipping lines... |
350 | 309 | } |
351 | 3.98k | } |
352 | | |
353 | 3.84k | static void skipLineComment(const char *&First, const char *const End) { |
354 | 3.84k | assert(First[0] == '/' && First[1] == '/'); |
355 | 3.84k | First += 2; |
356 | 3.84k | skipToNewlineRaw(First, End); |
357 | 3.84k | } |
358 | | |
/// Skip a "/*...*/" block comment; \p First must point at the leading "/*".
/// On exit \p First is just past the closing "*/", or at \p End if the
/// comment is unterminated (or too short to contain a terminator).
static void skipBlockComment(const char *&First, const char *const End) {
  assert(First[0] == '/' && First[1] == '*');
  if (End - First < 4) {
    First = End;
    return;
  }
  for (First += 3; First != End; ++First)
    if (First[-1] == '*' && First[0] == '/') {
      ++First;
      return;
    }
}
371 | | |
372 | | /// \returns True if the current single quotation mark character is a C++ 14 |
373 | | /// digit separator. |
374 | | static bool isQuoteCppDigitSeparator(const char *const Start, |
375 | | const char *const Cur, |
376 | 11 | const char *const End) { |
377 | 11 | assert(*Cur == '\'' && "expected quotation character"); |
378 | | // skipLine called in places where we don't expect a valid number |
379 | | // body before `start` on the same line, so always return false at the start. |
380 | 11 | if (Start == Cur) |
381 | 0 | return false; |
382 | | // The previous character must be a valid PP number character. |
383 | | // Make sure that the L, u, U, u8 prefixes don't get marked as a |
384 | | // separator though. |
385 | 11 | char Prev = *(Cur - 1); |
386 | 11 | if (Prev == 'L' || Prev == 'U'10 || Prev == 'u'9 ) |
387 | 3 | return false; |
388 | 8 | if (Prev == '8' && (Cur - 1 != Start)2 && *(Cur - 2) == 'u'2 ) |
389 | 1 | return false; |
390 | 7 | if (!isPreprocessingNumberBody(Prev)) |
391 | 3 | return false; |
392 | | // The next character should be a valid identifier body character. |
393 | 4 | return (Cur + 1) < End && isAsciiIdentifierContinue(*(Cur + 1)); |
394 | 7 | } |
395 | | |
396 | 92.0k | void Scanner::skipLine(const char *&First, const char *const End) { |
397 | 92.0k | for (;;) { |
398 | 92.0k | assert(First <= End); |
399 | 92.0k | if (First == End) |
400 | 9 | return; |
401 | | |
402 | 92.0k | if (isVerticalWhitespace(*First)) { |
403 | 43.1k | skipNewline(First, End); |
404 | 43.1k | return; |
405 | 43.1k | } |
406 | 48.8k | const char *Start = First; |
407 | 1.54M | while (First != End && !isVerticalWhitespace(*First)1.54M ) { |
408 | | // Iterate over strings correctly to avoid comments and newlines. |
409 | 1.49M | if (*First == '"' || |
410 | 1.49M | (1.48M *First == '\''1.48M && !isQuoteCppDigitSeparator(Start, First, End)11 )) { |
411 | 2.28k | LastTokenPtr = First; |
412 | 2.28k | if (isRawStringLiteral(Start, First)) |
413 | 3 | skipRawString(First, End); |
414 | 2.27k | else |
415 | 2.27k | skipString(First, End); |
416 | 2.28k | continue; |
417 | 2.28k | } |
418 | | |
419 | | // Iterate over comments correctly. |
420 | 1.48M | if (*First != '/' || End - First < 26.16k ) { |
421 | 1.48M | LastTokenPtr = First; |
422 | 1.48M | ++First; |
423 | 1.48M | continue; |
424 | 1.48M | } |
425 | | |
426 | 6.16k | if (First[1] == '/') { |
427 | | // "//...". |
428 | 299 | skipLineComment(First, End); |
429 | 299 | continue; |
430 | 299 | } |
431 | | |
432 | 5.87k | if (First[1] != '*') { |
433 | 6 | LastTokenPtr = First; |
434 | 6 | ++First; |
435 | 6 | continue; |
436 | 6 | } |
437 | | |
438 | | // "/*...*/". |
439 | 5.86k | skipBlockComment(First, End); |
440 | 5.86k | } |
441 | 48.8k | if (First == End) |
442 | 6 | return; |
443 | | |
444 | | // Skip over the newline. |
445 | 48.8k | unsigned Len = skipNewline(First, End); |
446 | 48.8k | if (!wasLineContinuation(First, Len)) // Continue past line-continuations. |
447 | 48.8k | break; |
448 | 48.8k | } |
449 | 92.0k | } |
450 | | |
451 | | void Scanner::skipDirective(StringRef Name, const char *&First, |
452 | 143 | const char *const End) { |
453 | 143 | if (llvm::StringSwitch<bool>(Name) |
454 | 143 | .Case("warning", true) |
455 | 143 | .Case("error", true) |
456 | 143 | .Default(false)) |
457 | | // Do not process quotes or comments. |
458 | 141 | skipToNewlineRaw(First, End); |
459 | 2 | else |
460 | 2 | skipLine(First, End); |
461 | 143 | } |
462 | | |
463 | 147k | static void skipWhitespace(const char *&First, const char *const End) { |
464 | 159k | for (;;) { |
465 | 159k | assert(First <= End); |
466 | 159k | skipOverSpaces(First, End); |
467 | | |
468 | 159k | if (End - First < 2) |
469 | 200 | return; |
470 | | |
471 | 159k | if (First[0] == '\\' && isVerticalWhitespace(First[1])1 ) { |
472 | 1 | skipNewline(++First, End); |
473 | 1 | continue; |
474 | 1 | } |
475 | | |
476 | | // Check for a non-comment character. |
477 | 159k | if (First[0] != '/') |
478 | 144k | return; |
479 | | |
480 | | // "// ...". |
481 | 14.8k | if (First[1] == '/') { |
482 | 3.54k | skipLineComment(First, End); |
483 | 3.54k | return; |
484 | 3.54k | } |
485 | | |
486 | | // Cannot be a comment. |
487 | 11.2k | if (First[1] != '*') |
488 | 0 | return; |
489 | | |
490 | | // "/*...*/". |
491 | 11.2k | skipBlockComment(First, End); |
492 | 11.2k | } |
493 | 147k | } |
494 | | |
495 | | bool Scanner::lexModuleDirectiveBody(DirectiveKind Kind, const char *&First, |
496 | 49 | const char *const End) { |
497 | 49 | const char *DirectiveLoc = Input.data() + CurDirToks.front().Offset; |
498 | 137 | for (;;) { |
499 | 137 | const dependency_directives_scan::Token &Tok = lexToken(First, End); |
500 | 137 | if (Tok.is(tok::eof)) |
501 | 2 | return reportError( |
502 | 2 | DirectiveLoc, |
503 | 2 | diag::err_dep_source_scanner_missing_semi_after_at_import); |
504 | 135 | if (Tok.is(tok::semi)) |
505 | 46 | break; |
506 | 135 | } |
507 | 47 | pushDirective(Kind); |
508 | 47 | skipWhitespace(First, End); |
509 | 47 | if (First == End) |
510 | 2 | return false; |
511 | 45 | if (!isVerticalWhitespace(*First)) |
512 | 1 | return reportError( |
513 | 1 | DirectiveLoc, diag::err_dep_source_scanner_unexpected_tokens_at_import); |
514 | 44 | skipNewline(First, End); |
515 | 44 | return false; |
516 | 45 | } |
517 | | |
518 | | dependency_directives_scan::Token &Scanner::lexToken(const char *&First, |
519 | 472k | const char *const End) { |
520 | 472k | clang::Token Tok; |
521 | 472k | TheLexer.LexFromRawLexer(Tok); |
522 | 472k | First = Input.data() + TheLexer.getCurrentBufferOffset(); |
523 | 472k | assert(First <= End); |
524 | | |
525 | 472k | unsigned Offset = TheLexer.getCurrentBufferOffset() - Tok.getLength(); |
526 | 472k | CurDirToks.emplace_back(Offset, Tok.getLength(), Tok.getKind(), |
527 | 472k | Tok.getFlags()); |
528 | 472k | return CurDirToks.back(); |
529 | 472k | } |
530 | | |
531 | | dependency_directives_scan::Token & |
532 | 3.52k | Scanner::lexIncludeFilename(const char *&First, const char *const End) { |
533 | 3.52k | clang::Token Tok; |
534 | 3.52k | TheLexer.LexIncludeFilename(Tok); |
535 | 3.52k | First = Input.data() + TheLexer.getCurrentBufferOffset(); |
536 | 3.52k | assert(First <= End); |
537 | | |
538 | 3.52k | unsigned Offset = TheLexer.getCurrentBufferOffset() - Tok.getLength(); |
539 | 3.52k | CurDirToks.emplace_back(Offset, Tok.getLength(), Tok.getKind(), |
540 | 3.52k | Tok.getFlags()); |
541 | 3.52k | return CurDirToks.back(); |
542 | 3.52k | } |
543 | | |
544 | 55.6k | void Scanner::lexPPDirectiveBody(const char *&First, const char *const End) { |
545 | 332k | while (true332k ) { |
546 | 332k | const dependency_directives_scan::Token &Tok = lexToken(First, End); |
547 | 332k | if (Tok.is(tok::eod)) |
548 | 55.6k | break; |
549 | 332k | } |
550 | 55.6k | } |
551 | | |
552 | | StringRef |
553 | 79.0k | Scanner::cleanStringIfNeeded(const dependency_directives_scan::Token &Tok) { |
554 | 79.0k | bool NeedsCleaning = Tok.Flags & clang::Token::NeedsCleaning; |
555 | 79.0k | if (LLVM_LIKELY(!NeedsCleaning)) |
556 | 79.0k | return Input.slice(Tok.Offset, Tok.getEnd()); |
557 | | |
558 | 3 | SmallString<64> Spelling; |
559 | 3 | Spelling.resize(Tok.Length); |
560 | | |
561 | | // FIXME: C++11 raw string literals need special handling (see getSpellingSlow |
562 | | // in the Lexer). Currently we cannot see them due to our LangOpts. |
563 | | |
564 | 3 | unsigned SpellingLength = 0; |
565 | 3 | const char *BufPtr = Input.begin() + Tok.Offset; |
566 | 3 | const char *AfterIdent = Input.begin() + Tok.getEnd(); |
567 | 40 | while (BufPtr < AfterIdent) { |
568 | 37 | unsigned Size; |
569 | 37 | Spelling[SpellingLength++] = |
570 | 37 | Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts); |
571 | 37 | BufPtr += Size; |
572 | 37 | } |
573 | | |
574 | 3 | return SplitIds.try_emplace(StringRef(Spelling.begin(), SpellingLength), 0) |
575 | 3 | .first->first(); |
576 | 79.0k | } |
577 | | |
578 | | std::optional<StringRef> |
579 | 79.0k | Scanner::tryLexIdentifierOrSkipLine(const char *&First, const char *const End) { |
580 | 79.0k | const dependency_directives_scan::Token &Tok = lexToken(First, End); |
581 | 79.0k | if (Tok.isNot(tok::raw_identifier)) { |
582 | 5 | if (!Tok.is(tok::eod)) |
583 | 0 | skipLine(First, End); |
584 | 5 | return std::nullopt; |
585 | 5 | } |
586 | | |
587 | 79.0k | return cleanStringIfNeeded(Tok); |
588 | 79.0k | } |
589 | | |
590 | 11.3k | StringRef Scanner::lexIdentifier(const char *&First, const char *const End) { |
591 | 11.3k | std::optional<StringRef> Id = tryLexIdentifierOrSkipLine(First, End); |
592 | 11.3k | assert(Id && "expected identifier token"); |
593 | 11.3k | return *Id; |
594 | 11.3k | } |
595 | | |
596 | | bool Scanner::isNextIdentifierOrSkipLine(StringRef Id, const char *&First, |
597 | 4.44k | const char *const End) { |
598 | 4.44k | if (std::optional<StringRef> FoundId = |
599 | 4.44k | tryLexIdentifierOrSkipLine(First, End)) { |
600 | 4.44k | if (*FoundId == Id) |
601 | 78 | return true; |
602 | 4.36k | skipLine(First, End); |
603 | 4.36k | } |
604 | 4.37k | return false; |
605 | 4.44k | } |
606 | | |
607 | | bool Scanner::isNextTokenOrSkipLine(tok::TokenKind K, const char *&First, |
608 | 82 | const char *const End) { |
609 | 82 | const dependency_directives_scan::Token &Tok = lexToken(First, End); |
610 | 82 | if (Tok.is(K)) |
611 | 81 | return true; |
612 | 1 | skipLine(First, End); |
613 | 1 | return false; |
614 | 82 | } |
615 | | |
616 | | std::optional<StringRef> |
617 | | Scanner::tryLexStringLiteralOrSkipLine(const char *&First, |
618 | 43 | const char *const End) { |
619 | 43 | const dependency_directives_scan::Token &Tok = lexToken(First, End); |
620 | 43 | if (!tok::isStringLiteral(Tok.Kind)) { |
621 | 5 | if (!Tok.is(tok::eod)) |
622 | 5 | skipLine(First, End); |
623 | 5 | return std::nullopt; |
624 | 5 | } |
625 | | |
626 | 38 | return cleanStringIfNeeded(Tok); |
627 | 43 | } |
628 | | |
629 | 28 | bool Scanner::lexAt(const char *&First, const char *const End) { |
630 | | // Handle "@import". |
631 | | |
632 | | // Lex '@'. |
633 | 28 | const dependency_directives_scan::Token &AtTok = lexToken(First, End); |
634 | 28 | assert(AtTok.is(tok::at)); |
635 | 28 | (void)AtTok; |
636 | | |
637 | 28 | if (!isNextIdentifierOrSkipLine("import", First, End)) |
638 | 0 | return false; |
639 | 28 | return lexModuleDirectiveBody(decl_at_import, First, End); |
640 | 28 | } |
641 | | |
642 | 11.3k | bool Scanner::lexModule(const char *&First, const char *const End) { |
643 | 11.3k | StringRef Id = lexIdentifier(First, End); |
644 | 11.3k | bool Export = false; |
645 | 11.3k | if (Id == "export") { |
646 | 4 | Export = true; |
647 | 4 | std::optional<StringRef> NextId = tryLexIdentifierOrSkipLine(First, End); |
648 | 4 | if (!NextId) |
649 | 0 | return false; |
650 | 4 | Id = *NextId; |
651 | 4 | } |
652 | | |
653 | 11.3k | if (Id != "module" && Id != "import"11.3k ) { |
654 | 11.2k | skipLine(First, End); |
655 | 11.2k | return false; |
656 | 11.2k | } |
657 | | |
658 | 30 | skipWhitespace(First, End); |
659 | | |
660 | | // Ignore this as a module directive if the next character can't be part of |
661 | | // an import. |
662 | | |
663 | 30 | switch (*First) { |
664 | 1 | case ':': |
665 | 2 | case '<': |
666 | 2 | case '"': |
667 | 2 | break; |
668 | 28 | default: |
669 | 28 | if (!isAsciiIdentifierContinue(*First)) { |
670 | 9 | skipLine(First, End); |
671 | 9 | return false; |
672 | 9 | } |
673 | 30 | } |
674 | | |
675 | 21 | TheLexer.seek(getOffsetAt(First), /*IsAtStartOfLine*/ false); |
676 | | |
677 | 21 | DirectiveKind Kind; |
678 | 21 | if (Id == "module") |
679 | 5 | Kind = Export ? cxx_export_module_decl2 : cxx_module_decl3 ; |
680 | 16 | else |
681 | 16 | Kind = Export ? cxx_export_import_decl1 : cxx_import_decl15 ; |
682 | | |
683 | 21 | return lexModuleDirectiveBody(Kind, First, End); |
684 | 30 | } |
685 | | |
686 | 44 | bool Scanner::lex_Pragma(const char *&First, const char *const End) { |
687 | 44 | if (!isNextTokenOrSkipLine(tok::l_paren, First, End)) |
688 | 1 | return false; |
689 | | |
690 | 43 | std::optional<StringRef> Str = tryLexStringLiteralOrSkipLine(First, End); |
691 | | |
692 | 43 | if (!Str || !isNextTokenOrSkipLine(tok::r_paren, First, End)38 ) |
693 | 5 | return false; |
694 | | |
695 | 38 | SmallString<64> Buffer(*Str); |
696 | 38 | prepare_PragmaString(Buffer); |
697 | | |
698 | | // Use a new scanner instance since the tokens will be inside the allocated |
699 | | // string. We should already have captured all the relevant tokens in the |
700 | | // current scanner. |
701 | 38 | SmallVector<dependency_directives_scan::Token> DiscardTokens; |
702 | 38 | const char *Begin = Buffer.c_str(); |
703 | 38 | Scanner PragmaScanner{StringRef(Begin, Buffer.size()), DiscardTokens, Diags, |
704 | 38 | InputSourceLoc}; |
705 | | |
706 | 38 | PragmaScanner.TheLexer.setParsingPreprocessorDirective(true); |
707 | 38 | if (PragmaScanner.lexPragma(Begin, Buffer.end())) |
708 | 0 | return true; |
709 | | |
710 | 38 | DirectiveKind K = PragmaScanner.topDirective(); |
711 | 38 | if (K == pp_none) { |
712 | 28 | skipLine(First, End); |
713 | 28 | return false; |
714 | 28 | } |
715 | | |
716 | 10 | assert(Begin == Buffer.end()); |
717 | 10 | pushDirective(K); |
718 | 10 | return false; |
719 | 10 | } |
720 | | |
721 | 2.30k | bool Scanner::lexPragma(const char *&First, const char *const End) { |
722 | 2.30k | std::optional<StringRef> FoundId = tryLexIdentifierOrSkipLine(First, End); |
723 | 2.30k | if (!FoundId) |
724 | 0 | return false; |
725 | | |
726 | 2.30k | StringRef Id = *FoundId; |
727 | 2.30k | auto Kind = llvm::StringSwitch<DirectiveKind>(Id) |
728 | 2.30k | .Case("once", pp_pragma_once) |
729 | 2.30k | .Case("push_macro", pp_pragma_push_macro) |
730 | 2.30k | .Case("pop_macro", pp_pragma_pop_macro) |
731 | 2.30k | .Case("include_alias", pp_pragma_include_alias) |
732 | 2.30k | .Default(pp_none); |
733 | 2.30k | if (Kind != pp_none) { |
734 | 37 | lexPPDirectiveBody(First, End); |
735 | 37 | pushDirective(Kind); |
736 | 37 | return false; |
737 | 37 | } |
738 | | |
739 | 2.26k | if (Id != "clang") { |
740 | 2.22k | skipLine(First, End); |
741 | 2.22k | return false; |
742 | 2.22k | } |
743 | | |
744 | 40 | FoundId = tryLexIdentifierOrSkipLine(First, End); |
745 | 40 | if (!FoundId) |
746 | 2 | return false; |
747 | 38 | Id = *FoundId; |
748 | | |
749 | | // #pragma clang system_header |
750 | 38 | if (Id == "system_header") { |
751 | 1 | lexPPDirectiveBody(First, End); |
752 | 1 | pushDirective(pp_pragma_system_header); |
753 | 1 | return false; |
754 | 1 | } |
755 | | |
756 | 37 | if (Id != "module") { |
757 | 27 | skipLine(First, End); |
758 | 27 | return false; |
759 | 27 | } |
760 | | |
761 | | // #pragma clang module. |
762 | 10 | if (!isNextIdentifierOrSkipLine("import", First, End)) |
763 | 4 | return false; |
764 | | |
765 | | // #pragma clang module import. |
766 | 6 | lexPPDirectiveBody(First, End); |
767 | 6 | pushDirective(pp_pragma_import); |
768 | 6 | return false; |
769 | 10 | } |
770 | | |
771 | 9.09k | bool Scanner::lexEndif(const char *&First, const char *const End) { |
772 | | // Strip out "#else" if it's empty. |
773 | 9.09k | if (topDirective() == pp_else) |
774 | 727 | popDirective(); |
775 | | |
776 | | // If "#ifdef" is empty, strip it and skip the "#endif". |
777 | | // |
778 | | // FIXME: Once/if Clang starts disallowing __has_include in macro expansions, |
779 | | // we can skip empty `#if` and `#elif` blocks as well after scanning for a |
780 | | // literal __has_include in the condition. Even without that rule we could |
781 | | // drop the tokens if we scan for identifiers in the condition and find none. |
782 | 9.09k | if (topDirective() == pp_ifdef || topDirective() == pp_ifndef6.27k ) { |
783 | 2.87k | popDirective(); |
784 | 2.87k | skipLine(First, End); |
785 | 2.87k | return false; |
786 | 2.87k | } |
787 | | |
788 | 6.22k | return lexDefault(pp_endif, First, End); |
789 | 9.09k | } |
790 | | |
/// Generic handler for a directive of kind \p Kind: lex the remainder of the
/// directive line into the current token buffer, then record the directive.
///
/// \returns false, meaning "no scanning error; continue with the next line".
bool Scanner::lexDefault(DirectiveKind Kind, const char *&First,
                         const char *const End) {
  // Order matters: the body tokens must be lexed before pushDirective
  // associates the accumulated tokens with this directive.
  lexPPDirectiveBody(First, End);
  pushDirective(Kind);
  return false;
}
797 | | |
/// Cheap first-character filter: true when a line beginning with \p First
/// could possibly carry something the scanner cares about — '#' directives,
/// "@import", identifiers starting with 'i'/'e'/'m' (handled by lexModule),
/// or "_Pragma". False lets the caller skip the whole line unlexed.
static bool isStartOfRelevantLine(char First) {
  return First == '#' || First == '@' || First == 'i' || First == 'e' ||
         First == 'm' || First == '_';
}
810 | | |
/// Lex one line of input and dispatch to the specialized lexers for the
/// constructs the dependency scanner tracks: "@import", module-related lines,
/// "_Pragma", and '#' preprocessor directives. Irrelevant lines are skipped
/// wholesale without running the real lexer.
///
/// \param First Advanced past everything consumed for this line.
/// \param End   One past the last character of the input buffer.
/// \returns true on a scanning error, false to continue with the next line.
bool Scanner::lexPPLine(const char *&First, const char *const End) {
  assert(First != End);

  skipWhitespace(First, End);
  assert(First <= End);
  if (First == End)
    return false;

  // One-character filter: lines that cannot start anything relevant are
  // skipped without seeking the lexer.
  if (!isStartOfRelevantLine(*First)) {
    skipLine(First, End);
    assert(First <= End);
    return false;
  }

  // Remember where the last potentially-relevant token began; scan() uses
  // this to decide whether tokens appear after the final directive.
  LastTokenPtr = First;

  TheLexer.seek(getOffsetAt(First), /*IsAtStartOfLine*/ true);

  auto ScEx1 = make_scope_exit([&]() {
    /// Clear Scanner's CurDirToks before returning, in case we didn't push a
    /// new directive.
    CurDirToks.clear();
  });

  // Handle "@import".
  if (*First == '@')
    return lexAt(First, End);

  // Presumably C++20 import/export/module lines; lexModule decides whether
  // the line is actually relevant.
  if (*First == 'i' || *First == 'e' || *First == 'm')
    return lexModule(First, End);

  // Handle "_Pragma"; any other line starting with '_' is skipped by
  // isNextIdentifierOrSkipLine.
  if (*First == '_') {
    if (isNextIdentifierOrSkipLine("_Pragma", First, End))
      return lex_Pragma(First, End);
    return false;
  }

  // Handle preprocessing directives.

  TheLexer.setParsingPreprocessorDirective(true);
  auto ScEx2 = make_scope_exit(
      [&]() { TheLexer.setParsingPreprocessorDirective(false); });

  // Lex '#'.
  const dependency_directives_scan::Token &HashTok = lexToken(First, End);
  if (HashTok.is(tok::hashhash)) {
    // A \p tok::hashhash at this location is passed by the preprocessor to the
    // parser to interpret, like any other token. So for dependency scanning
    // skip it like a normal token not affecting the preprocessor.
    skipLine(First, End);
    assert(First <= End);
    return false;
  }
  assert(HashTok.is(tok::hash));
  (void)HashTok;

  std::optional<StringRef> FoundId = tryLexIdentifierOrSkipLine(First, End);
  if (!FoundId)
    return false;

  StringRef Id = *FoundId;

  // "#pragma" has a dedicated lexer for the pragma forms we track.
  if (Id == "pragma")
    return lexPragma(First, End);

  // Map the directive name to its kind; unrecognized directives are skipped.
  auto Kind = llvm::StringSwitch<DirectiveKind>(Id)
                  .Case("include", pp_include)
                  .Case("__include_macros", pp___include_macros)
                  .Case("define", pp_define)
                  .Case("undef", pp_undef)
                  .Case("import", pp_import)
                  .Case("include_next", pp_include_next)
                  .Case("if", pp_if)
                  .Case("ifdef", pp_ifdef)
                  .Case("ifndef", pp_ifndef)
                  .Case("elif", pp_elif)
                  .Case("elifdef", pp_elifdef)
                  .Case("elifndef", pp_elifndef)
                  .Case("else", pp_else)
                  .Case("endif", pp_endif)
                  .Default(pp_none);
  if (Kind == pp_none) {
    skipDirective(Id, First, End);
    return false;
  }

  // "#endif" may fold away empty conditional blocks; see lexEndif.
  if (Kind == pp_endif)
    return lexEndif(First, End);

  // Include-like directives lex their filename argument specially before the
  // generic directive-body handling below.
  switch (Kind) {
  case pp_include:
  case pp___include_macros:
  case pp_include_next:
  case pp_import:
    lexIncludeFilename(First, End);
    break;
  default:
    break;
  }

  // Everything else.
  return lexDefault(Kind, First, End);
}
914 | | |
/// Advance \p First past a leading UTF-8 byte-order mark (EF BB BF) if the
/// buffer starts with one; otherwise leave \p First untouched.
static void skipUTF8ByteOrderMark(const char *&First, const char *const End) {
  static constexpr char UTF8BOM[] = {'\xef', '\xbb', '\xbf'};
  if (End - First < 3)
    return;
  if (First[0] == UTF8BOM[0] && First[1] == UTF8BOM[1] &&
      First[2] == UTF8BOM[2])
    First += 3;
}
920 | | |
921 | 1.29k | bool Scanner::scanImpl(const char *First, const char *const End) { |
922 | 1.29k | skipUTF8ByteOrderMark(First, End); |
923 | 149k | while (First != End) |
924 | 147k | if (lexPPLine(First, End)) |
925 | 3 | return true; |
926 | 1.28k | return false; |
927 | 1.29k | } |
928 | | |
929 | 1.29k | bool Scanner::scan(SmallVectorImpl<Directive> &Directives) { |
930 | 1.29k | bool Error = scanImpl(Input.begin(), Input.end()); |
931 | | |
932 | 1.29k | if (!Error) { |
933 | | // Add an EOF on success. |
934 | 1.28k | if (LastTokenPtr && |
935 | 1.28k | (1.12k Tokens.empty()1.12k || LastTokenPtr > Input.begin() + Tokens.back().Offset1.09k )) |
936 | 75 | pushDirective(tokens_present_before_eof); |
937 | 1.28k | pushDirective(pp_eof); |
938 | 1.28k | } |
939 | | |
940 | 1.29k | ArrayRef<dependency_directives_scan::Token> RemainingTokens = Tokens; |
941 | 53.4k | for (const DirectiveWithTokens &DirWithToks : DirsWithToks) { |
942 | 53.4k | assert(RemainingTokens.size() >= DirWithToks.NumTokens); |
943 | 53.4k | Directives.emplace_back(DirWithToks.Kind, |
944 | 53.4k | RemainingTokens.take_front(DirWithToks.NumTokens)); |
945 | 53.4k | RemainingTokens = RemainingTokens.drop_front(DirWithToks.NumTokens); |
946 | 53.4k | } |
947 | 1.29k | assert(RemainingTokens.empty()); |
948 | | |
949 | 1.29k | return Error; |
950 | 1.29k | } |
951 | | |
952 | | bool clang::scanSourceForDependencyDirectives( |
953 | | StringRef Input, SmallVectorImpl<dependency_directives_scan::Token> &Tokens, |
954 | | SmallVectorImpl<Directive> &Directives, DiagnosticsEngine *Diags, |
955 | 1.29k | SourceLocation InputSourceLoc) { |
956 | 1.29k | return Scanner(Input, Tokens, Diags, InputSourceLoc).scan(Directives); |
957 | 1.29k | } |
958 | | |
959 | | void clang::printDependencyDirectivesAsSource( |
960 | | StringRef Source, |
961 | | ArrayRef<dependency_directives_scan::Directive> Directives, |
962 | 132 | llvm::raw_ostream &OS) { |
963 | | // Add a space separator where it is convenient for testing purposes. |
964 | 132 | auto needsSpaceSeparator = |
965 | 132 | [](tok::TokenKind Prev, |
966 | 867 | const dependency_directives_scan::Token &Tok) -> bool { |
967 | 867 | if (Prev == Tok.Kind) |
968 | 187 | return !Tok.isOneOf(tok::l_paren, tok::r_paren, tok::l_square, |
969 | 187 | tok::r_square); |
970 | 680 | if (Prev == tok::raw_identifier && |
971 | 680 | Tok.isOneOf(tok::hash, tok::numeric_constant, tok::string_literal, |
972 | 273 | tok::char_constant, tok::header_name)) |
973 | 47 | return true; |
974 | 633 | if (Prev == tok::r_paren && |
975 | 633 | Tok.isOneOf(tok::raw_identifier, tok::hash, tok::string_literal, |
976 | 39 | tok::char_constant, tok::unknown)) |
977 | 12 | return true; |
978 | 621 | if (Prev == tok::comma && |
979 | 621 | Tok.isOneOf(tok::l_paren, tok::string_literal, tok::less)16 ) |
980 | 8 | return true; |
981 | 613 | return false; |
982 | 621 | }; |
983 | | |
984 | 376 | for (const dependency_directives_scan::Directive &Directive : Directives) { |
985 | 376 | if (Directive.Kind == tokens_present_before_eof) |
986 | 26 | OS << "<TokBeforeEOF>"; |
987 | 376 | std::optional<tok::TokenKind> PrevTokenKind; |
988 | 1.08k | for (const dependency_directives_scan::Token &Tok : Directive.Tokens) { |
989 | 1.08k | if (PrevTokenKind && needsSpaceSeparator(*PrevTokenKind, Tok)867 ) |
990 | 240 | OS << ' '; |
991 | 1.08k | PrevTokenKind = Tok.Kind; |
992 | 1.08k | OS << Source.slice(Tok.Offset, Tok.getEnd()); |
993 | 1.08k | } |
994 | 376 | } |
995 | 132 | } |