Coverage Report

Created: 2023-09-21 18:56

/Users/buildslave/jenkins/workspace/coverage/llvm-project/clang/lib/Lex/DependencyDirectivesScanner.cpp
Line
Count
Source (jump to first uncovered line)
1
//===- DependencyDirectivesScanner.cpp ------------------------------------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
///
9
/// \file
10
/// This is the interface for scanning header and source files to get the
11
/// minimum necessary preprocessor directives for evaluating includes. It
12
/// reduces the source down to #define, #include, #import, @import, and any
13
/// conditional preprocessor logic that contains one of those.
14
///
15
//===----------------------------------------------------------------------===//
16
17
#include "clang/Lex/DependencyDirectivesScanner.h"
18
#include "clang/Basic/CharInfo.h"
19
#include "clang/Basic/Diagnostic.h"
20
#include "clang/Lex/LexDiagnostic.h"
21
#include "clang/Lex/Lexer.h"
22
#include "clang/Lex/Pragma.h"
23
#include "llvm/ADT/ScopeExit.h"
24
#include "llvm/ADT/SmallString.h"
25
#include "llvm/ADT/StringMap.h"
26
#include "llvm/ADT/StringSwitch.h"
27
#include <optional>
28
29
using namespace clang;
30
using namespace clang::dependency_directives_scan;
31
using namespace llvm;
32
33
namespace {
34
35
struct DirectiveWithTokens {
36
  DirectiveKind Kind;
37
  unsigned NumTokens;
38
39
  DirectiveWithTokens(DirectiveKind Kind, unsigned NumTokens)
40
57.0k
      : Kind(Kind), NumTokens(NumTokens) {}
41
};
42
43
/// Does an efficient "scan" of the sources to detect the presence of
44
/// preprocessor (or module import) directives and collects the raw lexed tokens
45
/// for those directives so that the \p Lexer can "replay" them when the file is
46
/// included.
47
///
48
/// Note that the behavior of the raw lexer is affected by the language mode,
49
/// while at this point we want to do a scan and collect tokens once,
50
/// irrespective of the language mode that the file will get included in. To
51
/// compensate for that the \p Lexer, while "replaying", will adjust a token
52
/// where appropriate, when it could affect the preprocessor's state.
53
/// For example in a directive like
54
///
55
/// \code
56
///   #if __has_cpp_attribute(clang::fallthrough)
57
/// \endcode
58
///
59
/// The preprocessor needs to see '::' as 'tok::coloncolon' instead of 2
60
/// 'tok::colon'. The \p Lexer will adjust if it sees consecutive 'tok::colon'
61
/// while in C++ mode.
62
struct Scanner {
63
  Scanner(StringRef Input,
64
          SmallVectorImpl<dependency_directives_scan::Token> &Tokens,
65
          DiagnosticsEngine *Diags, SourceLocation InputSourceLoc)
66
1.32k
      : Input(Input), Tokens(Tokens), Diags(Diags),
67
1.32k
        InputSourceLoc(InputSourceLoc), LangOpts(getLangOptsForDepScanning()),
68
1.32k
        TheLexer(InputSourceLoc, LangOpts, Input.begin(), Input.begin(),
69
1.32k
                 Input.end()) {}
70
71
1.32k
  static LangOptions getLangOptsForDepScanning() {
72
1.32k
    LangOptions LangOpts;
73
    // Set the lexer to use 'tok::at' for '@', instead of 'tok::unknown'.
74
1.32k
    LangOpts.ObjC = true;
75
1.32k
    LangOpts.LineComment = true;
76
    // FIXME: we do not enable C11 or C++11, so we are missing u/u8/U"" and
77
    // R"()" literals.
78
1.32k
    return LangOpts;
79
1.32k
  }
80
81
  /// Lex the provided source and emit the directive tokens.
82
  ///
83
  /// \returns True on error.
84
  bool scan(SmallVectorImpl<Directive> &Directives);
85
86
private:
87
  /// Lexes next token and advances \p First and the \p Lexer.
88
  [[nodiscard]] dependency_directives_scan::Token &
89
  lexToken(const char *&First, const char *const End);
90
91
  dependency_directives_scan::Token &lexIncludeFilename(const char *&First,
92
                                                        const char *const End);
93
94
  void skipLine(const char *&First, const char *const End);
95
  void skipDirective(StringRef Name, const char *&First, const char *const End);
96
97
  /// Returns the spelling of a string literal or identifier after performing
98
  /// any processing needed to handle \c clang::Token::NeedsCleaning.
99
  StringRef cleanStringIfNeeded(const dependency_directives_scan::Token &Tok);
100
101
  /// Lexes next token and if it is identifier returns its string, otherwise
102
  /// it skips the current line and returns \p std::nullopt.
103
  ///
104
  /// In any case (whatever the token kind) \p First and the \p Lexer will
105
  /// advance beyond the token.
106
  [[nodiscard]] std::optional<StringRef>
107
  tryLexIdentifierOrSkipLine(const char *&First, const char *const End);
108
109
  /// Used when it is certain that next token is an identifier.
110
  [[nodiscard]] StringRef lexIdentifier(const char *&First,
111
                                        const char *const End);
112
113
  /// Lexes next token and returns true iff it is an identifier that matches \p
114
  /// Id, otherwise it skips the current line and returns false.
115
  ///
116
  /// In any case (whatever the token kind) \p First and the \p Lexer will
117
  /// advance beyond the token.
118
  [[nodiscard]] bool isNextIdentifierOrSkipLine(StringRef Id,
119
                                                const char *&First,
120
                                                const char *const End);
121
122
  /// Lexes next token and returns true iff it matches the kind \p K.
123
  /// Otherwise it skips the current line and returns false.
124
  ///
125
  /// In any case (whatever the token kind) \p First and the \p Lexer will
126
  /// advance beyond the token.
127
  [[nodiscard]] bool isNextTokenOrSkipLine(tok::TokenKind K, const char *&First,
128
                                           const char *const End);
129
130
  /// Lexes next token and if it is string literal, returns its string.
131
  /// Otherwise, it skips the current line and returns \p std::nullopt.
132
  ///
133
  /// In any case (whatever the token kind) \p First and the \p Lexer will
134
  /// advance beyond the token.
135
  [[nodiscard]] std::optional<StringRef>
136
  tryLexStringLiteralOrSkipLine(const char *&First, const char *const End);
137
138
  [[nodiscard]] bool scanImpl(const char *First, const char *const End);
139
  [[nodiscard]] bool lexPPLine(const char *&First, const char *const End);
140
  [[nodiscard]] bool lexAt(const char *&First, const char *const End);
141
  [[nodiscard]] bool lexModule(const char *&First, const char *const End);
142
  [[nodiscard]] bool lexDefine(const char *HashLoc, const char *&First,
143
                               const char *const End);
144
  [[nodiscard]] bool lexPragma(const char *&First, const char *const End);
145
  [[nodiscard]] bool lex_Pragma(const char *&First, const char *const End);
146
  [[nodiscard]] bool lexEndif(const char *&First, const char *const End);
147
  [[nodiscard]] bool lexDefault(DirectiveKind Kind, const char *&First,
148
                                const char *const End);
149
  [[nodiscard]] bool lexModuleDirectiveBody(DirectiveKind Kind,
150
                                            const char *&First,
151
                                            const char *const End);
152
  void lexPPDirectiveBody(const char *&First, const char *const End);
153
154
57.0k
  DirectiveWithTokens &pushDirective(DirectiveKind Kind) {
155
57.0k
    Tokens.append(CurDirToks);
156
57.0k
    DirsWithToks.emplace_back(Kind, CurDirToks.size());
157
57.0k
    CurDirToks.clear();
158
57.0k
    return DirsWithToks.back();
159
57.0k
  }
160
3.60k
  void popDirective() {
161
3.60k
    Tokens.pop_back_n(DirsWithToks.pop_back_val().NumTokens);
162
3.60k
  }
163
24.5k
  DirectiveKind topDirective() const {
164
24.5k
    return DirsWithToks.empty() ? 
pp_none28
:
DirsWithToks.back().Kind24.4k
;
165
24.5k
  }
166
167
76.7k
  unsigned getOffsetAt(const char *CurPtr) const {
168
76.7k
    return CurPtr - Input.data();
169
76.7k
  }
170
171
  /// Reports a diagnostic if the diagnostic engine is provided. Always returns
172
  /// true at the end.
173
  bool reportError(const char *CurPtr, unsigned Err);
174
175
  StringMap<char> SplitIds;
176
  StringRef Input;
177
  SmallVectorImpl<dependency_directives_scan::Token> &Tokens;
178
  DiagnosticsEngine *Diags;
179
  SourceLocation InputSourceLoc;
180
181
  const char *LastTokenPtr = nullptr;
182
  /// Keeps track of the tokens for the currently lexed directive. Once a
183
  /// directive is fully lexed and "committed" then the tokens get appended to
184
  /// \p Tokens and \p CurDirToks is cleared for the next directive.
185
  SmallVector<dependency_directives_scan::Token, 32> CurDirToks;
186
  /// The directives that were lexed along with the number of tokens that each
187
  /// directive contains. The tokens of all the directives are kept in \p Tokens
188
  /// vector, in the same order as the directives order in \p DirsWithToks.
189
  SmallVector<DirectiveWithTokens, 64> DirsWithToks;
190
  LangOptions LangOpts;
191
  Lexer TheLexer;
192
};
193
194
} // end anonymous namespace
195
196
3
bool Scanner::reportError(const char *CurPtr, unsigned Err) {
197
3
  if (!Diags)
198
1
    return true;
199
2
  assert(CurPtr >= Input.data() && "invalid buffer ptr");
200
2
  Diags->Report(InputSourceLoc.getLocWithOffset(getOffsetAt(CurPtr)), Err);
201
2
  return true;
202
2
}
203
204
159k
static void skipOverSpaces(const char *&First, const char *const End) {
205
362k
  while (First != End && 
isHorizontalWhitespace(*First)362k
)
206
203k
    ++First;
207
159k
}
208
209
[[nodiscard]] static bool isRawStringLiteral(const char *First,
210
2.28k
                                             const char *Current) {
211
2.28k
  assert(First <= Current);
212
213
  // Check if we can even back up.
214
2.28k
  if (*Current != '"' || 
First == Current2.27k
)
215
15
    return false;
216
217
  // Check for an "R".
218
2.26k
  --Current;
219
2.26k
  if (*Current != 'R')
220
2.26k
    return false;
221
3
  if (First == Current || 
!isAsciiIdentifierContinue(*--Current)2
)
222
3
    return true;
223
224
  // Check for a prefix of "u", "U", or "L".
225
0
  if (*Current == 'u' || *Current == 'U' || *Current == 'L')
226
0
    return First == Current || !isAsciiIdentifierContinue(*--Current);
227
228
  // Check for a prefix of "u8".
229
0
  if (*Current != '8' || First == Current || *Current-- != 'u')
230
0
    return false;
231
0
  return First == Current || !isAsciiIdentifierContinue(*--Current);
232
0
}
233
234
3
static void skipRawString(const char *&First, const char *const End) {
235
3
  assert(First[0] == '"');
236
3
  assert(First[-1] == 'R');
237
238
3
  const char *Last = ++First;
239
6
  while (Last != End && *Last != '(')
240
3
    ++Last;
241
3
  if (Last == End) {
242
0
    First = Last; // Hit the end... just give up.
243
0
    return;
244
0
  }
245
246
3
  StringRef Terminator(First, Last - First);
247
3
  for (;;) {
248
    // Move First to just past the next ")".
249
3
    First = Last;
250
46
    while (First != End && *First != ')')
251
43
      ++First;
252
3
    if (First == End)
253
0
      return;
254
3
    ++First;
255
256
    // Look ahead for the terminator sequence.
257
3
    Last = First;
258
6
    while (Last != End && size_t(Last - First) < Terminator.size() &&
259
6
           
Terminator[Last - First] == *Last3
)
260
3
      ++Last;
261
262
    // Check if we hit it (or the end of the file).
263
3
    if (Last == End) {
264
0
      First = Last;
265
0
      return;
266
0
    }
267
3
    if (size_t(Last - First) < Terminator.size())
268
0
      continue;
269
3
    if (*Last != '"')
270
0
      continue;
271
3
    First = Last + 1;
272
3
    return;
273
3
  }
274
3
}
275
276
// Returns the length of EOL, either 0 (no end-of-line), 1 (\n) or 2 (\r\n)
277
254k
static unsigned isEOL(const char *First, const char *const End) {
278
254k
  if (First == End)
279
0
    return 0;
280
254k
  if (End - First > 1 && 
isVerticalWhitespace(First[0])254k
&&
281
254k
      
isVerticalWhitespace(First[1])95.6k
&&
First[0] != First[1]13.8k
)
282
10
    return 2;
283
254k
  return !!isVerticalWhitespace(First[0]);
284
254k
}
285
286
2.27k
static void skipString(const char *&First, const char *const End) {
287
2.27k
  assert(*First == '\'' || *First == '"' || *First == '<');
288
2.27k
  const char Terminator = *First == '<' ? 
'>'0
: *First;
289
49.2k
  for (++First; First != End && *First != Terminator; 
++First46.9k
) {
290
    // String and character literals don't extend past the end of the line.
291
46.9k
    if (isVerticalWhitespace(*First))
292
0
      return;
293
46.9k
    if (*First != '\\')
294
46.9k
      continue;
295
    // Skip past backslash to the next character. This ensures that the
296
    // character right after it is skipped as well, which matters if it's
297
    // the terminator.
298
2
    if (++First == End)
299
0
      return;
300
2
    if (!isWhitespace(*First))
301
2
      continue;
302
    // Whitespace after the backslash might indicate a line continuation.
303
0
    const char *FirstAfterBackslashPastSpace = First;
304
0
    skipOverSpaces(FirstAfterBackslashPastSpace, End);
305
0
    if (unsigned NLSize = isEOL(FirstAfterBackslashPastSpace, End)) {
306
      // Advance the character pointer to the next line for the next
307
      // iteration.
308
0
      First = FirstAfterBackslashPastSpace + NLSize - 1;
309
0
    }
310
0
  }
311
2.27k
  if (First != End)
312
2.27k
    ++First; // Finish off the string.
313
2.27k
}
314
315
// Returns the length of the skipped newline
316
92.0k
static unsigned skipNewline(const char *&First, const char *End) {
317
92.0k
  if (First == End)
318
0
    return 0;
319
92.0k
  assert(isVerticalWhitespace(*First));
320
92.0k
  unsigned Len = isEOL(First, End);
321
92.0k
  assert(Len && "expected newline");
322
92.0k
  First += Len;
323
92.0k
  return Len;
324
92.0k
}
325
326
48.8k
/// \p First points just past a newline of length \p EOLLen.
/// \returns true if the character immediately before that newline is a
/// backslash.
static bool wasLineContinuation(const char *First, unsigned EOLLen) {
  const char *const BeforeNewline = First - (int)EOLLen - 1;
  return *BeforeNewline == '\\';
}
329
330
3.98k
static void skipToNewlineRaw(const char *&First, const char *const End) {
331
4.24k
  for (;;) {
332
4.24k
    if (First == End)
333
0
      return;
334
335
4.24k
    unsigned Len = isEOL(First, End);
336
4.24k
    if (Len)
337
110
      return;
338
339
161k
    
do 4.13k
{
340
161k
      if (++First == End)
341
0
        return;
342
161k
      Len = isEOL(First, End);
343
161k
    } while (!Len);
344
345
4.13k
    if (First[-1] != '\\')
346
3.82k
      return;
347
348
309
    First += Len;
349
    // Keep skipping lines...
350
309
  }
351
3.98k
}
352
353
3.84k
static void skipLineComment(const char *&First, const char *const End) {
354
3.84k
  assert(First[0] == '/' && First[1] == '/');
355
3.84k
  First += 2;
356
3.84k
  skipToNewlineRaw(First, End);
357
3.84k
}
358
359
17.1k
/// Skips a "/* ... */" comment. On entry \p First points at the opening '/';
/// on return it is just past the closing "*/", or at \p End when the comment
/// is not terminated.
static void skipBlockComment(const char *&First, const char *const End) {
  assert(First[0] == '/' && First[1] == '*');

  // Fewer than four characters cannot hold both "/*" and "*/".
  if (End - First < 4) {
    First = End;
    return;
  }

  // Look for a '/' that directly follows a '*'. Starting at offset 3 keeps the
  // '*' of the opener from being paired with a '/' right after it ("/*/").
  const char *Slash = First + 3;
  while (Slash != End && !(Slash[-1] == '*' && Slash[0] == '/'))
    ++Slash;

  First = (Slash == End) ? End : Slash + 1;
}
371
372
/// \returns True if the current single quotation mark character is a C++ 14
373
/// digit separator.
374
static bool isQuoteCppDigitSeparator(const char *const Start,
375
                                     const char *const Cur,
376
11
                                     const char *const End) {
377
11
  assert(*Cur == '\'' && "expected quotation character");
378
  // skipLine called in places where we don't expect a valid number
379
  // body before `start` on the same line, so always return false at the start.
380
11
  if (Start == Cur)
381
0
    return false;
382
  // The previous character must be a valid PP number character.
383
  // Make sure that the L, u, U, u8 prefixes don't get marked as a
384
  // separator though.
385
11
  char Prev = *(Cur - 1);
386
11
  if (Prev == 'L' || 
Prev == 'U'10
||
Prev == 'u'9
)
387
3
    return false;
388
8
  if (Prev == '8' && 
(Cur - 1 != Start)2
&&
*(Cur - 2) == 'u'2
)
389
1
    return false;
390
7
  if (!isPreprocessingNumberBody(Prev))
391
3
    return false;
392
  // The next character should be a valid identifier body character.
393
4
  return (Cur + 1) < End && isAsciiIdentifierContinue(*(Cur + 1));
394
7
}
395
396
92.0k
void Scanner::skipLine(const char *&First, const char *const End) {
397
92.0k
  for (;;) {
398
92.0k
    assert(First <= End);
399
92.0k
    if (First == End)
400
9
      return;
401
402
92.0k
    if (isVerticalWhitespace(*First)) {
403
43.1k
      skipNewline(First, End);
404
43.1k
      return;
405
43.1k
    }
406
48.8k
    const char *Start = First;
407
1.54M
    while (First != End && 
!isVerticalWhitespace(*First)1.54M
) {
408
      // Iterate over strings correctly to avoid comments and newlines.
409
1.49M
      if (*First == '"' ||
410
1.49M
          
(1.48M
*First == '\''1.48M
&&
!isQuoteCppDigitSeparator(Start, First, End)11
)) {
411
2.28k
        LastTokenPtr = First;
412
2.28k
        if (isRawStringLiteral(Start, First))
413
3
          skipRawString(First, End);
414
2.27k
        else
415
2.27k
          skipString(First, End);
416
2.28k
        continue;
417
2.28k
      }
418
419
      // Iterate over comments correctly.
420
1.48M
      if (*First != '/' || 
End - First < 26.16k
) {
421
1.48M
        LastTokenPtr = First;
422
1.48M
        ++First;
423
1.48M
        continue;
424
1.48M
      }
425
426
6.16k
      if (First[1] == '/') {
427
        // "//...".
428
299
        skipLineComment(First, End);
429
299
        continue;
430
299
      }
431
432
5.87k
      if (First[1] != '*') {
433
6
        LastTokenPtr = First;
434
6
        ++First;
435
6
        continue;
436
6
      }
437
438
      // "/*...*/".
439
5.86k
      skipBlockComment(First, End);
440
5.86k
    }
441
48.8k
    if (First == End)
442
6
      return;
443
444
    // Skip over the newline.
445
48.8k
    unsigned Len = skipNewline(First, End);
446
48.8k
    if (!wasLineContinuation(First, Len)) // Continue past line-continuations.
447
48.8k
      break;
448
48.8k
  }
449
92.0k
}
450
451
/// Skips the remainder of the directive called \p Name. For "warning" and
/// "error" the text is skipped raw; every other directive is skipped with
/// skipLine.
void Scanner::skipDirective(StringRef Name, const char *&First,
                            const char *const End) {
  const bool HasFreeFormText = Name == "warning" || Name == "error";
  if (!HasFreeFormText) {
    skipLine(First, End);
    return;
  }
  // Do not process quotes or comments.
  skipToNewlineRaw(First, End);
}
462
463
147k
static void skipWhitespace(const char *&First, const char *const End) {
464
159k
  for (;;) {
465
159k
    assert(First <= End);
466
159k
    skipOverSpaces(First, End);
467
468
159k
    if (End - First < 2)
469
200
      return;
470
471
159k
    if (First[0] == '\\' && 
isVerticalWhitespace(First[1])1
) {
472
1
      skipNewline(++First, End);
473
1
      continue;
474
1
    }
475
476
    // Check for a non-comment character.
477
159k
    if (First[0] != '/')
478
144k
      return;
479
480
    // "// ...".
481
14.8k
    if (First[1] == '/') {
482
3.54k
      skipLineComment(First, End);
483
3.54k
      return;
484
3.54k
    }
485
486
    // Cannot be a comment.
487
11.2k
    if (First[1] != '*')
488
0
      return;
489
490
    // "/*...*/".
491
11.2k
    skipBlockComment(First, End);
492
11.2k
  }
493
147k
}
494
495
/// Lexes tokens up to and including the terminating ';' and commits them as a
/// directive of kind \p Kind. Afterwards only whitespace/comments may follow
/// on the same line.
///
/// \returns true (after reporting) if end of file is reached before a ';' or
/// if something other than a newline follows the ';'; false otherwise.
bool Scanner::lexModuleDirectiveBody(DirectiveKind Kind, const char *&First,
                                     const char *const End) {
  // Errors are reported at the first token of the directive being lexed.
  const char *const DirectiveLoc = Input.data() + CurDirToks.front().Offset;

  bool SawSemi = false;
  while (!SawSemi) {
    const dependency_directives_scan::Token &Tok = lexToken(First, End);
    if (Tok.is(tok::eof))
      return reportError(
          DirectiveLoc,
          diag::err_dep_source_scanner_missing_semi_after_at_import);
    SawSemi = Tok.is(tok::semi);
  }

  pushDirective(Kind);
  skipWhitespace(First, End);
  if (First == End)
    return false;

  if (isVerticalWhitespace(*First)) {
    skipNewline(First, End);
    return false;
  }
  return reportError(DirectiveLoc,
                     diag::err_dep_source_scanner_unexpected_tokens_at_import);
}
517
518
/// Lexes the next raw token, records it in \p CurDirToks and moves \p First to
/// the lexer's new position.
///
/// \returns a reference to the recorded token.
dependency_directives_scan::Token &Scanner::lexToken(const char *&First,
                                                     const char *const End) {
  clang::Token RawTok;
  TheLexer.LexFromRawLexer(RawTok);

  // The lexer now sits just past the token.
  First = Input.data() + TheLexer.getCurrentBufferOffset();
  assert(First <= End);

  // The token's own offset is its length back from the lexer position.
  const unsigned TokLen = RawTok.getLength();
  const unsigned TokOffset = TheLexer.getCurrentBufferOffset() - TokLen;
  CurDirToks.emplace_back(TokOffset, TokLen, RawTok.getKind(),
                          RawTok.getFlags());
  return CurDirToks.back();
}
530
531
/// Same bookkeeping as lexToken, but the token is produced by the lexer's
/// include-filename mode (Lexer::LexIncludeFilename).
///
/// \returns a reference to the token recorded in \p CurDirToks.
dependency_directives_scan::Token &
Scanner::lexIncludeFilename(const char *&First, const char *const End) {
  clang::Token RawTok;
  TheLexer.LexIncludeFilename(RawTok);

  // The lexer now sits just past the token.
  First = Input.data() + TheLexer.getCurrentBufferOffset();
  assert(First <= End);

  // The token's own offset is its length back from the lexer position.
  const unsigned TokLen = RawTok.getLength();
  const unsigned TokOffset = TheLexer.getCurrentBufferOffset() - TokLen;
  CurDirToks.emplace_back(TokOffset, TokLen, RawTok.getKind(),
                          RawTok.getFlags());
  return CurDirToks.back();
}
543
544
55.6k
void Scanner::lexPPDirectiveBody(const char *&First, const char *const End) {
545
332k
  while (
true332k
) {
546
332k
    const dependency_directives_scan::Token &Tok = lexToken(First, End);
547
332k
    if (Tok.is(tok::eod))
548
55.6k
      break;
549
332k
  }
550
55.6k
}
551
552
/// Returns the spelling of \p Tok. When the token is not flagged
/// \c NeedsCleaning this is a slice of \p Input; otherwise the characters are
/// re-read one by one through the lexer and the result is stored in
/// \p SplitIds, which owns the returned string.
StringRef
Scanner::cleanStringIfNeeded(const dependency_directives_scan::Token &Tok) {
  const bool IsDirty = Tok.Flags & clang::Token::NeedsCleaning;
  if (LLVM_LIKELY(!IsDirty))
    return Input.slice(Tok.Offset, Tok.getEnd());

  // FIXME: C++11 raw string literals need special handling (see getSpellingSlow
  // in the Lexer). Currently we cannot see them due to our LangOpts.

  SmallString<64> Cleaned;
  Cleaned.reserve(Tok.Length);

  const char *const TokEnd = Input.begin() + Tok.getEnd();
  for (const char *Cur = Input.begin() + Tok.Offset; Cur < TokEnd;) {
    // Each step yields one cleaned character and the number of source bytes
    // it occupied.
    unsigned CharSize;
    Cleaned.push_back(Lexer::getCharAndSizeNoWarn(Cur, CharSize, LangOpts));
    Cur += CharSize;
  }

  // Intern the spelling so the returned reference outlives this function.
  return SplitIds.try_emplace(StringRef(Cleaned.begin(), Cleaned.size()), 0)
      .first->first();
}
577
578
std::optional<StringRef>
579
79.0k
Scanner::tryLexIdentifierOrSkipLine(const char *&First, const char *const End) {
580
79.0k
  const dependency_directives_scan::Token &Tok = lexToken(First, End);
581
79.0k
  if (Tok.isNot(tok::raw_identifier)) {
582
5
    if (!Tok.is(tok::eod))
583
0
      skipLine(First, End);
584
5
    return std::nullopt;
585
5
  }
586
587
79.0k
  return cleanStringIfNeeded(Tok);
588
79.0k
}
589
590
11.3k
/// Lexes the next token, which the caller knows to be an identifier, and
/// returns its spelling.
StringRef Scanner::lexIdentifier(const char *&First, const char *const End) {
  const std::optional<StringRef> Spelling =
      tryLexIdentifierOrSkipLine(First, End);
  assert(Spelling && "expected identifier token");
  return *Spelling;
}
595
596
/// Lexes the next token and returns true iff it is an identifier spelled
/// \p Id. When it is a different identifier the rest of the line is skipped
/// here; when it is not an identifier at all, tryLexIdentifierOrSkipLine has
/// already dealt with the line.
bool Scanner::isNextIdentifierOrSkipLine(StringRef Id, const char *&First,
                                         const char *const End) {
  const std::optional<StringRef> Lexed =
      tryLexIdentifierOrSkipLine(First, End);
  if (!Lexed)
    return false;
  if (*Lexed == Id)
    return true;
  skipLine(First, End);
  return false;
}
606
607
/// Lexes the next token and returns true iff it has kind \p K; on a mismatch
/// the rest of the line is skipped.
bool Scanner::isNextTokenOrSkipLine(tok::TokenKind K, const char *&First,
                                    const char *const End) {
  const bool Matches = lexToken(First, End).is(K);
  if (!Matches)
    skipLine(First, End);
  return Matches;
}
615
616
std::optional<StringRef>
617
Scanner::tryLexStringLiteralOrSkipLine(const char *&First,
618
43
                                       const char *const End) {
619
43
  const dependency_directives_scan::Token &Tok = lexToken(First, End);
620
43
  if (!tok::isStringLiteral(Tok.Kind)) {
621
5
    if (!Tok.is(tok::eod))
622
5
      skipLine(First, End);
623
5
    return std::nullopt;
624
5
  }
625
626
38
  return cleanStringIfNeeded(Tok);
627
43
}
628
629
28
bool Scanner::lexAt(const char *&First, const char *const End) {
630
  // Handle "@import".
631
632
  // Lex '@'.
633
28
  const dependency_directives_scan::Token &AtTok = lexToken(First, End);
634
28
  assert(AtTok.is(tok::at));
635
28
  (void)AtTok;
636
637
28
  if (!isNextIdentifierOrSkipLine("import", First, End))
638
0
    return false;
639
28
  return lexModuleDirectiveBody(decl_at_import, First, End);
640
28
}
641
642
11.3k
bool Scanner::lexModule(const char *&First, const char *const End) {
643
11.3k
  StringRef Id = lexIdentifier(First, End);
644
11.3k
  bool Export = false;
645
11.3k
  if (Id == "export") {
646
4
    Export = true;
647
4
    std::optional<StringRef> NextId = tryLexIdentifierOrSkipLine(First, End);
648
4
    if (!NextId)
649
0
      return false;
650
4
    Id = *NextId;
651
4
  }
652
653
11.3k
  if (Id != "module" && 
Id != "import"11.3k
) {
654
11.2k
    skipLine(First, End);
655
11.2k
    return false;
656
11.2k
  }
657
658
30
  skipWhitespace(First, End);
659
660
  // Ignore this as a module directive if the next character can't be part of
661
  // an import.
662
663
30
  switch (*First) {
664
1
  case ':':
665
2
  case '<':
666
2
  case '"':
667
2
    break;
668
28
  default:
669
28
    if (!isAsciiIdentifierContinue(*First)) {
670
9
      skipLine(First, End);
671
9
      return false;
672
9
    }
673
30
  }
674
675
21
  TheLexer.seek(getOffsetAt(First), /*IsAtStartOfLine*/ false);
676
677
21
  DirectiveKind Kind;
678
21
  if (Id == "module")
679
5
    Kind = Export ? 
cxx_export_module_decl2
:
cxx_module_decl3
;
680
16
  else
681
16
    Kind = Export ? 
cxx_export_import_decl1
:
cxx_import_decl15
;
682
683
21
  return lexModuleDirectiveBody(Kind, First, End);
684
30
}
685
686
44
bool Scanner::lex_Pragma(const char *&First, const char *const End) {
687
44
  if (!isNextTokenOrSkipLine(tok::l_paren, First, End))
688
1
    return false;
689
690
43
  std::optional<StringRef> Str = tryLexStringLiteralOrSkipLine(First, End);
691
692
43
  if (!Str || 
!isNextTokenOrSkipLine(tok::r_paren, First, End)38
)
693
5
    return false;
694
695
38
  SmallString<64> Buffer(*Str);
696
38
  prepare_PragmaString(Buffer);
697
698
  // Use a new scanner instance since the tokens will be inside the allocated
699
  // string. We should already have captured all the relevant tokens in the
700
  // current scanner.
701
38
  SmallVector<dependency_directives_scan::Token> DiscardTokens;
702
38
  const char *Begin = Buffer.c_str();
703
38
  Scanner PragmaScanner{StringRef(Begin, Buffer.size()), DiscardTokens, Diags,
704
38
                        InputSourceLoc};
705
706
38
  PragmaScanner.TheLexer.setParsingPreprocessorDirective(true);
707
38
  if (PragmaScanner.lexPragma(Begin, Buffer.end()))
708
0
    return true;
709
710
38
  DirectiveKind K = PragmaScanner.topDirective();
711
38
  if (K == pp_none) {
712
28
    skipLine(First, End);
713
28
    return false;
714
28
  }
715
716
10
  assert(Begin == Buffer.end());
717
10
  pushDirective(K);
718
10
  return false;
719
10
}
720
721
2.30k
bool Scanner::lexPragma(const char *&First, const char *const End) {
722
2.30k
  std::optional<StringRef> FoundId = tryLexIdentifierOrSkipLine(First, End);
723
2.30k
  if (!FoundId)
724
0
    return false;
725
726
2.30k
  StringRef Id = *FoundId;
727
2.30k
  auto Kind = llvm::StringSwitch<DirectiveKind>(Id)
728
2.30k
                  .Case("once", pp_pragma_once)
729
2.30k
                  .Case("push_macro", pp_pragma_push_macro)
730
2.30k
                  .Case("pop_macro", pp_pragma_pop_macro)
731
2.30k
                  .Case("include_alias", pp_pragma_include_alias)
732
2.30k
                  .Default(pp_none);
733
2.30k
  if (Kind != pp_none) {
734
37
    lexPPDirectiveBody(First, End);
735
37
    pushDirective(Kind);
736
37
    return false;
737
37
  }
738
739
2.26k
  if (Id != "clang") {
740
2.22k
    skipLine(First, End);
741
2.22k
    return false;
742
2.22k
  }
743
744
40
  FoundId = tryLexIdentifierOrSkipLine(First, End);
745
40
  if (!FoundId)
746
2
    return false;
747
38
  Id = *FoundId;
748
749
  // #pragma clang system_header
750
38
  if (Id == "system_header") {
751
1
    lexPPDirectiveBody(First, End);
752
1
    pushDirective(pp_pragma_system_header);
753
1
    return false;
754
1
  }
755
756
37
  if (Id != "module") {
757
27
    skipLine(First, End);
758
27
    return false;
759
27
  }
760
761
  // #pragma clang module.
762
10
  if (!isNextIdentifierOrSkipLine("import", First, End))
763
4
    return false;
764
765
  // #pragma clang module import.
766
6
  lexPPDirectiveBody(First, End);
767
6
  pushDirective(pp_pragma_import);
768
6
  return false;
769
10
}
770
771
9.09k
bool Scanner::lexEndif(const char *&First, const char *const End) {
772
  // Strip out "#else" if it's empty.
773
9.09k
  if (topDirective() == pp_else)
774
727
    popDirective();
775
776
  // If "#ifdef" is empty, strip it and skip the "#endif".
777
  //
778
  // FIXME: Once/if Clang starts disallowing __has_include in macro expansions,
779
  // we can skip empty `#if` and `#elif` blocks as well after scanning for a
780
  // literal __has_include in the condition.  Even without that rule we could
781
  // drop the tokens if we scan for identifiers in the condition and find none.
782
9.09k
  if (topDirective() == pp_ifdef || 
topDirective() == pp_ifndef6.27k
) {
783
2.87k
    popDirective();
784
2.87k
    skipLine(First, End);
785
2.87k
    return false;
786
2.87k
  }
787
788
6.22k
  return lexDefault(pp_endif, First, End);
789
9.09k
}
790
791
/// Lexes the remaining tokens of the current directive and commits them as a
/// directive of kind \p Kind.
///
/// \returns false unconditionally.
bool Scanner::lexDefault(DirectiveKind Kind, const char *&First,
                         const char *const End) {
  // Collect everything up to the end of the directive...
  lexPPDirectiveBody(First, End);
  // ...and record it under the requested kind.
  (void)pushDirective(Kind);
  return false;
}
797
798
147k
/// \returns true if \p First is one of the characters '#', '@', 'i', 'e',
/// 'm' or '_'.
static bool isStartOfRelevantLine(char First) {
  return First == '#' || First == '@' || First == 'i' || First == 'e' ||
         First == 'm' || First == '_';
}
810
811
147k
/// Scans a single line beginning at \p First.
///
/// After leading whitespace is skipped, the first character selects the
/// handler: '@' goes to lexAt, 'i'/'e'/'m' go to lexModule, '_' is checked for
/// the identifier "_Pragma" (lex_Pragma), and '#' is lexed as a preprocessing
/// directive. A line starting with any other character is skipped.
///
/// \param First Current scan position; must not equal \p End on entry.
/// \param End   End of the input buffer.
/// \returns the result of the selected handler; the paths handled directly in
/// this function return false. scanImpl stops and reports an error when this
/// returns true.
bool Scanner::lexPPLine(const char *&First, const char *const End) {
812
147k
  assert(First != End);
813
814
147k
  skipWhitespace(First, End);
815
147k
  assert(First <= End);
816
  // Nothing but whitespace was left in the input.
147k
  if (First == End)
817
8
    return false;
818
819
147k
  if (!isStartOfRelevantLine(*First)) {
820
71.1k
    skipLine(First, End);
821
71.1k
    assert(First <= End);
822
71.1k
    return false;
823
71.1k
  }
824
825
  // Record where this line starts; scan() reads LastTokenPtr when deciding
  // whether to emit tokens_present_before_eof.
76.6k
  LastTokenPtr = First;
826
827
76.6k
  TheLexer.seek(getOffsetAt(First), /*IsAtStartOfLine*/ true);
828
829
76.6k
  auto ScEx1 = make_scope_exit([&]() {
830
    /// Clear Scanner's CurDirToks before returning, in case we didn't push a
831
    /// new directive.
832
76.6k
    CurDirToks.clear();
833
76.6k
  });
834
835
  // Handle "@import".
836
76.6k
  if (*First == '@')
837
28
    return lexAt(First, End);
838
839
  // Presumably the first letters of `import`, `export` and `module`; the
  // actual keyword check is left to lexModule.
76.6k
  if (*First == 'i' || 
*First == 'e'72.3k
||
*First == 'm'69.3k
)
840
11.3k
    return lexModule(First, End);
841
842
65.3k
  if (*First == '_') {
843
4.41k
    if (isNextIdentifierOrSkipLine("_Pragma", First, End))
844
44
      return lex_Pragma(First, End);
845
4.36k
    return false;
846
4.41k
  }
847
848
  // Handle preprocessing directives.
849
850
60.9k
  TheLexer.setParsingPreprocessorDirective(true);
851
  // Resets the lexer's directive-parsing flag on every return path below.
60.9k
  auto ScEx2 = make_scope_exit(
852
60.9k
      [&]() 
{ TheLexer.setParsingPreprocessorDirective(false); }60.9k
);
853
854
  // Lex '#'.
855
60.9k
  const dependency_directives_scan::Token &HashTok = lexToken(First, End);
856
60.9k
  if (HashTok.is(tok::hashhash)) {
857
    // A \p tok::hashhash at this location is passed by the preprocessor to the
858
    // parser to interpret, like any other token. So for dependency scanning
859
    // skip it like a normal token not affecting the preprocessor.
860
2
    skipLine(First, End);
861
2
    assert(First <= End);
862
2
    return false;
863
2
  }
864
60.9k
  assert(HashTok.is(tok::hash));
865
  // HashTok is otherwise only read inside the assert above.
60.9k
  (void)HashTok;
866
867
60.9k
  std::optional<StringRef> FoundId = tryLexIdentifierOrSkipLine(First, End);
868
60.9k
  if (!FoundId)
869
1
    return false;
870
871
60.9k
  StringRef Id = *FoundId;
872
873
60.9k
  if (Id == "pragma")
874
2.26k
    return lexPragma(First, End);
875
876
  // Map the directive name to its kind; names not listed yield pp_none.
58.6k
  auto Kind = llvm::StringSwitch<DirectiveKind>(Id)
877
58.6k
                  .Case("include", pp_include)
878
58.6k
                  .Case("__include_macros", pp___include_macros)
879
58.6k
                  .Case("define", pp_define)
880
58.6k
                  .Case("undef", pp_undef)
881
58.6k
                  .Case("import", pp_import)
882
58.6k
                  .Case("include_next", pp_include_next)
883
58.6k
                  .Case("if", pp_if)
884
58.6k
                  .Case("ifdef", pp_ifdef)
885
58.6k
                  .Case("ifndef", pp_ifndef)
886
58.6k
                  .Case("elif", pp_elif)
887
58.6k
                  .Case("elifdef", pp_elifdef)
888
58.6k
                  .Case("elifndef", pp_elifndef)
889
58.6k
                  .Case("else", pp_else)
890
58.6k
                  .Case("endif", pp_endif)
891
58.6k
                  .Default(pp_none);
892
  // Not a directive from the table above: hand it to skipDirective and record
  // nothing.
58.6k
  if (Kind == pp_none) {
893
143
    skipDirective(Id, First, End);
894
143
    return false;
895
143
  }
896
897
  // lexEndif may drop an empty conditional instead of recording the #endif.
58.5k
  if (Kind == pp_endif)
898
9.09k
    return lexEndif(First, End);
899
900
  // Include-like directives get their filename lexed first; the rest of the
  // line is then handled by lexDefault like any other directive.
49.4k
  switch (Kind) {
901
3.50k
  case pp_include:
902
3.50k
  case pp___include_macros:
903
3.50k
  case pp_include_next:
904
3.53k
  case pp_import:
905
3.53k
    lexIncludeFilename(First, End);
906
3.53k
    break;
907
45.8k
  default:
908
45.8k
    break;
909
49.4k
  }
910
911
  // Everything else.
912
49.3k
  return lexDefault(Kind, First, End);
913
49.4k
}
914
915
1.29k
/// Advances \p First by three bytes if at least three bytes remain before
/// \p End and they are the UTF-8 byte order mark (0xEF 0xBB 0xBF). Otherwise
/// \p First is left unchanged.
static void skipUTF8ByteOrderMark(const char *&First, const char *const End) {
916
1.29k
  if ((End - First) >= 3 && 
First[0] == '\xef'1.17k
&&
First[1] == '\xbb'1
&&
917
1.29k
      
First[2] == '\xbf'1
)
918
1
    First += 3;
919
1.29k
}
920
921
1.29k
/// Drives the line-by-line scan over the range [\p First, \p End).
///
/// A leading UTF-8 byte order mark is skipped, then lexPPLine is called until
/// the position reaches \p End.
///
/// \returns true as soon as a lexPPLine call returns true (scan() treats this
/// as an error), false once the whole range has been consumed.
bool Scanner::scanImpl(const char *First, const char *const End) {
922
1.29k
  skipUTF8ByteOrderMark(First, End);
923
149k
  while (First != End)
924
147k
    if (lexPPLine(First, End))
925
3
      return true;
926
1.28k
  return false;
927
1.29k
}
928
929
1.29k
/// Scans all of Input and appends the resulting directives to \p Directives.
///
/// On success a pp_eof directive is pushed. It is preceded by
/// tokens_present_before_eof when LastTokenPtr is set and either no token was
/// stored or LastTokenPtr lies beyond the offset of the last stored token.
/// Afterwards, on the error path as well, the flat Tokens array is cut into
/// consecutive slices whose lengths come from DirsWithToks, one slice per
/// directive.
///
/// NOTE(review): the slices are ArrayRefs taken from Tokens, so presumably the
/// token storage has to outlive \p Directives; confirm against the Directive
/// definition.
///
/// \returns true if scanImpl reported an error, false otherwise.
bool Scanner::scan(SmallVectorImpl<Directive> &Directives) {
930
1.29k
  bool Error = scanImpl(Input.begin(), Input.end());
931
932
1.29k
  if (!Error) {
933
    // Add an EOF on success.
934
1.28k
    if (LastTokenPtr &&
935
1.28k
        
(1.12k
Tokens.empty()1.12k
||
LastTokenPtr > Input.begin() + Tokens.back().Offset1.09k
))
936
75
      pushDirective(tokens_present_before_eof);
937
1.28k
    pushDirective(pp_eof);
938
1.28k
  }
939
940
  // Hand out the tokens front to back; each directive takes the next
  // NumTokens entries.
1.29k
  ArrayRef<dependency_directives_scan::Token> RemainingTokens = Tokens;
941
53.4k
  for (const DirectiveWithTokens &DirWithToks : DirsWithToks) {
942
53.4k
    assert(RemainingTokens.size() >= DirWithToks.NumTokens);
943
53.4k
    Directives.emplace_back(DirWithToks.Kind,
944
53.4k
                            RemainingTokens.take_front(DirWithToks.NumTokens));
945
53.4k
    RemainingTokens = RemainingTokens.drop_front(DirWithToks.NumTokens);
946
53.4k
  }
947
  // Every stored token must belong to exactly one directive.
1.29k
  assert(RemainingTokens.empty());
948
949
1.29k
  return Error;
950
1.29k
}
951
952
/// Public entry point of the scanner: constructs a Scanner from \p Input,
/// \p Tokens, \p Diags and \p InputSourceLoc and runs Scanner::scan on it to
/// fill \p Directives.
///
/// \returns the result of Scanner::scan, i.e. true if the scan reported an
/// error and false on success.
bool clang::scanSourceForDependencyDirectives(
953
    StringRef Input, SmallVectorImpl<dependency_directives_scan::Token> &Tokens,
954
    SmallVectorImpl<Directive> &Directives, DiagnosticsEngine *Diags,
955
1.29k
    SourceLocation InputSourceLoc) {
956
1.29k
  return Scanner(Input, Tokens, Diags, InputSourceLoc).scan(Directives);
957
1.29k
}
958
959
/// Writes the tokens of \p Directives to \p OS as text.
///
/// The characters of each token are taken from \p Source using the token's
/// Offset and getEnd(). Within a directive, a single space is written between
/// two tokens when needsSpaceSeparator asks for one. A directive of kind
/// tokens_present_before_eof is preceded by the marker "<TokBeforeEOF>".
///
/// \param Source     Buffer the token offsets refer to.
/// \param Directives Directives to print, in order.
/// \param OS         Stream that receives the output.
void clang::printDependencyDirectivesAsSource(
960
    StringRef Source,
961
    ArrayRef<dependency_directives_scan::Directive> Directives,
962
132
    llvm::raw_ostream &OS) {
963
  // Add a space separator where it is convenient for testing purposes.
964
132
  auto needsSpaceSeparator =
965
132
      [](tok::TokenKind Prev,
966
867
         const dependency_directives_scan::Token &Tok) -> bool {
967
    // Two tokens of the same kind are separated, unless they are parentheses
    // or square brackets.
867
    if (Prev == Tok.Kind)
968
187
      return !Tok.isOneOf(tok::l_paren, tok::r_paren, tok::l_square,
969
187
                          tok::r_square);
970
    // The remaining rules each list the token kinds that get a space after an
    // identifier, a closing parenthesis and a comma respectively.
680
    if (Prev == tok::raw_identifier &&
971
680
        Tok.isOneOf(tok::hash, tok::numeric_constant, tok::string_literal,
972
273
                    tok::char_constant, tok::header_name))
973
47
      return true;
974
633
    if (Prev == tok::r_paren &&
975
633
        Tok.isOneOf(tok::raw_identifier, tok::hash, tok::string_literal,
976
39
                    tok::char_constant, tok::unknown))
977
12
      return true;
978
621
    if (Prev == tok::comma &&
979
621
        
Tok.isOneOf(tok::l_paren, tok::string_literal, tok::less)16
)
980
8
      return true;
981
613
    return false;
982
621
  };
983
984
376
  for (const dependency_directives_scan::Directive &Directive : Directives) {
985
376
    if (Directive.Kind == tokens_present_before_eof)
986
26
      OS << "<TokBeforeEOF>";
987
    // Reset per directive so no separator is considered before its first token.
376
    std::optional<tok::TokenKind> PrevTokenKind;
988
1.08k
    for (const dependency_directives_scan::Token &Tok : Directive.Tokens) {
989
1.08k
      if (PrevTokenKind && 
needsSpaceSeparator(*PrevTokenKind, Tok)867
)
990
240
        OS << ' ';
991
1.08k
      PrevTokenKind = Tok.Kind;
992
1.08k
      OS << Source.slice(Tok.Offset, Tok.getEnd());
993
1.08k
    }
994
376
  }
995
132
}