Coverage Report

Created: 2021-08-24 07:12

/Users/buildslave/jenkins/workspace/coverage/llvm-project/clang/lib/Lex/Lexer.cpp
Line
Count
Source (jump to first uncovered line)
1
//===- Lexer.cpp - C Language Family Lexer --------------------------------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
//  This file implements the Lexer and Token interfaces.
10
//
11
//===----------------------------------------------------------------------===//
12
13
#include "clang/Lex/Lexer.h"
14
#include "UnicodeCharSets.h"
15
#include "clang/Basic/CharInfo.h"
16
#include "clang/Basic/Diagnostic.h"
17
#include "clang/Basic/IdentifierTable.h"
18
#include "clang/Basic/LLVM.h"
19
#include "clang/Basic/LangOptions.h"
20
#include "clang/Basic/SourceLocation.h"
21
#include "clang/Basic/SourceManager.h"
22
#include "clang/Basic/TokenKinds.h"
23
#include "clang/Lex/LexDiagnostic.h"
24
#include "clang/Lex/LiteralSupport.h"
25
#include "clang/Lex/MultipleIncludeOpt.h"
26
#include "clang/Lex/Preprocessor.h"
27
#include "clang/Lex/PreprocessorOptions.h"
28
#include "clang/Lex/Token.h"
29
#include "llvm/ADT/None.h"
30
#include "llvm/ADT/Optional.h"
31
#include "llvm/ADT/STLExtras.h"
32
#include "llvm/ADT/StringExtras.h"
33
#include "llvm/ADT/StringRef.h"
34
#include "llvm/ADT/StringSwitch.h"
35
#include "llvm/Support/Compiler.h"
36
#include "llvm/Support/ConvertUTF.h"
37
#include "llvm/Support/MathExtras.h"
38
#include "llvm/Support/MemoryBufferRef.h"
39
#include "llvm/Support/NativeFormatting.h"
40
#include "llvm/Support/UnicodeCharRanges.h"
41
#include <algorithm>
42
#include <cassert>
43
#include <cstddef>
44
#include <cstdint>
45
#include <cstring>
46
#include <string>
47
#include <tuple>
48
#include <utility>
49
50
using namespace clang;
51
52
//===----------------------------------------------------------------------===//
53
// Token Class Implementation
54
//===----------------------------------------------------------------------===//
55
56
/// isObjCAtKeyword - Return true if we have an ObjC keyword identifier.
57
442k
bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const {
58
442k
  if (isAnnotation())
59
2
    return false;
60
442k
  if (IdentifierInfo *II = getIdentifierInfo())
61
431k
    return II->getObjCKeywordID() == objcKey;
62
11.4k
  return false;
63
442k
}
64
65
/// getObjCKeywordID - Return the ObjC keyword kind.
66
960k
tok::ObjCKeywordKind Token::getObjCKeywordID() const {
67
960k
  if (isAnnotation())
68
1
    return tok::objc_not_keyword;
69
960k
  IdentifierInfo *specId = getIdentifierInfo();
70
960k
  return specId ? 
specId->getObjCKeywordID()836k
:
tok::objc_not_keyword123k
;
71
960k
}
72
73
//===----------------------------------------------------------------------===//
74
// Lexer Class Implementation
75
//===----------------------------------------------------------------------===//
76
77
0
void Lexer::anchor() {}
78
79
void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
80
55.8M
                      const char *BufEnd) {
81
55.8M
  BufferStart = BufStart;
82
55.8M
  BufferPtr = BufPtr;
83
55.8M
  BufferEnd = BufEnd;
84
85
55.8M
  assert(BufEnd[0] == 0 &&
86
55.8M
         "We assume that the input buffer has a null character at the end"
87
55.8M
         " to simplify lexing!");
88
89
  // Check whether we have a BOM in the beginning of the buffer. If yes - act
90
  // accordingly. Right now we support only UTF-8 with and without BOM, so, just
91
  // skip the UTF-8 BOM if it's present.
92
55.8M
  if (BufferStart == BufferPtr) {
93
    // Determine the size of the BOM.
94
1.61M
    StringRef Buf(BufferStart, BufferEnd - BufferStart);
95
1.61M
    size_t BOMLength = llvm::StringSwitch<size_t>(Buf)
96
1.61M
      .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM
97
1.61M
      .Default(0);
98
99
    // Skip the BOM.
100
1.61M
    BufferPtr += BOMLength;
101
1.61M
  }
102
103
55.8M
  Is_PragmaLexer = false;
104
55.8M
  CurrentConflictMarkerState = CMK_None;
105
106
  // Start of the file is a start of line.
107
55.8M
  IsAtStartOfLine = true;
108
55.8M
  IsAtPhysicalStartOfLine = true;
109
110
55.8M
  HasLeadingSpace = false;
111
55.8M
  HasLeadingEmptyMacro = false;
112
113
  // We are not after parsing a #.
114
55.8M
  ParsingPreprocessorDirective = false;
115
116
  // We are not after parsing #include.
117
55.8M
  ParsingFilename = false;
118
119
  // We are not in raw mode.  Raw mode disables diagnostics and interpretation
120
  // of tokens (e.g. identifiers, thus disabling macro expansion).  It is used
121
  // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block
122
  // or otherwise skipping over tokens.
123
55.8M
  LexingRawMode = false;
124
125
  // Default to not keeping comments.
126
55.8M
  ExtendedTokenMode = 0;
127
128
55.8M
  NewLinePtr = nullptr;
129
55.8M
}
130
131
/// Lexer constructor - Create a new lexer object for the specified buffer
132
/// with the specified preprocessor managing the lexing process.  This lexer
133
/// assumes that the associated file buffer and Preprocessor objects will
134
/// outlive it, so it doesn't take ownership of either of them.
135
Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile,
136
             Preprocessor &PP)
137
    : PreprocessorLexer(&PP, FID),
138
      FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
139
1.42M
      LangOpts(PP.getLangOpts()) {
140
1.42M
  InitLexer(InputFile.getBufferStart(), InputFile.getBufferStart(),
141
1.42M
            InputFile.getBufferEnd());
142
143
1.42M
  resetExtendedTokenMode();
144
1.42M
}
145
146
/// Lexer constructor - Create a new raw lexer object.  This object is only
147
/// suitable for calls to 'LexFromRawLexer'.  This lexer assumes that the text
148
/// range will outlive it, so it doesn't take ownership of it.
149
Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts,
150
             const char *BufStart, const char *BufPtr, const char *BufEnd)
151
54.3M
    : FileLoc(fileloc), LangOpts(langOpts) {
152
54.3M
  InitLexer(BufStart, BufPtr, BufEnd);
153
154
  // We *are* in raw mode.
155
54.3M
  LexingRawMode = true;
156
54.3M
}
157
158
/// Lexer constructor - Create a new raw lexer object.  This object is only
159
/// suitable for calls to 'LexFromRawLexer'.  This lexer assumes that the text
160
/// range will outlive it, so it doesn't take ownership of it.
161
Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &FromFile,
162
             const SourceManager &SM, const LangOptions &langOpts)
163
    : Lexer(SM.getLocForStartOfFile(FID), langOpts, FromFile.getBufferStart(),
164
74.4k
            FromFile.getBufferStart(), FromFile.getBufferEnd()) {}
Unexecuted instantiation: clang::Lexer::Lexer(clang::FileID, llvm::MemoryBufferRef const&, clang::SourceManager const&, clang::LangOptions const&)
clang::Lexer::Lexer(clang::FileID, llvm::MemoryBufferRef const&, clang::SourceManager const&, clang::LangOptions const&)
Line
Count
Source
164
74.4k
            FromFile.getBufferStart(), FromFile.getBufferEnd()) {}
165
166
98.3M
void Lexer::resetExtendedTokenMode() {
167
98.3M
  assert(PP && "Cannot reset token mode without a preprocessor");
168
98.3M
  if (LangOpts.TraditionalCPP)
169
1.05k
    SetKeepWhitespaceMode(true);
170
98.3M
  else
171
98.3M
    SetCommentRetentionState(PP->getCommentRetentionState());
172
98.3M
}
173
174
/// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
175
/// _Pragma expansion.  This has a variety of magic semantics that this method
176
/// sets up.  It returns a new'd Lexer that must be delete'd when done.
177
///
178
/// On entrance to this routine, TokStartLoc is a macro location which has a
179
/// spelling loc that indicates the bytes to be lexed for the token and an
180
/// expansion location that indicates where all lexed tokens should be
181
/// "expanded from".
182
///
183
/// TODO: It would really be nice to make _Pragma just be a wrapper around a
184
/// normal lexer that remaps tokens as they fly by.  This would require making
185
/// Preprocessor::Lex virtual.  Given that, we could just dump in a magic lexer
186
/// interface that could handle this stuff.  This would pull GetMappedTokenLoc
187
/// out of the critical path of the lexer!
188
///
189
Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc,
190
                                 SourceLocation ExpansionLocStart,
191
                                 SourceLocation ExpansionLocEnd,
192
600k
                                 unsigned TokLen, Preprocessor &PP) {
193
600k
  SourceManager &SM = PP.getSourceManager();
194
195
  // Create the lexer as if we were going to lex the file normally.
196
600k
  FileID SpellingFID = SM.getFileID(SpellingLoc);
197
600k
  llvm::MemoryBufferRef InputFile = SM.getBufferOrFake(SpellingFID);
198
600k
  Lexer *L = new Lexer(SpellingFID, InputFile, PP);
199
200
  // Now that the lexer is created, change the start/end locations so that we
201
  // just lex the subsection of the file that we want.  This is lexing from a
202
  // scratch buffer.
203
600k
  const char *StrData = SM.getCharacterData(SpellingLoc);
204
205
600k
  L->BufferPtr = StrData;
206
600k
  L->BufferEnd = StrData+TokLen;
207
600k
  assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!");
208
209
  // Set the SourceLocation with the remapping information.  This ensures that
210
  // GetMappedTokenLoc will remap the tokens as they are lexed.
211
0
  L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID),
212
600k
                                     ExpansionLocStart,
213
600k
                                     ExpansionLocEnd, TokLen);
214
215
  // Ensure that the lexer thinks it is inside a directive, so that end \n will
216
  // return an EOD token.
217
600k
  L->ParsingPreprocessorDirective = true;
218
219
  // This lexer really is for _Pragma.
220
600k
  L->Is_PragmaLexer = true;
221
600k
  return L;
222
600k
}
223
224
30
bool Lexer::skipOver(unsigned NumBytes) {
225
30
  IsAtPhysicalStartOfLine = true;
226
30
  IsAtStartOfLine = true;
227
30
  if ((BufferPtr + NumBytes) > BufferEnd)
228
0
    return true;
229
30
  BufferPtr += NumBytes;
230
30
  return false;
231
30
}
232
233
424
template <typename T> static void StringifyImpl(T &Str, char Quote) {
234
424
  typename T::size_type i = 0, e = Str.size();
235
34.5k
  while (i < e) {
236
34.1k
    if (Str[i] == '\\' || 
Str[i] == Quote34.1k
) {
237
206
      Str.insert(Str.begin() + i, '\\');
238
206
      i += 2;
239
206
      ++e;
240
33.9k
    } else if (Str[i] == '\n' || 
Str[i] == '\r'33.9k
) {
241
      // Replace '\r\n' and '\n\r' to '\\' followed by 'n'.
242
17
      if ((i < e - 1) && (Str[i + 1] == '\n' || 
Str[i + 1] == '\r'13
) &&
243
17
          
Str[i] != Str[i + 1]4
) {
244
0
        Str[i] = '\\';
245
0
        Str[i + 1] = 'n';
246
17
      } else {
247
        // Replace '\n' and '\r' to '\\' followed by 'n'.
248
17
        Str[i] = '\\';
249
17
        Str.insert(Str.begin() + i + 1, 'n');
250
17
        ++e;
251
17
      }
252
17
      i += 2;
253
17
    } else
254
33.9k
      ++i;
255
34.1k
  }
256
424
}
Lexer.cpp:void StringifyImpl<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >&, char)
Line
Count
Source
233
101
template <typename T> static void StringifyImpl(T &Str, char Quote) {
234
101
  typename T::size_type i = 0, e = Str.size();
235
2.30k
  while (i < e) {
236
2.20k
    if (Str[i] == '\\' || 
Str[i] == Quote2.19k
) {
237
195
      Str.insert(Str.begin() + i, '\\');
238
195
      i += 2;
239
195
      ++e;
240
2.00k
    } else if (Str[i] == '\n' || 
Str[i] == '\r'2.00k
) {
241
      // Replace '\r\n' and '\n\r' to '\\' followed by 'n'.
242
9
      if ((i < e - 1) && (Str[i + 1] == '\n' || 
Str[i + 1] == '\r'7
) &&
243
9
          
Str[i] != Str[i + 1]2
) {
244
0
        Str[i] = '\\';
245
0
        Str[i + 1] = 'n';
246
9
      } else {
247
        // Replace '\n' and '\r' to '\\' followed by 'n'.
248
9
        Str[i] = '\\';
249
9
        Str.insert(Str.begin() + i + 1, 'n');
250
9
        ++e;
251
9
      }
252
9
      i += 2;
253
9
    } else
254
2.00k
      ++i;
255
2.20k
  }
256
101
}
Lexer.cpp:void StringifyImpl<llvm::SmallVectorImpl<char> >(llvm::SmallVectorImpl<char>&, char)
Line
Count
Source
233
323
template <typename T> static void StringifyImpl(T &Str, char Quote) {
234
323
  typename T::size_type i = 0, e = Str.size();
235
32.2k
  while (i < e) {
236
31.9k
    if (Str[i] == '\\' || 
Str[i] == Quote31.9k
) {
237
11
      Str.insert(Str.begin() + i, '\\');
238
11
      i += 2;
239
11
      ++e;
240
31.9k
    } else if (Str[i] == '\n' || 
Str[i] == '\r'31.9k
) {
241
      // Replace '\r\n' and '\n\r' to '\\' followed by 'n'.
242
8
      if ((i < e - 1) && (Str[i + 1] == '\n' || 
Str[i + 1] == '\r'6
) &&
243
8
          
Str[i] != Str[i + 1]2
) {
244
0
        Str[i] = '\\';
245
0
        Str[i + 1] = 'n';
246
8
      } else {
247
        // Replace '\n' and '\r' to '\\' followed by 'n'.
248
8
        Str[i] = '\\';
249
8
        Str.insert(Str.begin() + i + 1, 'n');
250
8
        ++e;
251
8
      }
252
8
      i += 2;
253
8
    } else
254
31.9k
      ++i;
255
31.9k
  }
256
323
}
257
258
101
std::string Lexer::Stringify(StringRef Str, bool Charify) {
259
101
  std::string Result = std::string(Str);
260
101
  char Quote = Charify ? 
'\''0
: '"';
261
101
  StringifyImpl(Result, Quote);
262
101
  return Result;
263
101
}
264
265
323
void Lexer::Stringify(SmallVectorImpl<char> &Str) { StringifyImpl(Str, '"'); }
266
267
//===----------------------------------------------------------------------===//
268
// Token Spelling
269
//===----------------------------------------------------------------------===//
270
271
/// Slow case of getSpelling. Extract the characters comprising the
272
/// spelling of this token from the provided input buffer.
273
static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
274
4.21M
                              const LangOptions &LangOpts, char *Spelling) {
275
4.21M
  assert(Tok.needsCleaning() && "getSpellingSlow called on simple token");
276
277
0
  size_t Length = 0;
278
4.21M
  const char *BufEnd = BufPtr + Tok.getLength();
279
280
4.21M
  if (tok::isStringLiteral(Tok.getKind())) {
281
    // Munch the encoding-prefix and opening double-quote.
282
484
    while (BufPtr < BufEnd) {
283
484
      unsigned Size;
284
484
      Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
285
484
      BufPtr += Size;
286
287
484
      if (Spelling[Length - 1] == '"')
288
458
        break;
289
484
    }
290
291
    // Raw string literals need special handling; trigraph expansion and line
292
    // splicing do not occur within their d-char-sequence nor within their
293
    // r-char-sequence.
294
458
    if (Length >= 2 &&
295
458
        
Spelling[Length - 2] == 'R'12
&&
Spelling[Length - 1] == '"'12
) {
296
      // Search backwards from the end of the token to find the matching closing
297
      // quote.
298
12
      const char *RawEnd = BufEnd;
299
30
      do --RawEnd; while (*RawEnd != '"');
300
12
      size_t RawLength = RawEnd - BufPtr + 1;
301
302
      // Everything between the quotes is included verbatim in the spelling.
303
12
      memcpy(Spelling + Length, BufPtr, RawLength);
304
12
      Length += RawLength;
305
12
      BufPtr += RawLength;
306
307
      // The rest of the token is lexed normally.
308
12
    }
309
458
  }
310
311
134M
  while (BufPtr < BufEnd) {
312
130M
    unsigned Size;
313
130M
    Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
314
130M
    BufPtr += Size;
315
130M
  }
316
317
4.21M
  assert(Length < Tok.getLength() &&
318
4.21M
         "NeedsCleaning flag set on token that didn't need cleaning!");
319
0
  return Length;
320
4.21M
}
321
322
/// getSpelling() - Return the 'spelling' of this token.  The spelling of a
323
/// token are the characters used to represent the token in the source file
324
/// after trigraph expansion and escaped-newline folding.  In particular, this
325
/// wants to get the true, uncanonicalized, spelling of things like digraphs
326
/// UCNs, etc.
327
StringRef Lexer::getSpelling(SourceLocation loc,
328
                             SmallVectorImpl<char> &buffer,
329
                             const SourceManager &SM,
330
                             const LangOptions &options,
331
1.47k
                             bool *invalid) {
332
  // Break down the source location.
333
1.47k
  std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc);
334
335
  // Try to the load the file buffer.
336
1.47k
  bool invalidTemp = false;
337
1.47k
  StringRef file = SM.getBufferData(locInfo.first, &invalidTemp);
338
1.47k
  if (invalidTemp) {
339
0
    if (invalid) *invalid = true;
340
0
    return {};
341
0
  }
342
343
1.47k
  const char *tokenBegin = file.data() + locInfo.second;
344
345
  // Lex from the start of the given location.
346
1.47k
  Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options,
347
1.47k
              file.begin(), tokenBegin, file.end());
348
1.47k
  Token token;
349
1.47k
  lexer.LexFromRawLexer(token);
350
351
1.47k
  unsigned length = token.getLength();
352
353
  // Common case:  no need for cleaning.
354
1.47k
  if (!token.needsCleaning())
355
1.46k
    return StringRef(tokenBegin, length);
356
357
  // Hard case, we need to relex the characters into the string.
358
1
  buffer.resize(length);
359
1
  buffer.resize(getSpellingSlow(token, tokenBegin, options, buffer.data()));
360
1
  return StringRef(buffer.data(), buffer.size());
361
1.47k
}
362
363
/// getSpelling() - Return the 'spelling' of this token.  The spelling of a
364
/// token are the characters used to represent the token in the source file
365
/// after trigraph expansion and escaped-newline folding.  In particular, this
366
/// wants to get the true, uncanonicalized, spelling of things like digraphs
367
/// UCNs, etc.
368
std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr,
369
2.92M
                               const LangOptions &LangOpts, bool *Invalid) {
370
2.92M
  assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
371
372
0
  bool CharDataInvalid = false;
373
2.92M
  const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(),
374
2.92M
                                                    &CharDataInvalid);
375
2.92M
  if (Invalid)
376
97
    *Invalid = CharDataInvalid;
377
2.92M
  if (CharDataInvalid)
378
0
    return {};
379
380
  // If this token contains nothing interesting, return it directly.
381
2.92M
  if (!Tok.needsCleaning())
382
2.92M
    return std::string(TokStart, TokStart + Tok.getLength());
383
384
5
  std::string Result;
385
5
  Result.resize(Tok.getLength());
386
5
  Result.resize(getSpellingSlow(Tok, TokStart, LangOpts, &*Result.begin()));
387
5
  return Result;
388
2.92M
}
389
390
/// getSpelling - This method is used to get the spelling of a token into a
391
/// preallocated buffer, instead of as an std::string.  The caller is required
392
/// to allocate enough space for the token, which is guaranteed to be at least
393
/// Tok.getLength() bytes long.  The actual length of the token is returned.
394
///
395
/// Note that this method may do two possible things: it may either fill in
396
/// the buffer specified with characters, or it may *change the input pointer*
397
/// to point to a constant buffer with the data already in it (avoiding a
398
/// copy).  The caller is not allowed to modify the returned buffer pointer
399
/// if an internal buffer is returned.
400
unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
401
                            const SourceManager &SourceMgr,
402
39.7M
                            const LangOptions &LangOpts, bool *Invalid) {
403
39.7M
  assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
404
405
0
  const char *TokStart = nullptr;
406
  // NOTE: this has to be checked *before* testing for an IdentifierInfo.
407
39.7M
  if (Tok.is(tok::raw_identifier))
408
5.19M
    TokStart = Tok.getRawIdentifier().data();
409
34.5M
  else if (!Tok.hasUCN()) {
410
34.5M
    if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
411
      // Just return the string from the identifier table, which is very quick.
412
12.7M
      Buffer = II->getNameStart();
413
12.7M
      return II->getLength();
414
12.7M
    }
415
34.5M
  }
416
417
  // NOTE: this can be checked even after testing for an IdentifierInfo.
418
27.0M
  if (Tok.isLiteral())
419
21.5M
    TokStart = Tok.getLiteralData();
420
421
27.0M
  if (!TokStart) {
422
    // Compute the start of the token in the input lexer buffer.
423
420k
    bool CharDataInvalid = false;
424
420k
    TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid);
425
420k
    if (Invalid)
426
190k
      *Invalid = CharDataInvalid;
427
420k
    if (CharDataInvalid) {
428
0
      Buffer = "";
429
0
      return 0;
430
0
    }
431
420k
  }
432
433
  // If this token contains nothing interesting, return it directly.
434
27.0M
  if (!Tok.needsCleaning()) {
435
22.7M
    Buffer = TokStart;
436
22.7M
    return Tok.getLength();
437
22.7M
  }
438
439
  // Otherwise, hard case, relex the characters into the string.
440
4.21M
  return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer));
441
27.0M
}
442
443
/// MeasureTokenLength - Relex the token at the specified location and return
444
/// its length in bytes in the input file.  If the token needs cleaning (e.g.
445
/// includes a trigraph or an escaped newline) then this count includes bytes
446
/// that are part of that.
447
unsigned Lexer::MeasureTokenLength(SourceLocation Loc,
448
                                   const SourceManager &SM,
449
53.6M
                                   const LangOptions &LangOpts) {
450
53.6M
  Token TheTok;
451
53.6M
  if (getRawToken(Loc, TheTok, SM, LangOpts))
452
1.51k
    return 0;
453
53.5M
  return TheTok.getLength();
454
53.6M
}
455
456
/// Relex the token at the specified location.
457
/// \returns true if there was a failure, false on success.
458
bool Lexer::getRawToken(SourceLocation Loc, Token &Result,
459
                        const SourceManager &SM,
460
                        const LangOptions &LangOpts,
461
53.6M
                        bool IgnoreWhiteSpace) {
462
  // TODO: this could be special cased for common tokens like identifiers, ')',
463
  // etc to make this faster, if it mattered.  Just look at StrData[0] to handle
464
  // all obviously single-char tokens.  This could use
465
  // Lexer::isObviouslySimpleCharacter for example to handle identifiers or
466
  // something.
467
468
  // If this comes from a macro expansion, we really do want the macro name, not
469
  // the token this macro expanded to.
470
53.6M
  Loc = SM.getExpansionLoc(Loc);
471
53.6M
  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
472
53.6M
  bool Invalid = false;
473
53.6M
  StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
474
53.6M
  if (Invalid)
475
16
    return true;
476
477
53.6M
  const char *StrData = Buffer.data()+LocInfo.second;
478
479
53.6M
  if (!IgnoreWhiteSpace && 
isWhitespace(StrData[0])53.6M
)
480
1.49k
    return true;
481
482
  // Create a lexer starting at the beginning of this token.
483
53.5M
  Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts,
484
53.5M
                 Buffer.begin(), StrData, Buffer.end());
485
53.5M
  TheLexer.SetCommentRetentionState(true);
486
53.5M
  TheLexer.LexFromRawLexer(Result);
487
53.5M
  return false;
488
53.6M
}
489
490
/// Returns the pointer that points to the beginning of line that contains
491
/// the given offset, or null if the offset if invalid.
492
10.7k
static const char *findBeginningOfLine(StringRef Buffer, unsigned Offset) {
493
10.7k
  const char *BufStart = Buffer.data();
494
10.7k
  if (Offset >= Buffer.size())
495
6
    return nullptr;
496
497
10.7k
  const char *LexStart = BufStart + Offset;
498
284k
  for (; LexStart != BufStart; 
--LexStart273k
) {
499
283k
    if (isVerticalWhitespace(LexStart[0]) &&
500
283k
        
!Lexer::isNewLineEscaped(BufStart, LexStart)10.3k
) {
501
      // LexStart should point at first character of logical line.
502
10.3k
      ++LexStart;
503
10.3k
      break;
504
10.3k
    }
505
283k
  }
506
10.7k
  return LexStart;
507
10.7k
}
508
509
static SourceLocation getBeginningOfFileToken(SourceLocation Loc,
510
                                              const SourceManager &SM,
511
10.5k
                                              const LangOptions &LangOpts) {
512
10.5k
  assert(Loc.isFileID());
513
0
  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
514
10.5k
  if (LocInfo.first.isInvalid())
515
0
    return Loc;
516
517
10.5k
  bool Invalid = false;
518
10.5k
  StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
519
10.5k
  if (Invalid)
520
0
    return Loc;
521
522
  // Back up from the current location until we hit the beginning of a line
523
  // (or the buffer). We'll relex from that point.
524
10.5k
  const char *StrData = Buffer.data() + LocInfo.second;
525
10.5k
  const char *LexStart = findBeginningOfLine(Buffer, LocInfo.second);
526
10.5k
  if (!LexStart || 
LexStart == StrData10.5k
)
527
255
    return Loc;
528
529
  // Create a lexer starting at the beginning of this token.
530
10.2k
  SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second);
531
10.2k
  Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart,
532
10.2k
                 Buffer.end());
533
10.2k
  TheLexer.SetCommentRetentionState(true);
534
535
  // Lex tokens until we find the token that contains the source location.
536
10.2k
  Token TheTok;
537
16.6k
  do {
538
16.6k
    TheLexer.LexFromRawLexer(TheTok);
539
540
16.6k
    if (TheLexer.getBufferLocation() > StrData) {
541
      // Lexing this token has taken the lexer past the source location we're
542
      // looking for. If the current token encompasses our source location,
543
      // return the beginning of that token.
544
10.2k
      if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData)
545
9.79k
        return TheTok.getLocation();
546
547
      // We ended up skipping over the source location entirely, which means
548
      // that it points into whitespace. We're done here.
549
492
      break;
550
10.2k
    }
551
16.6k
  } while (
TheTok.getKind() != tok::eof6.40k
);
552
553
  // We've passed our source location; just return the original source location.
554
492
  return Loc;
555
10.2k
}
556
557
SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc,
558
                                          const SourceManager &SM,
559
10.5k
                                          const LangOptions &LangOpts) {
560
10.5k
  if (Loc.isFileID())
561
10.5k
    return getBeginningOfFileToken(Loc, SM, LangOpts);
562
563
20
  if (!SM.isMacroArgExpansion(Loc))
564
0
    return Loc;
565
566
20
  SourceLocation FileLoc = SM.getSpellingLoc(Loc);
567
20
  SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts);
568
20
  std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc);
569
20
  std::pair<FileID, unsigned> BeginFileLocInfo =
570
20
      SM.getDecomposedLoc(BeginFileLoc);
571
20
  assert(FileLocInfo.first == BeginFileLocInfo.first &&
572
20
         FileLocInfo.second >= BeginFileLocInfo.second);
573
0
  return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);
574
20
}
575
576
namespace {
577
578
enum PreambleDirectiveKind {
579
  PDK_Skipped,
580
  PDK_Unknown
581
};
582
583
} // namespace
584
585
PreambleBounds Lexer::ComputePreamble(StringRef Buffer,
586
                                      const LangOptions &LangOpts,
587
516
                                      unsigned MaxLines) {
588
  // Create a lexer starting at the beginning of the file. Note that we use a
589
  // "fake" file source location at offset 1 so that the lexer will track our
590
  // position within the file.
591
516
  const SourceLocation::UIntTy StartOffset = 1;
592
516
  SourceLocation FileLoc = SourceLocation::getFromRawEncoding(StartOffset);
593
516
  Lexer TheLexer(FileLoc, LangOpts, Buffer.begin(), Buffer.begin(),
594
516
                 Buffer.end());
595
516
  TheLexer.SetCommentRetentionState(true);
596
597
516
  bool InPreprocessorDirective = false;
598
516
  Token TheTok;
599
516
  SourceLocation ActiveCommentLoc;
600
601
516
  unsigned MaxLineOffset = 0;
602
516
  if (MaxLines) {
603
84
    const char *CurPtr = Buffer.begin();
604
84
    unsigned CurLine = 0;
605
12.3k
    while (CurPtr != Buffer.end()) {
606
12.3k
      char ch = *CurPtr++;
607
12.3k
      if (ch == '\n') {
608
642
        ++CurLine;
609
642
        if (CurLine == MaxLines)
610
83
          break;
611
642
      }
612
12.3k
    }
613
84
    if (CurPtr != Buffer.end())
614
78
      MaxLineOffset = CurPtr - Buffer.begin();
615
84
  }
616
617
3.95k
  do {
618
3.95k
    TheLexer.LexFromRawLexer(TheTok);
619
620
3.95k
    if (InPreprocessorDirective) {
621
      // If we've hit the end of the file, we're done.
622
2.83k
      if (TheTok.getKind() == tok::eof) {
623
14
        break;
624
14
      }
625
626
      // If we haven't hit the end of the preprocessor directive, skip this
627
      // token.
628
2.82k
      if (!TheTok.isAtStartOfLine())
629
1.90k
        continue;
630
631
      // We've passed the end of the preprocessor directive, and will look
632
      // at this token again below.
633
920
      InPreprocessorDirective = false;
634
920
    }
635
636
    // Keep track of the # of lines in the preamble.
637
2.03k
    if (TheTok.isAtStartOfLine()) {
638
2.01k
      unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset;
639
640
      // If we were asked to limit the number of lines in the preamble,
641
      // and we're about to exceed that limit, we're done.
642
2.01k
      if (MaxLineOffset && 
TokOffset >= MaxLineOffset360
)
643
18
        break;
644
2.01k
    }
645
646
    // Comments are okay; skip over them.
647
2.01k
    if (TheTok.getKind() == tok::comment) {
648
601
      if (ActiveCommentLoc.isInvalid())
649
227
        ActiveCommentLoc = TheTok.getLocation();
650
601
      continue;
651
601
    }
652
653
1.41k
    if (TheTok.isAtStartOfLine() && 
TheTok.getKind() == tok::hash1.39k
) {
654
      // This is the start of a preprocessor directive.
655
934
      Token HashTok = TheTok;
656
934
      InPreprocessorDirective = true;
657
934
      ActiveCommentLoc = SourceLocation();
658
659
      // Figure out which directive this is. Since we're lexing raw tokens,
660
      // we don't have an identifier table available. Instead, just look at
661
      // the raw identifier to recognize and categorize preprocessor directives.
662
934
      TheLexer.LexFromRawLexer(TheTok);
663
934
      if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) {
664
934
        StringRef Keyword = TheTok.getRawIdentifier();
665
934
        PreambleDirectiveKind PDK
666
934
          = llvm::StringSwitch<PreambleDirectiveKind>(Keyword)
667
934
              .Case("include", PDK_Skipped)
668
934
              .Case("__include_macros", PDK_Skipped)
669
934
              .Case("define", PDK_Skipped)
670
934
              .Case("undef", PDK_Skipped)
671
934
              .Case("line", PDK_Skipped)
672
934
              .Case("error", PDK_Skipped)
673
934
              .Case("pragma", PDK_Skipped)
674
934
              .Case("import", PDK_Skipped)
675
934
              .Case("include_next", PDK_Skipped)
676
934
              .Case("warning", PDK_Skipped)
677
934
              .Case("ident", PDK_Skipped)
678
934
              .Case("sccs", PDK_Skipped)
679
934
              .Case("assert", PDK_Skipped)
680
934
              .Case("unassert", PDK_Skipped)
681
934
              .Case("if", PDK_Skipped)
682
934
              .Case("ifdef", PDK_Skipped)
683
934
              .Case("ifndef", PDK_Skipped)
684
934
              .Case("elif", PDK_Skipped)
685
934
              .Case("elifdef", PDK_Skipped)
686
934
              .Case("elifndef", PDK_Skipped)
687
934
              .Case("else", PDK_Skipped)
688
934
              .Case("endif", PDK_Skipped)
689
934
              .Default(PDK_Unknown);
690
691
934
        switch (PDK) {
692
934
        case PDK_Skipped:
693
934
          continue;
694
695
0
        case PDK_Unknown:
696
          // We don't know what this directive is; stop at the '#'.
697
0
          break;
698
934
        }
699
934
      }
700
701
      // We only end up here if we didn't recognize the preprocessor
702
      // directive or it was one that can't occur in the preamble at this
703
      // point. Roll back the current token to the location of the '#'.
704
0
      TheTok = HashTok;
705
0
    }
706
707
    // We hit a token that we don't recognize as being in the
708
    // "preprocessing only" part of the file, so we're no longer in
709
    // the preamble.
710
484
    break;
711
3.43k
  } while (true);
712
713
516
  SourceLocation End;
714
516
  if (ActiveCommentLoc.isValid())
715
80
    End = ActiveCommentLoc; // don't truncate a decl comment.
716
436
  else
717
436
    End = TheTok.getLocation();
718
719
516
  return PreambleBounds(End.getRawEncoding() - FileLoc.getRawEncoding(),
720
516
                        TheTok.isAtStartOfLine());
721
516
}
722
723
unsigned Lexer::getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo,
724
                                     const SourceManager &SM,
725
75.8k
                                     const LangOptions &LangOpts) {
726
  // Figure out how many physical characters away the specified expansion
727
  // character is.  This needs to take into consideration newlines and
728
  // trigraphs.
729
75.8k
  bool Invalid = false;
730
75.8k
  const char *TokPtr = SM.getCharacterData(TokStart, &Invalid);
731
732
  // If they request the first char of the token, we're trivially done.
733
75.8k
  if (Invalid || (CharNo == 0 && 
Lexer::isObviouslySimpleCharacter(*TokPtr)2.88k
))
734
2.87k
    return 0;
735
736
72.9k
  unsigned PhysOffset = 0;
737
738
  // The usual case is that tokens don't contain anything interesting.  Skip
739
  // over the uninteresting characters.  If a token only consists of simple
740
  // chars, this method is extremely fast.
741
846k
  while (Lexer::isObviouslySimpleCharacter(*TokPtr)) {
742
845k
    if (CharNo == 0)
743
71.2k
      return PhysOffset;
744
773k
    ++TokPtr;
745
773k
    --CharNo;
746
773k
    ++PhysOffset;
747
773k
  }
748
749
  // If we have a character that may be a trigraph or escaped newline, use a
750
  // lexer to parse it correctly.
751
11.8k
  
for (; 1.70k
CharNo;
--CharNo10.1k
) {
752
10.1k
    unsigned Size;
753
10.1k
    Lexer::getCharAndSizeNoWarn(TokPtr, Size, LangOpts);
754
10.1k
    TokPtr += Size;
755
10.1k
    PhysOffset += Size;
756
10.1k
  }
757
758
  // Final detail: if we end up on an escaped newline, we want to return the
759
  // location of the actual byte of the token.  For example foo\<newline>bar
760
  // advanced by 3 should return the location of b, not of \\.  One compounding
761
  // detail of this is that the escape may be made by a trigraph.
762
1.70k
  if (!Lexer::isObviouslySimpleCharacter(*TokPtr))
763
1.22k
    PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr;
764
765
1.70k
  return PhysOffset;
766
72.9k
}
767
768
/// Computes the source location just past the end of the
769
/// token at this source location.
770
///
771
/// This routine can be used to produce a source location that
772
/// points just past the end of the token referenced by \p Loc, and
773
/// is generally used when a diagnostic needs to point just after a
774
/// token where it expected something different that it received. If
775
/// the returned source location would not be meaningful (e.g., if
776
/// it points into a macro), this routine returns an invalid
777
/// source location.
778
///
779
/// \param Offset an offset from the end of the token, where the source
780
/// location should refer to. The default offset (0) produces a source
781
/// location pointing just past the end of the token; an offset of 1 produces
782
/// a source location pointing to the last character in the token, etc.
783
SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
784
                                          const SourceManager &SM,
785
4.02M
                                          const LangOptions &LangOpts) {
786
4.02M
  if (Loc.isInvalid())
787
89
    return {};
788
789
4.02M
  if (Loc.isMacroID()) {
790
1.43k
    if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
791
1.22k
      return {}; // Points inside the macro expansion.
792
1.43k
  }
793
794
4.02M
  unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
795
4.02M
  if (Len > Offset)
796
4.02M
    Len = Len - Offset;
797
375
  else
798
375
    return Loc;
799
800
4.02M
  return Loc.getLocWithOffset(Len);
801
4.02M
}
802
803
/// Returns true if the given MacroID location points at the first
804
/// token of the macro expansion.
805
bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc,
806
                                      const SourceManager &SM,
807
                                      const LangOptions &LangOpts,
808
53.6M
                                      SourceLocation *MacroBegin) {
809
53.6M
  assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");
810
811
0
  SourceLocation expansionLoc;
812
53.6M
  if (!SM.isAtStartOfImmediateMacroExpansion(loc, &expansionLoc))
813
29.6M
    return false;
814
815
23.9M
  if (expansionLoc.isFileID()) {
816
    // No other macro expansions, this is the first.
817
6.25M
    if (MacroBegin)
818
196
      *MacroBegin = expansionLoc;
819
6.25M
    return true;
820
6.25M
  }
821
822
17.6M
  return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts, MacroBegin);
823
23.9M
}
824
825
/// Returns true if the given MacroID location points at the last
826
/// token of the macro expansion.
827
bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc,
828
                                    const SourceManager &SM,
829
                                    const LangOptions &LangOpts,
830
49.1M
                                    SourceLocation *MacroEnd) {
831
49.1M
  assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");
832
833
0
  SourceLocation spellLoc = SM.getSpellingLoc(loc);
834
49.1M
  unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts);
835
49.1M
  if (tokLen == 0)
836
0
    return false;
837
838
49.1M
  SourceLocation afterLoc = loc.getLocWithOffset(tokLen);
839
49.1M
  SourceLocation expansionLoc;
840
49.1M
  if (!SM.isAtEndOfImmediateMacroExpansion(afterLoc, &expansionLoc))
841
4.19M
    return false;
842
843
44.9M
  if (expansionLoc.isFileID()) {
844
    // No other macro expansions.
845
31.7M
    if (MacroEnd)
846
298
      *MacroEnd = expansionLoc;
847
31.7M
    return true;
848
31.7M
  }
849
850
13.2M
  return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts, MacroEnd);
851
44.9M
}
852
853
static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range,
854
                                             const SourceManager &SM,
855
3.90M
                                             const LangOptions &LangOpts) {
856
3.90M
  SourceLocation Begin = Range.getBegin();
857
3.90M
  SourceLocation End = Range.getEnd();
858
3.90M
  assert(Begin.isFileID() && End.isFileID());
859
3.90M
  if (Range.isTokenRange()) {
860
3.90M
    End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts);
861
3.90M
    if (End.isInvalid())
862
0
      return {};
863
3.90M
  }
864
865
  // Break down the source locations.
866
3.90M
  FileID FID;
867
3.90M
  unsigned BeginOffs;
868
3.90M
  std::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin);
869
3.90M
  if (FID.isInvalid())
870
0
    return {};
871
872
3.90M
  unsigned EndOffs;
873
3.90M
  if (!SM.isInFileID(End, FID, &EndOffs) ||
874
3.90M
      BeginOffs > EndOffs)
875
0
    return {};
876
877
3.90M
  return CharSourceRange::getCharRange(Begin, End);
878
3.90M
}
879
880
// Assumes that `Loc` is in an expansion.
881
static bool isInExpansionTokenRange(const SourceLocation Loc,
882
88
                                    const SourceManager &SM) {
883
88
  return SM.getSLocEntry(SM.getFileID(Loc))
884
88
      .getExpansion()
885
88
      .isExpansionTokenRange();
886
88
}
887
888
CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range,
889
                                         const SourceManager &SM,
890
3.90M
                                         const LangOptions &LangOpts) {
891
3.90M
  SourceLocation Begin = Range.getBegin();
892
3.90M
  SourceLocation End = Range.getEnd();
893
3.90M
  if (Begin.isInvalid() || 
End.isInvalid()3.90M
)
894
8
    return {};
895
896
3.90M
  if (Begin.isFileID() && 
End.isFileID()3.90M
)
897
3.90M
    return makeRangeFromFileLocs(Range, SM, LangOpts);
898
899
296
  if (Begin.isMacroID() && 
End.isFileID()288
) {
900
71
    if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin))
901
1
      return {};
902
70
    Range.setBegin(Begin);
903
70
    return makeRangeFromFileLocs(Range, SM, LangOpts);
904
71
  }
905
906
225
  if (Begin.isFileID() && 
End.isMacroID()8
) {
907
8
    if (Range.isTokenRange()) {
908
8
      if (!isAtEndOfMacroExpansion(End, SM, LangOpts, &End))
909
0
        return {};
910
      // Use the *original* end, not the expanded one in `End`.
911
8
      Range.setTokenRange(isInExpansionTokenRange(Range.getEnd(), SM));
912
8
    } else 
if (0
!isAtStartOfMacroExpansion(End, SM, LangOpts, &End)0
)
913
0
      return {};
914
8
    Range.setEnd(End);
915
8
    return makeRangeFromFileLocs(Range, SM, LangOpts);
916
8
  }
917
918
217
  assert(Begin.isMacroID() && End.isMacroID());
919
0
  SourceLocation MacroBegin, MacroEnd;
920
217
  if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) &&
921
217
      
(100
(100
Range.isTokenRange()100
&& isAtEndOfMacroExpansion(End, SM, LangOpts,
922
98
                                                        &MacroEnd)) ||
923
100
       
(20
Range.isCharRange()20
&& isAtStartOfMacroExpansion(End, SM, LangOpts,
924
80
                                                         &MacroEnd)))) {
925
80
    Range.setBegin(MacroBegin);
926
80
    Range.setEnd(MacroEnd);
927
    // Use the *original* `End`, not the expanded one in `MacroEnd`.
928
80
    if (Range.isTokenRange())
929
80
      Range.setTokenRange(isInExpansionTokenRange(End, SM));
930
80
    return makeRangeFromFileLocs(Range, SM, LangOpts);
931
80
  }
932
933
137
  bool Invalid = false;
934
137
  const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(SM.getFileID(Begin),
935
137
                                                        &Invalid);
936
137
  if (Invalid)
937
0
    return {};
938
939
137
  if (BeginEntry.getExpansion().isMacroArgExpansion()) {
940
116
    const SrcMgr::SLocEntry &EndEntry = SM.getSLocEntry(SM.getFileID(End),
941
116
                                                        &Invalid);
942
116
    if (Invalid)
943
0
      return {};
944
945
116
    if (EndEntry.getExpansion().isMacroArgExpansion() &&
946
116
        BeginEntry.getExpansion().getExpansionLocStart() ==
947
115
            EndEntry.getExpansion().getExpansionLocStart()) {
948
114
      Range.setBegin(SM.getImmediateSpellingLoc(Begin));
949
114
      Range.setEnd(SM.getImmediateSpellingLoc(End));
950
114
      return makeFileCharRange(Range, SM, LangOpts);
951
114
    }
952
116
  }
953
954
23
  return {};
955
137
}
956
957
StringRef Lexer::getSourceText(CharSourceRange Range,
958
                               const SourceManager &SM,
959
                               const LangOptions &LangOpts,
960
3.90M
                               bool *Invalid) {
961
3.90M
  Range = makeFileCharRange(Range, SM, LangOpts);
962
3.90M
  if (Range.isInvalid()) {
963
9
    if (Invalid) 
*Invalid = true2
;
964
9
    return {};
965
9
  }
966
967
  // Break down the source location.
968
3.90M
  std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin());
969
3.90M
  if (beginInfo.first.isInvalid()) {
970
0
    if (Invalid) *Invalid = true;
971
0
    return {};
972
0
  }
973
974
3.90M
  unsigned EndOffs;
975
3.90M
  if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) ||
976
3.90M
      beginInfo.second > EndOffs) {
977
0
    if (Invalid) *Invalid = true;
978
0
    return {};
979
0
  }
980
981
  // Try to the load the file buffer.
982
3.90M
  bool invalidTemp = false;
983
3.90M
  StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp);
984
3.90M
  if (invalidTemp) {
985
0
    if (Invalid) *Invalid = true;
986
0
    return {};
987
0
  }
988
989
3.90M
  if (Invalid) 
*Invalid = false264
;
990
3.90M
  return file.substr(beginInfo.second, EndOffs - beginInfo.second);
991
3.90M
}
992
993
StringRef Lexer::getImmediateMacroName(SourceLocation Loc,
994
                                       const SourceManager &SM,
995
6.33k
                                       const LangOptions &LangOpts) {
996
6.33k
  assert(Loc.isMacroID() && "Only reasonable to call this on macros");
997
998
  // Find the location of the immediate macro expansion.
999
20.6k
  while (true) {
1000
20.6k
    FileID FID = SM.getFileID(Loc);
1001
20.6k
    const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID);
1002
20.6k
    const SrcMgr::ExpansionInfo &Expansion = E->getExpansion();
1003
20.6k
    Loc = Expansion.getExpansionLocStart();
1004
20.6k
    if (!Expansion.isMacroArgExpansion())
1005
4.17k
      break;
1006
1007
    // For macro arguments we need to check that the argument did not come
1008
    // from an inner macro, e.g: "MAC1( MAC2(foo) )"
1009
1010
    // Loc points to the argument id of the macro definition, move to the
1011
    // macro expansion.
1012
16.4k
    Loc = SM.getImmediateExpansionRange(Loc).getBegin();
1013
16.4k
    SourceLocation SpellLoc = Expansion.getSpellingLoc();
1014
16.4k
    if (SpellLoc.isFileID())
1015
2.16k
      break; // No inner macro.
1016
1017
    // If spelling location resides in the same FileID as macro expansion
1018
    // location, it means there is no inner macro.
1019
14.3k
    FileID MacroFID = SM.getFileID(Loc);
1020
14.3k
    if (SM.isInFileID(SpellLoc, MacroFID))
1021
1
      break;
1022
1023
    // Argument came from inner macro.
1024
14.3k
    Loc = SpellLoc;
1025
14.3k
  }
1026
1027
  // Find the spelling location of the start of the non-argument expansion
1028
  // range. This is where the macro name was spelled in order to begin
1029
  // expanding this macro.
1030
6.33k
  Loc = SM.getSpellingLoc(Loc);
1031
1032
  // Dig out the buffer where the macro name was spelled and the extents of the
1033
  // name so that we can render it into the expansion note.
1034
6.33k
  std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
1035
6.33k
  unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
1036
6.33k
  StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
1037
6.33k
  return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
1038
6.33k
}
1039
1040
StringRef Lexer::getImmediateMacroNameForDiagnostics(
1041
653
    SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) {
1042
653
  assert(Loc.isMacroID() && "Only reasonable to call this on macros");
1043
  // Walk past macro argument expansions.
1044
653
  while (SM.isMacroArgExpansion(Loc))
1045
0
    Loc = SM.getImmediateExpansionRange(Loc).getBegin();
1046
1047
  // If the macro's spelling has no FileID, then it's actually a token paste
1048
  // or stringization (or similar) and not a macro at all.
1049
653
  if (!SM.getFileEntryForID(SM.getFileID(SM.getSpellingLoc(Loc))))
1050
63
    return {};
1051
1052
  // Find the spelling location of the start of the non-argument expansion
1053
  // range. This is where the macro name was spelled in order to begin
1054
  // expanding this macro.
1055
590
  Loc = SM.getSpellingLoc(SM.getImmediateExpansionRange(Loc).getBegin());
1056
1057
  // Dig out the buffer where the macro name was spelled and the extents of the
1058
  // name so that we can render it into the expansion note.
1059
590
  std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
1060
590
  unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
1061
590
  StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
1062
590
  return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
1063
653
}
1064
1065
1.38k
bool Lexer::isIdentifierBodyChar(char c, const LangOptions &LangOpts) {
1066
1.38k
  return isIdentifierBody(c, LangOpts.DollarIdents);
1067
1.38k
}
1068
1069
10.4k
bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) {
1070
10.4k
  assert(isVerticalWhitespace(Str[0]));
1071
10.4k
  if (Str - 1 < BufferStart)
1072
2
    return false;
1073
1074
10.4k
  if ((Str[0] == '\n' && 
Str[-1] == '\r'10.3k
) ||
1075
10.4k
      
(10.3k
Str[0] == '\r'10.3k
&&
Str[-1] == '\n'18
)) {
1076
18
    if (Str - 2 < BufferStart)
1077
2
      return false;
1078
16
    --Str;
1079
16
  }
1080
10.4k
  --Str;
1081
1082
  // Rewind to first non-space character:
1083
10.5k
  while (Str > BufferStart && 
isHorizontalWhitespace(*Str)10.5k
)
1084
106
    --Str;
1085
1086
10.4k
  return *Str == '\\';
1087
10.4k
}
1088
1089
StringRef Lexer::getIndentationForLine(SourceLocation Loc,
1090
174
                                       const SourceManager &SM) {
1091
174
  if (Loc.isInvalid() || Loc.isMacroID())
1092
0
    return {};
1093
174
  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
1094
174
  if (LocInfo.first.isInvalid())
1095
0
    return {};
1096
174
  bool Invalid = false;
1097
174
  StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
1098
174
  if (Invalid)
1099
0
    return {};
1100
174
  const char *Line = findBeginningOfLine(Buffer, LocInfo.second);
1101
174
  if (!Line)
1102
0
    return {};
1103
174
  StringRef Rest = Buffer.substr(Line - Buffer.data());
1104
174
  size_t NumWhitespaceChars = Rest.find_first_not_of(" \t");
1105
174
  return NumWhitespaceChars == StringRef::npos
1106
174
             ? 
""0
1107
174
             : Rest.take_front(NumWhitespaceChars);
1108
174
}
1109
1110
//===----------------------------------------------------------------------===//
1111
// Diagnostics forwarding code.
1112
//===----------------------------------------------------------------------===//
1113
1114
/// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the
1115
/// lexer buffer was all expanded at a single point, perform the mapping.
1116
/// This is currently only used for _Pragma implementation, so it is the slow
1117
/// path of the hot getSourceLocation method.  Do not allow it to be inlined.
1118
static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc(
1119
    Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen);
1120
static SourceLocation GetMappedTokenLoc(Preprocessor &PP,
1121
                                        SourceLocation FileLoc,
1122
3.38M
                                        unsigned CharNo, unsigned TokLen) {
1123
3.38M
  assert(FileLoc.isMacroID() && "Must be a macro expansion");
1124
1125
  // Otherwise, we're lexing "mapped tokens".  This is used for things like
1126
  // _Pragma handling.  Combine the expansion location of FileLoc with the
1127
  // spelling location.
1128
0
  SourceManager &SM = PP.getSourceManager();
1129
1130
  // Create a new SLoc which is expanded from Expansion(FileLoc) but whose
1131
  // characters come from spelling(FileLoc)+Offset.
1132
3.38M
  SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc);
1133
3.38M
  SpellingLoc = SpellingLoc.getLocWithOffset(CharNo);
1134
1135
  // Figure out the expansion loc range, which is the range covered by the
1136
  // original _Pragma(...) sequence.
1137
3.38M
  CharSourceRange II = SM.getImmediateExpansionRange(FileLoc);
1138
1139
3.38M
  return SM.createExpansionLoc(SpellingLoc, II.getBegin(), II.getEnd(), TokLen);
1140
3.38M
}
1141
1142
/// getSourceLocation - Return a source location identifier for the specified
1143
/// offset in the current file.
1144
SourceLocation Lexer::getSourceLocation(const char *Loc,
1145
2.58G
                                        unsigned TokLen) const {
1146
2.58G
  assert(Loc >= BufferStart && Loc <= BufferEnd &&
1147
2.58G
         "Location out of range for this buffer!");
1148
1149
  // In the normal case, we're just lexing from a simple file buffer, return
1150
  // the file id from FileLoc with the offset specified.
1151
0
  unsigned CharNo = Loc-BufferStart;
1152
2.58G
  if (FileLoc.isFileID())
1153
2.57G
    return FileLoc.getLocWithOffset(CharNo);
1154
1155
  // Otherwise, this is the _Pragma lexer case, which pretends that all of the
1156
  // tokens are lexed from where the _Pragma was defined.
1157
3.38M
  assert(PP && "This doesn't work on raw lexers");
1158
0
  return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen);
1159
2.58G
}
1160
1161
/// Diag - Forwarding function for diagnostics.  This translate a source
1162
/// position in the current buffer into a SourceLocation object for rendering.
1163
93.1k
DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const {
1164
93.1k
  return PP->Diag(getSourceLocation(Loc), DiagID);
1165
93.1k
}
1166
1167
//===----------------------------------------------------------------------===//
1168
// Trigraph and Escaped Newline Handling Code.
1169
//===----------------------------------------------------------------------===//
1170
1171
/// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair,
1172
/// return the decoded trigraph letter it corresponds to, or '\0' if nothing.
1173
276
static char GetTrigraphCharForLetter(char Letter) {
1174
276
  switch (Letter) {
1175
123
  default:   return 0;
1176
41
  case '=':  return '#';
1177
31
  case ')':  return ']';
1178
30
  case '(':  return '[';
1179
6
  case '!':  return '|';
1180
1
  case '\'': return '^';
1181
4
  case '>':  return '}';
1182
34
  case '/':  return '\\';
1183
3
  case '<':  return '{';
1184
3
  case '-':  return '~';
1185
276
  }
1186
276
}
1187
1188
/// DecodeTrigraphChar - If the specified character is a legal trigraph when
1189
/// prefixed with ??, emit a trigraph warning.  If trigraphs are enabled,
1190
/// return the result character.  Finally, emit a warning about trigraph use
1191
/// whether trigraphs are enabled or not.
1192
248
static char DecodeTrigraphChar(const char *CP, Lexer *L) {
1193
248
  char Res = GetTrigraphCharForLetter(*CP);
1194
248
  if (!Res || 
!L125
)
return Res140
;
1195
1196
108
  if (!L->getLangOpts().Trigraphs) {
1197
51
    if (!L->isLexingRawMode())
1198
26
      L->Diag(CP-2, diag::trigraph_ignored);
1199
51
    return 0;
1200
51
  }
1201
1202
57
  if (!L->isLexingRawMode())
1203
46
    L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1);
1204
57
  return Res;
1205
108
}
1206
1207
/// getEscapedNewLineSize - Return the size of the specified escaped newline,
1208
/// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a
1209
/// trigraph equivalent on entry to this function.
1210
18.5M
unsigned Lexer::getEscapedNewLineSize(const char *Ptr) {
1211
18.5M
  unsigned Size = 0;
1212
18.5M
  while (isWhitespace(Ptr[Size])) {
1213
18.5M
    ++Size;
1214
1215
18.5M
    if (Ptr[Size-1] != '\n' && 
Ptr[Size-1] != '\r'327
)
1216
205
      continue;
1217
1218
    // If this is a \r\n or \n\r, skip the other half.
1219
18.5M
    if ((Ptr[Size] == '\r' || 
Ptr[Size] == '\n'18.5M
) &&
1220
18.5M
        
Ptr[Size-1] != Ptr[Size]158
)
1221
122
      ++Size;
1222
1223
18.5M
    return Size;
1224
18.5M
  }
1225
1226
  // Not an escaped newline, must be a \t or something else.
1227
1.23k
  return 0;
1228
18.5M
}
1229
1230
/// SkipEscapedNewLines - If P points to an escaped newline (or a series of
1231
/// them), skip over them and return the first non-escaped-newline found,
1232
/// otherwise return P.
1233
1.22k
const char *Lexer::SkipEscapedNewLines(const char *P) {
1234
1.23k
  while (true) {
1235
1.23k
    const char *AfterEscape;
1236
1.23k
    if (*P == '\\') {
1237
1.22k
      AfterEscape = P+1;
1238
1.22k
    } else 
if (15
*P == '?'15
) {
1239
      // If not a trigraph for escape, bail out.
1240
1
      if (P[1] != '?' || 
P[2] != '/'0
)
1241
1
        return P;
1242
      // FIXME: Take LangOpts into account; the language might not
1243
      // support trigraphs.
1244
0
      AfterEscape = P+3;
1245
14
    } else {
1246
14
      return P;
1247
14
    }
1248
1249
1.22k
    unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape);
1250
1.22k
    if (NewLineSize == 0) 
return P1.20k
;
1251
14
    P = AfterEscape+NewLineSize;
1252
14
  }
1253
1.22k
}
1254
1255
Optional<Token> Lexer::findNextToken(SourceLocation Loc,
1256
                                     const SourceManager &SM,
1257
519
                                     const LangOptions &LangOpts) {
1258
519
  if (Loc.isMacroID()) {
1259
2
    if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
1260
2
      return None;
1261
2
  }
1262
517
  Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts);
1263
1264
  // Break down the source location.
1265
517
  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
1266
1267
  // Try to load the file buffer.
1268
517
  bool InvalidTemp = false;
1269
517
  StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp);
1270
517
  if (InvalidTemp)
1271
0
    return None;
1272
1273
517
  const char *TokenBegin = File.data() + LocInfo.second;
1274
1275
  // Lex from the start of the given location.
1276
517
  Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(),
1277
517
                                      TokenBegin, File.end());
1278
  // Find the token.
1279
517
  Token Tok;
1280
517
  lexer.LexFromRawLexer(Tok);
1281
517
  return Tok;
1282
517
}
1283
1284
/// Checks that the given token is the first token that occurs after the
1285
/// given location (this excludes comments and whitespace). Returns the location
1286
/// immediately after the specified token. If the token is not found or the
1287
/// location is inside a macro, the returned source location will be invalid.
1288
SourceLocation Lexer::findLocationAfterToken(
1289
    SourceLocation Loc, tok::TokenKind TKind, const SourceManager &SM,
1290
483
    const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine) {
1291
483
  Optional<Token> Tok = findNextToken(Loc, SM, LangOpts);
1292
483
  if (!Tok || 
Tok->isNot(TKind)481
)
1293
60
    return {};
1294
423
  SourceLocation TokenLoc = Tok->getLocation();
1295
1296
  // Calculate how much whitespace needs to be skipped if any.
1297
423
  unsigned NumWhitespaceChars = 0;
1298
423
  if (SkipTrailingWhitespaceAndNewLine) {
1299
235
    const char *TokenEnd = SM.getCharacterData(TokenLoc) + Tok->getLength();
1300
235
    unsigned char C = *TokenEnd;
1301
463
    while (isHorizontalWhitespace(C)) {
1302
228
      C = *(++TokenEnd);
1303
228
      NumWhitespaceChars++;
1304
228
    }
1305
1306
    // Skip \r, \n, \r\n, or \n\r
1307
235
    if (C == '\n' || 
C == '\r'132
) {
1308
104
      char PrevC = C;
1309
104
      C = *(++TokenEnd);
1310
104
      NumWhitespaceChars++;
1311
104
      if ((C == '\n' || 
C == '\r'103
) &&
C != PrevC1
)
1312
1
        NumWhitespaceChars++;
1313
104
    }
1314
235
  }
1315
1316
423
  return TokenLoc.getLocWithOffset(Tok->getLength() + NumWhitespaceChars);
1317
483
}
1318
1319
/// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
1320
/// get its size, and return it.  This is tricky in several cases:
1321
///   1. If currently at the start of a trigraph, we warn about the trigraph,
1322
///      then either return the trigraph (skipping 3 chars) or the '?',
1323
///      depending on whether trigraphs are enabled or not.
1324
///   2. If this is an escaped newline (potentially with whitespace between
1325
///      the backslash and newline), implicitly skip the newline and return
1326
///      the char after it.
1327
///
1328
/// This handles the slow/uncommon case of the getCharAndSize method.  Here we
1329
/// know that we can accumulate into Size, and that we have already incremented
1330
/// Ptr by Size bytes.
1331
///
1332
/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
1333
/// be updated to match.
1334
char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
1335
29.1M
                               Token *Tok) {
1336
  // If we have a slash, look for an escaped newline.
1337
29.1M
  if (Ptr[0] == '\\') {
1338
14.6M
    ++Size;
1339
14.6M
    ++Ptr;
1340
14.6M
Slash:
1341
    // Common case, backslash-char where the char is not whitespace.
1342
14.6M
    if (!isWhitespace(Ptr[0])) 
return '\\'257k
;
1343
1344
    // See if we have optional whitespace characters between the slash and
1345
    // newline.
1346
14.3M
    if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
1347
      // Remember that this token needs to be cleaned.
1348
14.3M
      if (Tok) 
Tok->setFlag(Token::NeedsCleaning)14.3M
;
1349
1350
      // Warn if there was whitespace between the backslash and newline.
1351
14.3M
      if (Ptr[0] != '\n' && 
Ptr[0] != '\r'143
&&
Tok23
&&
!isLexingRawMode()16
)
1352
8
        Diag(Ptr, diag::backslash_newline_space);
1353
1354
      // Found backslash<whitespace><newline>.  Parse the char after it.
1355
14.3M
      Size += EscapedNewLineSize;
1356
14.3M
      Ptr  += EscapedNewLineSize;
1357
1358
      // Use slow version to accumulate a correct size field.
1359
14.3M
      return getCharAndSizeSlow(Ptr, Size, Tok);
1360
14.3M
    }
1361
1362
    // Otherwise, this is not an escaped newline, just return the slash.
1363
31
    return '\\';
1364
14.3M
  }
1365
1366
  // If this is a trigraph, process it.
1367
14.5M
  if (Ptr[0] == '?' && 
Ptr[1] == '?'194k
) {
1368
    // If this is actually a legal trigraph (not something like "??x"), emit
1369
    // a trigraph warning.  If so, and if trigraphs are enabled, return it.
1370
248
    if (char C = DecodeTrigraphChar(Ptr+2, Tok ? this : nullptr)) {
1371
      // Remember that this token needs to be cleaned.
1372
74
      if (Tok) 
Tok->setFlag(Token::NeedsCleaning)57
;
1373
1374
74
      Ptr += 3;
1375
74
      Size += 3;
1376
74
      if (C == '\\') 
goto Slash18
;
1377
56
      return C;
1378
74
    }
1379
248
  }
1380
1381
  // If this is neither, return a single character.
1382
14.5M
  ++Size;
1383
14.5M
  return *Ptr;
1384
14.5M
}
1385
1386
/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
1387
/// getCharAndSizeNoWarn method.  Here we know that we can accumulate into Size,
1388
/// and that we have already incremented Ptr by Size bytes.
1389
///
1390
/// NOTE: When this method is updated, getCharAndSizeSlow (above) should
1391
/// be updated to match.
1392
char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
1393
8.44M
                                     const LangOptions &LangOpts) {
1394
  // If we have a slash, look for an escaped newline.
1395
8.44M
  if (Ptr[0] == '\\') {
1396
4.22M
    ++Size;
1397
4.22M
    ++Ptr;
1398
4.22M
Slash:
1399
    // Common case, backslash-char where the char is not whitespace.
1400
4.22M
    if (!isWhitespace(Ptr[0])) 
return '\\'1.69k
;
1401
1402
    // See if we have optional whitespace characters followed by a newline.
1403
4.21M
    if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
1404
      // Found backslash<whitespace><newline>.  Parse the char after it.
1405
4.21M
      Size += EscapedNewLineSize;
1406
4.21M
      Ptr  += EscapedNewLineSize;
1407
1408
      // Use slow version to accumulate a correct size field.
1409
4.21M
      return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts);
1410
4.21M
    }
1411
1412
    // Otherwise, this is not an escaped newline, just return the slash.
1413
0
    return '\\';
1414
4.21M
  }
1415
1416
  // If this is a trigraph, process it.
1417
4.21M
  if (LangOpts.Trigraphs && 
Ptr[0] == '?'14.3k
&&
Ptr[1] == '?'28
) {
1418
    // If this is actually a legal trigraph (not something like "??x"), return
1419
    // it.
1420
28
    if (char C = GetTrigraphCharForLetter(Ptr[2])) {
1421
28
      Ptr += 3;
1422
28
      Size += 3;
1423
28
      if (C == '\\') 
goto Slash8
;
1424
20
      return C;
1425
28
    }
1426
28
  }
1427
1428
  // If this is neither, return a single character.
1429
4.21M
  ++Size;
1430
4.21M
  return *Ptr;
1431
4.21M
}
1432
1433
//===----------------------------------------------------------------------===//
1434
// Helper methods for lexing.
1435
//===----------------------------------------------------------------------===//
1436
1437
/// Routine that indiscriminately sets the offset into the source file.
1438
421
void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) {
1439
421
  BufferPtr = BufferStart + Offset;
1440
421
  if (BufferPtr > BufferEnd)
1441
0
    BufferPtr = BufferEnd;
1442
  // FIXME: What exactly does the StartOfLine bit mean?  There are two
1443
  // possible meanings for the "start" of the line: the first token on the
1444
  // unexpanded line, or the first token on the expanded line.
1445
421
  IsAtStartOfLine = StartOfLine;
1446
421
  IsAtPhysicalStartOfLine = StartOfLine;
1447
421
}
1448
1449
212
static bool isUnicodeWhitespace(uint32_t Codepoint) {
1450
212
  static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
1451
212
      UnicodeWhitespaceCharRanges);
1452
212
  return UnicodeWhitespaceChars.contains(Codepoint);
1453
212
}
1454
1455
537
static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts) {
1456
537
  if (LangOpts.AsmPreprocessor) {
1457
2
    return false;
1458
535
  } else if (LangOpts.DollarIdents && '$' == C) {
1459
3
    return true;
1460
532
  } else if (LangOpts.CPlusPlus) {
1461
    // A non-leading codepoint must have the XID_Continue property.
1462
    // XIDContinueRanges doesn't contains characters also in XIDStartRanges,
1463
    // so we need to check both tables.
1464
    // '_' doesn't have the XID_Continue property but is allowed in C++.
1465
168
    static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
1466
168
    static const llvm::sys::UnicodeCharSet XIDContinueChars(XIDContinueRanges);
1467
168
    return C == '_' || XIDStartChars.contains(C) ||
1468
168
           
XIDContinueChars.contains(C)60
;
1469
364
  } else if (LangOpts.C11) {
1470
223
    static const llvm::sys::UnicodeCharSet C11AllowedIDChars(
1471
223
        C11AllowedIDCharRanges);
1472
223
    return C11AllowedIDChars.contains(C);
1473
223
  } else {
1474
141
    static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
1475
141
        C99AllowedIDCharRanges);
1476
141
    return C99AllowedIDChars.contains(C);
1477
141
  }
1478
537
}
1479
1480
252
static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts) {
1481
252
  if (LangOpts.AsmPreprocessor) {
1482
0
    return false;
1483
0
  }
1484
252
  if (LangOpts.CPlusPlus) {
1485
105
    static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
1486
    // '_' doesn't have the XID_Start property but is allowed in C++.
1487
105
    return C == '_' || XIDStartChars.contains(C);
1488
105
  }
1489
147
  if (!isAllowedIDChar(C, LangOpts))
1490
59
    return false;
1491
88
  if (LangOpts.C11) {
1492
65
    static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars(
1493
65
        C11DisallowedInitialIDCharRanges);
1494
65
    return !C11DisallowedInitialIDChars.contains(C);
1495
65
  }
1496
23
  static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
1497
23
      C99DisallowedInitialIDCharRanges);
1498
23
  return !C99DisallowedInitialIDChars.contains(C);
1499
88
}
1500
1501
static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin,
1502
579
                                            const char *End) {
1503
579
  return CharSourceRange::getCharRange(L.getSourceLocation(Begin),
1504
579
                                       L.getSourceLocation(End));
1505
579
}
1506
1507
static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C,
1508
321
                                      CharSourceRange Range, bool IsFirst) {
1509
  // Check C99 compatibility.
1510
321
  if (!Diags.isIgnored(diag::warn_c99_compat_unicode_id, Range.getBegin())) {
1511
12
    enum {
1512
12
      CannotAppearInIdentifier = 0,
1513
12
      CannotStartIdentifier
1514
12
    };
1515
1516
12
    static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
1517
12
        C99AllowedIDCharRanges);
1518
12
    static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
1519
12
        C99DisallowedInitialIDCharRanges);
1520
12
    if (!C99AllowedIDChars.contains(C)) {
1521
5
      Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
1522
5
        << Range
1523
5
        << CannotAppearInIdentifier;
1524
7
    } else if (IsFirst && 
C99DisallowedInitialIDChars.contains(C)3
) {
1525
2
      Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
1526
2
        << Range
1527
2
        << CannotStartIdentifier;
1528
2
    }
1529
12
  }
1530
321
}
1531
1532
/// After encountering UTF-8 character C and interpreting it as an identifier
1533
/// character, check whether it's a homoglyph for a common non-identifier
1534
/// source character that is unlikely to be an intentional identifier
1535
/// character and warn if so.
1536
static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C,
1537
212
                                       CharSourceRange Range) {
1538
  // FIXME: Handle Unicode quotation marks (smart quotes, fullwidth quotes).
1539
212
  struct HomoglyphPair {
1540
212
    uint32_t Character;
1541
212
    char LooksLike;
1542
1.25k
    bool operator<(HomoglyphPair R) const { return Character < R.Character; }
1543
212
  };
1544
212
  static constexpr HomoglyphPair SortedHomoglyphs[] = {
1545
212
    {U'\u00ad', 0},   // SOFT HYPHEN
1546
212
    {U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK
1547
212
    {U'\u037e', ';'}, // GREEK QUESTION MARK
1548
212
    {U'\u200b', 0},   // ZERO WIDTH SPACE
1549
212
    {U'\u200c', 0},   // ZERO WIDTH NON-JOINER
1550
212
    {U'\u200d', 0},   // ZERO WIDTH JOINER
1551
212
    {U'\u2060', 0},   // WORD JOINER
1552
212
    {U'\u2061', 0},   // FUNCTION APPLICATION
1553
212
    {U'\u2062', 0},   // INVISIBLE TIMES
1554
212
    {U'\u2063', 0},   // INVISIBLE SEPARATOR
1555
212
    {U'\u2064', 0},   // INVISIBLE PLUS
1556
212
    {U'\u2212', '-'}, // MINUS SIGN
1557
212
    {U'\u2215', '/'}, // DIVISION SLASH
1558
212
    {U'\u2216', '\\'}, // SET MINUS
1559
212
    {U'\u2217', '*'}, // ASTERISK OPERATOR
1560
212
    {U'\u2223', '|'}, // DIVIDES
1561
212
    {U'\u2227', '^'}, // LOGICAL AND
1562
212
    {U'\u2236', ':'}, // RATIO
1563
212
    {U'\u223c', '~'}, // TILDE OPERATOR
1564
212
    {U'\ua789', ':'}, // MODIFIER LETTER COLON
1565
212
    {U'\ufeff', 0},   // ZERO WIDTH NO-BREAK SPACE
1566
212
    {U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK
1567
212
    {U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN
1568
212
    {U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN
1569
212
    {U'\uff05', '%'}, // FULLWIDTH PERCENT SIGN
1570
212
    {U'\uff06', '&'}, // FULLWIDTH AMPERSAND
1571
212
    {U'\uff08', '('}, // FULLWIDTH LEFT PARENTHESIS
1572
212
    {U'\uff09', ')'}, // FULLWIDTH RIGHT PARENTHESIS
1573
212
    {U'\uff0a', '*'}, // FULLWIDTH ASTERISK
1574
212
    {U'\uff0b', '+'}, // FULLWIDTH ASTERISK
1575
212
    {U'\uff0c', ','}, // FULLWIDTH COMMA
1576
212
    {U'\uff0d', '-'}, // FULLWIDTH HYPHEN-MINUS
1577
212
    {U'\uff0e', '.'}, // FULLWIDTH FULL STOP
1578
212
    {U'\uff0f', '/'}, // FULLWIDTH SOLIDUS
1579
212
    {U'\uff1a', ':'}, // FULLWIDTH COLON
1580
212
    {U'\uff1b', ';'}, // FULLWIDTH SEMICOLON
1581
212
    {U'\uff1c', '<'}, // FULLWIDTH LESS-THAN SIGN
1582
212
    {U'\uff1d', '='}, // FULLWIDTH EQUALS SIGN
1583
212
    {U'\uff1e', '>'}, // FULLWIDTH GREATER-THAN SIGN
1584
212
    {U'\uff1f', '?'}, // FULLWIDTH QUESTION MARK
1585
212
    {U'\uff20', '@'}, // FULLWIDTH COMMERCIAL AT
1586
212
    {U'\uff3b', '['}, // FULLWIDTH LEFT SQUARE BRACKET
1587
212
    {U'\uff3c', '\\'}, // FULLWIDTH REVERSE SOLIDUS
1588
212
    {U'\uff3d', ']'}, // FULLWIDTH RIGHT SQUARE BRACKET
1589
212
    {U'\uff3e', '^'}, // FULLWIDTH CIRCUMFLEX ACCENT
1590
212
    {U'\uff5b', '{'}, // FULLWIDTH LEFT CURLY BRACKET
1591
212
    {U'\uff5c', '|'}, // FULLWIDTH VERTICAL LINE
1592
212
    {U'\uff5d', '}'}, // FULLWIDTH RIGHT CURLY BRACKET
1593
212
    {U'\uff5e', '~'}, // FULLWIDTH TILDE
1594
212
    {0, 0}
1595
212
  };
1596
212
  auto Homoglyph =
1597
212
      std::lower_bound(std::begin(SortedHomoglyphs),
1598
212
                       std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'});
1599
212
  if (Homoglyph->Character == C) {
1600
39
    llvm::SmallString<5> CharBuf;
1601
39
    {
1602
39
      llvm::raw_svector_ostream CharOS(CharBuf);
1603
39
      llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4);
1604
39
    }
1605
39
    if (Homoglyph->LooksLike) {
1606
32
      const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};
1607
32
      Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)
1608
32
          << Range << CharBuf << LooksLikeStr;
1609
32
    } else {
1610
7
      Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width)
1611
7
          << Range << CharBuf;
1612
7
    }
1613
39
  }
1614
212
}
1615
1616
static void diagnoseInvalidUnicodeCodepointInIdentifier(
1617
    DiagnosticsEngine &Diags, const LangOptions &LangOpts, uint32_t CodePoint,
1618
37
    CharSourceRange Range, bool IsFirst) {
1619
37
  if (isASCII(CodePoint))
1620
0
    return;
1621
1622
37
  bool IsIDStart = isAllowedInitiallyIDChar(CodePoint, LangOpts);
1623
37
  bool IsIDContinue = IsIDStart || isAllowedIDChar(CodePoint, LangOpts);
1624
1625
37
  if ((IsFirst && 
IsIDStart13
) || (!IsFirst &&
IsIDContinue24
))
1626
0
    return;
1627
1628
37
  bool InvalidOnlyAtStart = IsFirst && 
!IsIDStart13
&&
IsIDContinue13
;
1629
1630
37
  llvm::SmallString<5> CharBuf;
1631
37
  llvm::raw_svector_ostream CharOS(CharBuf);
1632
37
  llvm::write_hex(CharOS, CodePoint, llvm::HexPrintStyle::Upper, 4);
1633
1634
37
  if (!IsFirst || 
InvalidOnlyAtStart13
) {
1635
30
    Diags.Report(Range.getBegin(), diag::err_character_not_allowed_identifier)
1636
30
        << Range << CharBuf << int(InvalidOnlyAtStart)
1637
30
        << FixItHint::CreateRemoval(Range);
1638
30
  } else {
1639
7
    Diags.Report(Range.getBegin(), diag::err_character_not_allowed)
1640
7
        << Range << CharBuf << FixItHint::CreateRemoval(Range);
1641
7
  }
1642
37
}
1643
1644
bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
1645
148
                                    Token &Result) {
1646
148
  const char *UCNPtr = CurPtr + Size;
1647
148
  uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/nullptr);
1648
148
  if (CodePoint == 0) {
1649
29
    return false;
1650
29
  }
1651
1652
119
  if (!isAllowedIDChar(CodePoint, LangOpts)) {
1653
10
    if (isASCII(CodePoint) || 
isUnicodeWhitespace(CodePoint)8
)
1654
2
      return false;
1655
8
    if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
1656
8
        !PP->isPreprocessedOutput())
1657
8
      diagnoseInvalidUnicodeCodepointInIdentifier(
1658
8
          PP->getDiagnostics(), LangOpts, CodePoint,
1659
8
          makeCharRange(*this, CurPtr, UCNPtr),
1660
8
          /*IsFirst=*/false);
1661
1662
    // We got a unicode codepoint that is neither a space nor a
1663
    // a valid identifier part.
1664
    // Carry on as if the codepoint was valid for recovery purposes.
1665
109
  } else if (!isLexingRawMode())
1666
109
    maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
1667
109
                              makeCharRange(*this, CurPtr, UCNPtr),
1668
109
                              /*IsFirst=*/false);
1669
1670
117
  Result.setFlag(Token::HasUCN);
1671
117
  if ((UCNPtr - CurPtr ==  6 && 
CurPtr[1] == 'u'91
) ||
1672
117
      
(26
UCNPtr - CurPtr == 1026
&&
CurPtr[1] == 'U'17
))
1673
102
    CurPtr = UCNPtr;
1674
15
  else
1675
105
    
while (15
CurPtr != UCNPtr)
1676
90
      (void)getAndAdvanceChar(CurPtr, Result);
1677
117
  return true;
1678
119
}
1679
1680
249
bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) {
1681
249
  const char *UnicodePtr = CurPtr;
1682
249
  llvm::UTF32 CodePoint;
1683
249
  llvm::ConversionResult Result =
1684
249
      llvm::convertUTF8Sequence((const llvm::UTF8 **)&UnicodePtr,
1685
249
                                (const llvm::UTF8 *)BufferEnd,
1686
249
                                &CodePoint,
1687
249
                                llvm::strictConversion);
1688
249
  if (Result != llvm::conversionOK)
1689
15
    return false;
1690
1691
234
  if (!isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts)) {
1692
67
    if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint))
1693
8
      return false;
1694
1695
59
    if (!isLexingRawMode() && 
!ParsingPreprocessorDirective35
&&
1696
59
        
!PP->isPreprocessedOutput()35
)
1697
16
      diagnoseInvalidUnicodeCodepointInIdentifier(
1698
16
          PP->getDiagnostics(), LangOpts, CodePoint,
1699
16
          makeCharRange(*this, CurPtr, UnicodePtr), /*IsFirst=*/false);
1700
    // We got a unicode codepoint that is neither a space nor a
1701
    // a valid identifier part. Carry on as if the codepoint was
1702
    // valid for recovery purposes.
1703
167
  } else if (!isLexingRawMode()) {
1704
139
    maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
1705
139
                              makeCharRange(*this, CurPtr, UnicodePtr),
1706
139
                              /*IsFirst=*/false);
1707
139
    maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint,
1708
139
                               makeCharRange(*this, CurPtr, UnicodePtr));
1709
139
  }
1710
1711
226
  CurPtr = UnicodePtr;
1712
226
  return true;
1713
234
}
1714
1715
1.02G
bool Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
1716
  // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
1717
1.02G
  unsigned Size;
1718
1.02G
  unsigned char C = *CurPtr++;
1719
10.5G
  while (isIdentifierBody(C))
1720
9.53G
    C = *CurPtr++;
1721
1722
1.02G
  --CurPtr;   // Back up over the skipped character.
1723
1724
  // Fast path, no $,\,? in identifier found.  '\' might be an escaped newline
1725
  // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN.
1726
  //
1727
  // TODO: Could merge these checks into an InfoTable flag to make the
1728
  // comparison cheaper
1729
1.02G
  if (
isASCII(C)1.02G
&& C != '\\' &&
C != '?'1.02G
&&
1730
1.02G
      
(1.02G
C != '$'1.02G
||
!LangOpts.DollarIdents2.56k
)) {
1731
1.02G
FinishIdentifier:
1732
1.02G
    const char *IdStart = BufferPtr;
1733
1.02G
    FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
1734
1.02G
    Result.setRawIdentifierData(IdStart);
1735
1736
    // If we are in raw mode, return this identifier raw.  There is no need to
1737
    // look up identifier information or attempt to macro expand it.
1738
1.02G
    if (LexingRawMode)
1739
409M
      return true;
1740
1741
    // Fill in Result.IdentifierInfo and update the token kind,
1742
    // looking up the identifier in the identifier table.
1743
616M
    IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);
1744
    // Note that we have to call PP->LookUpIdentifierInfo() even for code
1745
    // completion, it writes IdentifierInfo into Result, and callers rely on it.
1746
1747
    // If the completion point is at the end of an identifier, we want to treat
1748
    // the identifier as incomplete even if it resolves to a macro or a keyword.
1749
    // This allows e.g. 'class^' to complete to 'classifier'.
1750
616M
    if (isCodeCompletionPoint(CurPtr)) {
1751
      // Return the code-completion token.
1752
85
      Result.setKind(tok::code_completion);
1753
      // Skip the code-completion char and all immediate identifier characters.
1754
      // This ensures we get consistent behavior when completing at any point in
1755
      // an identifier (i.e. at the start, in the middle, at the end). Note that
1756
      // only simple cases (i.e. [a-zA-Z0-9_]) are supported to keep the code
1757
      // simpler.
1758
85
      assert(*CurPtr == 0 && "Completion character must be 0");
1759
0
      ++CurPtr;
1760
      // Note that code completion token is not added as a separate character
1761
      // when the completion point is at the end of the buffer. Therefore, we need
1762
      // to check if the buffer has ended.
1763
85
      if (CurPtr < BufferEnd) {
1764
131
        while (isIdentifierBody(*CurPtr))
1765
47
          ++CurPtr;
1766
84
      }
1767
85
      BufferPtr = CurPtr;
1768
85
      return true;
1769
85
    }
1770
1771
    // Finally, now that we know we have an identifier, pass this off to the
1772
    // preprocessor, which may macro expand it or something.
1773
616M
    if (II->isHandleIdentifierCase())
1774
45.5M
      return PP->HandleIdentifier(Result);
1775
1776
570M
    return true;
1777
616M
  }
1778
1779
  // Otherwise, $,\,? in identifier found.  Enter slower path.
1780
1781
2.81k
  C = getCharAndSize(CurPtr, Size);
1782
11.8k
  while (
true9.09k
) {
1783
11.8k
    if (C == '$') {
1784
      // If we hit a $ and they are not supported in identifiers, we are done.
1785
2.94k
      if (!LangOpts.DollarIdents) 
goto FinishIdentifier0
;
1786
1787
      // Otherwise, emit a diagnostic and continue.
1788
2.94k
      if (!isLexingRawMode())
1789
2.94k
        Diag(CurPtr, diag::ext_dollar_in_identifier);
1790
2.94k
      CurPtr = ConsumeChar(CurPtr, Size, Result);
1791
2.94k
      C = getCharAndSize(CurPtr, Size);
1792
2.94k
      continue;
1793
8.89k
    } else if (C == '\\' && 
tryConsumeIdentifierUCN(CurPtr, Size, Result)117
) {
1794
94
      C = getCharAndSize(CurPtr, Size);
1795
94
      continue;
1796
8.79k
    } else if (!isASCII(C) && 
tryConsumeIdentifierUTF8Char(CurPtr)220
) {
1797
197
      C = getCharAndSize(CurPtr, Size);
1798
197
      continue;
1799
8.60k
    } else if (!isIdentifierBody(C)) {
1800
5.55k
      goto FinishIdentifier;
1801
5.55k
    }
1802
1803
    // Otherwise, this character is good, consume it.
1804
3.04k
    CurPtr = ConsumeChar(CurPtr, Size, Result);
1805
1806
3.04k
    C = getCharAndSize(CurPtr, Size);
1807
33.9k
    while (isIdentifierBody(C)) {
1808
30.9k
      CurPtr = ConsumeChar(CurPtr, Size, Result);
1809
30.9k
      C = getCharAndSize(CurPtr, Size);
1810
30.9k
    }
1811
3.04k
  }
1812
2.81k
}
1813
1814
/// isHexaLiteral - Return true if Start points to a hex constant.
1815
/// in microsoft mode (where this is supposed to be several different tokens).
1816
129k
bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) {
1817
129k
  unsigned Size;
1818
129k
  char C1 = Lexer::getCharAndSizeNoWarn(Start, Size, LangOpts);
1819
129k
  if (C1 != '0')
1820
128k
    return false;
1821
1.62k
  char C2 = Lexer::getCharAndSizeNoWarn(Start + Size, Size, LangOpts);
1822
1.62k
  return (C2 == 'x' || 
C2 == 'X'15
);
1823
129k
}
1824
1825
/// LexNumericConstant - Lex the remainder of a integer or floating point
1826
/// constant. From[-1] is the first character lexed.  Return the end of the
1827
/// constant.
1828
61.2M
bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
1829
61.2M
  unsigned Size;
1830
61.2M
  char C = getCharAndSize(CurPtr, Size);
1831
61.2M
  char PrevCh = 0;
1832
207M
  while (isPreprocessingNumberBody(C)) {
1833
146M
    CurPtr = ConsumeChar(CurPtr, Size, Result);
1834
146M
    PrevCh = C;
1835
146M
    C = getCharAndSize(CurPtr, Size);
1836
146M
  }
1837
1838
  // If we fell out, check for a sign, due to 1e+12.  If we have one, continue.
1839
61.2M
  if ((C == '-' || 
C == '+'60.3M
) &&
(1.25M
PrevCh == 'E'1.25M
||
PrevCh == 'e'1.25M
)) {
1840
    // If we are in Microsoft mode, don't continue if the constant is hex.
1841
    // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1
1842
1.14M
    if (!LangOpts.MicrosoftExt || 
!isHexaLiteral(BufferPtr, LangOpts)128k
)
1843
1.14M
      return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
1844
1.14M
  }
1845
1846
  // If we have a hex FP constant, continue.
1847
60.1M
  if ((C == '-' || 
C == '+'60.0M
) &&
(106k
PrevCh == 'P'106k
||
PrevCh == 'p'106k
)) {
1848
    // Outside C99 and C++17, we accept hexadecimal floating point numbers as a
1849
    // not-quite-conforming extension. Only do so if this looks like it's
1850
    // actually meant to be a hexfloat, and not if it has a ud-suffix.
1851
7.29k
    bool IsHexFloat = true;
1852
7.29k
    if (!LangOpts.C99) {
1853
1.61k
      if (!isHexaLiteral(BufferPtr, LangOpts))
1854
9
        IsHexFloat = false;
1855
1.60k
      else if (!getLangOpts().CPlusPlus17 &&
1856
1.60k
               
std::find(BufferPtr, CurPtr, '_') != CurPtr1.27k
)
1857
3
        IsHexFloat = false;
1858
1.61k
    }
1859
7.29k
    if (IsHexFloat)
1860
7.28k
      return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
1861
7.29k
  }
1862
1863
  // If we have a digit separator, continue.
1864
60.0M
  if (C == '\'' && 
(554
getLangOpts().CPlusPlus14554
||
getLangOpts().C2x72
)) {
1865
532
    unsigned NextSize;
1866
532
    char Next = getCharAndSizeNoWarn(CurPtr + Size, NextSize, getLangOpts());
1867
532
    if (isIdentifierBody(Next)) {
1868
514
      if (!isLexingRawMode())
1869
452
        Diag(CurPtr, getLangOpts().CPlusPlus
1870
452
                         ? 
diag::warn_cxx11_compat_digit_separator410
1871
452
                         : 
diag::warn_c2x_compat_digit_separator42
);
1872
514
      CurPtr = ConsumeChar(CurPtr, Size, Result);
1873
514
      CurPtr = ConsumeChar(CurPtr, NextSize, Result);
1874
514
      return LexNumericConstant(Result, CurPtr);
1875
514
    }
1876
532
  }
1877
1878
  // If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue.
1879
60.0M
  if (C == '\\' && 
tryConsumeIdentifierUCN(CurPtr, Size, Result)6
)
1880
5
    return LexNumericConstant(Result, CurPtr);
1881
60.0M
  if (!isASCII(C) && 
tryConsumeIdentifierUTF8Char(CurPtr)13
)
1882
13
    return LexNumericConstant(Result, CurPtr);
1883
1884
  // Update the location of token as well as BufferPtr.
1885
60.0M
  const char *TokStart = BufferPtr;
1886
60.0M
  FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
1887
60.0M
  Result.setLiteralData(TokStart);
1888
60.0M
  return true;
1889
60.0M
}
1890
1891
/// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes
1892
/// in C++11, or warn on a ud-suffix in C++98.
1893
const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
1894
9.31M
                               bool IsStringLiteral) {
1895
9.31M
  assert(getLangOpts().CPlusPlus);
1896
1897
  // Maximally munch an identifier.
1898
0
  unsigned Size;
1899
9.31M
  char C = getCharAndSize(CurPtr, Size);
1900
9.31M
  bool Consumed = false;
1901
1902
9.31M
  if (!isIdentifierHead(C)) {
1903
9.30M
    if (C == '\\' && 
tryConsumeIdentifierUCN(CurPtr, Size, Result)7
)
1904
0
      Consumed = true;
1905
9.30M
    else if (!isASCII(C) && 
tryConsumeIdentifierUTF8Char(CurPtr)0
)
1906
0
      Consumed = true;
1907
9.30M
    else
1908
9.30M
      return CurPtr;
1909
9.30M
  }
1910
1911
7.10k
  if (!getLangOpts().CPlusPlus11) {
1912
17
    if (!isLexingRawMode())
1913
8
      Diag(CurPtr,
1914
8
           C == '_' ? 
diag::warn_cxx11_compat_user_defined_literal3
1915
8
                    : 
diag::warn_cxx11_compat_reserved_user_defined_literal5
)
1916
8
        << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
1917
17
    return CurPtr;
1918
17
  }
1919
1920
  // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix
1921
  // that does not start with an underscore is ill-formed. As a conforming
1922
  // extension, we treat all such suffixes as if they had whitespace before
1923
  // them. We assume a suffix beginning with a UCN or UTF-8 character is more
1924
  // likely to be a ud-suffix than a macro, however, and accept that.
1925
7.10k
  
if (7.08k
!Consumed7.08k
) {
1926
7.10k
    bool IsUDSuffix = false;
1927
7.10k
    if (C == '_')
1928
370
      IsUDSuffix = true;
1929
6.73k
    else if (IsStringLiteral && 
getLangOpts().CPlusPlus146.72k
) {
1930
      // In C++1y, we need to look ahead a few characters to see if this is a
1931
      // valid suffix for a string literal or a numeric literal (this could be
1932
      // the 'operator""if' defining a numeric literal operator).
1933
537
      const unsigned MaxStandardSuffixLength = 3;
1934
537
      char Buffer[MaxStandardSuffixLength] = { C };
1935
537
      unsigned Consumed = Size;
1936
537
      unsigned Chars = 1;
1937
923
      while (true) {
1938
923
        unsigned NextSize;
1939
923
        char Next = getCharAndSizeNoWarn(CurPtr + Consumed, NextSize,
1940
923
                                         getLangOpts());
1941
923
        if (!isIdentifierBody(Next)) {
1942
          // End of suffix. Check whether this is on the allowed list.
1943
533
          const StringRef CompleteSuffix(Buffer, Chars);
1944
533
          IsUDSuffix = StringLiteralParser::isValidUDSuffix(getLangOpts(),
1945
533
                                                            CompleteSuffix);
1946
533
          break;
1947
533
        }
1948
1949
390
        if (Chars == MaxStandardSuffixLength)
1950
          // Too long: can't be a standard suffix.
1951
4
          break;
1952
1953
386
        Buffer[Chars++] = Next;
1954
386
        Consumed += NextSize;
1955
386
      }
1956
537
    }
1957
1958
7.10k
    if (!IsUDSuffix) {
1959
6.26k
      if (!isLexingRawMode())
1960
9
        Diag(CurPtr, getLangOpts().MSVCCompat
1961
9
                         ? 
diag::ext_ms_reserved_user_defined_literal0
1962
9
                         : diag::ext_reserved_user_defined_literal)
1963
9
          << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
1964
6.26k
      return CurPtr;
1965
6.26k
    }
1966
1967
839
    CurPtr = ConsumeChar(CurPtr, Size, Result);
1968
839
  }
1969
1970
818
  Result.setFlag(Token::HasUDSuffix);
1971
2.02k
  while (
true2.00k
) {
1972
2.02k
    C = getCharAndSize(CurPtr, Size);
1973
2.02k
    if (isIdentifierBody(C)) 
{ CurPtr = ConsumeChar(CurPtr, Size, Result); }1.15k
1974
873
    else if (C == '\\' && 
tryConsumeIdentifierUCN(CurPtr, Size, Result)18
)
{}18
1975
855
    else if (!isASCII(C) && 
tryConsumeIdentifierUTF8Char(CurPtr)16
)
{}16
1976
839
    else break;
1977
2.02k
  }
1978
1979
818
  return CurPtr;
1980
7.08k
}
1981
1982
/// LexStringLiteral - Lex the remainder of a string literal, after having lexed
1983
/// either " or L" or u8" or u" or U".
1984
bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
1985
13.3M
                             tok::TokenKind Kind) {
1986
13.3M
  const char *AfterQuote = CurPtr;
1987
  // Does this string contain the \0 character?
1988
13.3M
  const char *NulCharacter = nullptr;
1989
1990
13.3M
  if (!isLexingRawMode() &&
1991
13.3M
      
(11.1M
Kind == tok::utf8_string_literal11.1M
||
1992
11.1M
       
Kind == tok::utf16_string_literal11.1M
||
1993
11.1M
       
Kind == tok::utf32_string_literal11.1M
))
1994
566
    Diag(BufferPtr, getLangOpts().CPlusPlus
1995
566
           ? 
diag::warn_cxx98_compat_unicode_literal486
1996
566
           : 
diag::warn_c99_compat_unicode_literal80
);
1997
1998
13.3M
  char C = getAndAdvanceChar(CurPtr, Result);
1999
137M
  while (C != '"') {
2000
    // Skip escaped characters.  Escaped newlines will already be processed by
2001
    // getAndAdvanceChar.
2002
123M
    if (C == '\\')
2003
244k
      C = getAndAdvanceChar(CurPtr, Result);
2004
2005
123M
    if (C == '\n' || 
C == '\r'123M
|| // Newline.
2006
123M
        (C == 0 && 
CurPtr-1 == BufferEnd43
)) { // End of file.
2007
68
      if (!isLexingRawMode() && 
!LangOpts.AsmPreprocessor7
)
2008
5
        Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 1;
2009
68
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2010
68
      return true;
2011
68
    }
2012
2013
123M
    if (C == 0) {
2014
21
      if (isCodeCompletionPoint(CurPtr-1)) {
2015
12
        if (ParsingFilename)
2016
5
          codeCompleteIncludedFile(AfterQuote, CurPtr - 1, /*IsAngled=*/false);
2017
7
        else
2018
7
          PP->CodeCompleteNaturalLanguage();
2019
12
        FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
2020
12
        cutOffLexing();
2021
12
        return true;
2022
12
      }
2023
2024
9
      NulCharacter = CurPtr-1;
2025
9
    }
2026
123M
    C = getAndAdvanceChar(CurPtr, Result);
2027
123M
  }
2028
2029
  // If we are in C++11, lex the optional ud-suffix.
2030
13.3M
  if (getLangOpts().CPlusPlus)
2031
9.14M
    CurPtr = LexUDSuffix(Result, CurPtr, true);
2032
2033
  // If a nul character existed in the string, warn about it.
2034
13.3M
  if (NulCharacter && 
!isLexingRawMode()8
)
2035
2
    Diag(NulCharacter, diag::null_in_char_or_string) << 1;
2036
2037
  // Update the location of the token as well as the BufferPtr instance var.
2038
13.3M
  const char *TokStart = BufferPtr;
2039
13.3M
  FormTokenWithChars(Result, CurPtr, Kind);
2040
13.3M
  Result.setLiteralData(TokStart);
2041
13.3M
  return true;
2042
13.3M
}
2043
2044
/// LexRawStringLiteral - Lex the remainder of a raw string literal, after
2045
/// having lexed R", LR", u8R", uR", or UR".
2046
bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
2047
530
                                tok::TokenKind Kind) {
2048
  // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3:
2049
  //  Between the initial and final double quote characters of the raw string,
2050
  //  any transformations performed in phases 1 and 2 (trigraphs,
2051
  //  universal-character-names, and line splicing) are reverted.
2052
2053
530
  if (!isLexingRawMode())
2054
130
    Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal);
2055
2056
530
  unsigned PrefixLen = 0;
2057
2058
1.50k
  while (PrefixLen != 16 && 
isRawStringDelimBody(CurPtr[PrefixLen])1.50k
)
2059
979
    ++PrefixLen;
2060
2061
  // If the last character was not a '(', then we didn't lex a valid delimiter.
2062
530
  if (CurPtr[PrefixLen] != '(') {
2063
1
    if (!isLexingRawMode()) {
2064
1
      const char *PrefixEnd = &CurPtr[PrefixLen];
2065
1
      if (PrefixLen == 16) {
2066
1
        Diag(PrefixEnd, diag::err_raw_delim_too_long);
2067
1
      } else {
2068
0
        Diag(PrefixEnd, diag::err_invalid_char_raw_delim)
2069
0
          << StringRef(PrefixEnd, 1);
2070
0
      }
2071
1
    }
2072
2073
    // Search for the next '"' in hopes of salvaging the lexer. Unfortunately,
2074
    // it's possible the '"' was intended to be part of the raw string, but
2075
    // there's not much we can do about that.
2076
59
    while (true) {
2077
59
      char C = *CurPtr++;
2078
2079
59
      if (C == '"')
2080
1
        break;
2081
58
      if (C == 0 && 
CurPtr-1 == BufferEnd0
) {
2082
0
        --CurPtr;
2083
0
        break;
2084
0
      }
2085
58
    }
2086
2087
1
    FormTokenWithChars(Result, CurPtr, tok::unknown);
2088
1
    return true;
2089
1
  }
2090
2091
  // Save prefix and move CurPtr past it
2092
529
  const char *Prefix = CurPtr;
2093
529
  CurPtr += PrefixLen + 1; // skip over prefix and '('
2094
2095
8.81k
  while (true) {
2096
8.81k
    char C = *CurPtr++;
2097
2098
8.81k
    if (C == ')') {
2099
      // Check for prefix match and closing quote.
2100
533
      if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && 
CurPtr[PrefixLen] == '"'525
) {
2101
525
        CurPtr += PrefixLen + 1; // skip over prefix and '"'
2102
525
        break;
2103
525
      }
2104
8.27k
    } else if (C == 0 && 
CurPtr-1 == BufferEnd4
) { // End of file.
2105
4
      if (!isLexingRawMode())
2106
1
        Diag(BufferPtr, diag::err_unterminated_raw_string)
2107
1
          << StringRef(Prefix, PrefixLen);
2108
4
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2109
4
      return true;
2110
4
    }
2111
8.81k
  }
2112
2113
  // If we are in C++11, lex the optional ud-suffix.
2114
525
  if (getLangOpts().CPlusPlus)
2115
525
    CurPtr = LexUDSuffix(Result, CurPtr, true);
2116
2117
  // Update the location of token as well as BufferPtr.
2118
525
  const char *TokStart = BufferPtr;
2119
525
  FormTokenWithChars(Result, CurPtr, Kind);
2120
525
  Result.setLiteralData(TokStart);
2121
525
  return true;
2122
529
}
2123
2124
/// LexAngledStringLiteral - Lex the remainder of an angled string literal,
2125
/// after having lexed the '<' character.  This is used for #include filenames.
2126
1.55M
bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
2127
  // Does this string contain the \0 character?
2128
1.55M
  const char *NulCharacter = nullptr;
2129
1.55M
  const char *AfterLessPos = CurPtr;
2130
1.55M
  char C = getAndAdvanceChar(CurPtr, Result);
2131
28.8M
  while (C != '>') {
2132
    // Skip escaped characters.  Escaped newlines will already be processed by
2133
    // getAndAdvanceChar.
2134
27.3M
    if (C == '\\')
2135
6
      C = getAndAdvanceChar(CurPtr, Result);
2136
2137
27.3M
    if (isVerticalWhitespace(C) ||               // Newline.
2138
27.3M
        
(27.3M
C == 027.3M
&&
(CurPtr - 1 == BufferEnd)10
)) { // End of file.
2139
      // If the filename is unterminated, then it must just be a lone <
2140
      // character.  Return this as such.
2141
10
      FormTokenWithChars(Result, AfterLessPos, tok::less);
2142
10
      return true;
2143
10
    }
2144
2145
27.3M
    if (C == 0) {
2146
7
      if (isCodeCompletionPoint(CurPtr - 1)) {
2147
6
        codeCompleteIncludedFile(AfterLessPos, CurPtr - 1, /*IsAngled=*/true);
2148
6
        cutOffLexing();
2149
6
        FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
2150
6
        return true;
2151
6
      }
2152
1
      NulCharacter = CurPtr-1;
2153
1
    }
2154
27.3M
    C = getAndAdvanceChar(CurPtr, Result);
2155
27.3M
  }
2156
2157
  // If a nul character existed in the string, warn about it.
2158
1.55M
  if (NulCharacter && 
!isLexingRawMode()1
)
2159
1
    Diag(NulCharacter, diag::null_in_char_or_string) << 1;
2160
2161
  // Update the location of token as well as BufferPtr.
2162
1.55M
  const char *TokStart = BufferPtr;
2163
1.55M
  FormTokenWithChars(Result, CurPtr, tok::header_name);
2164
1.55M
  Result.setLiteralData(TokStart);
2165
1.55M
  return true;
2166
1.55M
}
2167
2168
void Lexer::codeCompleteIncludedFile(const char *PathStart,
2169
                                     const char *CompletionPoint,
2170
11
                                     bool IsAngled) {
2171
  // Completion only applies to the filename, after the last slash.
2172
11
  StringRef PartialPath(PathStart, CompletionPoint - PathStart);
2173
11
  llvm::StringRef SlashChars = LangOpts.MSVCCompat ? 
"/\\"1
:
"/"10
;
2174
11
  auto Slash = PartialPath.find_last_of(SlashChars);
2175
11
  StringRef Dir =
2176
11
      (Slash == StringRef::npos) ? 
""7
:
PartialPath.take_front(Slash)4
;
2177
11
  const char *StartOfFilename =
2178
11
      (Slash == StringRef::npos) ? 
PathStart7
:
PathStart + Slash + 14
;
2179
  // Code completion filter range is the filename only, up to completion point.
2180
11
  PP->setCodeCompletionIdentifierInfo(&PP->getIdentifierTable().get(
2181
11
      StringRef(StartOfFilename, CompletionPoint - StartOfFilename)));
2182
  // We should replace the characters up to the closing quote or closest slash,
2183
  // if any.
2184
70
  while (CompletionPoint < BufferEnd) {
2185
70
    char Next = *(CompletionPoint + 1);
2186
70
    if (Next == 0 || Next == '\r' || Next == '\n')
2187
0
      break;
2188
70
    ++CompletionPoint;
2189
70
    if (Next == (IsAngled ? 
'>'45
:
'"'25
))
2190
10
      break;
2191
60
    if (llvm::is_contained(SlashChars, Next))
2192
1
      break;
2193
60
  }
2194
2195
11
  PP->setCodeCompletionTokenRange(
2196
11
      FileLoc.getLocWithOffset(StartOfFilename - BufferStart),
2197
11
      FileLoc.getLocWithOffset(CompletionPoint - BufferStart));
2198
11
  PP->CodeCompleteIncludedFile(Dir, IsAngled);
2199
11
}
2200
2201
/// LexCharConstant - Lex the remainder of a character constant, after having
2202
/// lexed either ' or L' or u8' or u' or U'.
2203
bool Lexer::LexCharConstant(Token &Result, const char *CurPtr,
2204
642k
                            tok::TokenKind Kind) {
2205
  // Does this character contain the \0 character?
2206
642k
  const char *NulCharacter = nullptr;
2207
2208
642k
  if (!isLexingRawMode()) {
2209
613k
    if (Kind == tok::utf16_char_constant || 
Kind == tok::utf32_char_constant613k
)
2210
174
      Diag(BufferPtr, getLangOpts().CPlusPlus
2211
174
                          ? 
diag::warn_cxx98_compat_unicode_literal160
2212
174
                          : 
diag::warn_c99_compat_unicode_literal14
);
2213
613k
    else if (Kind == tok::utf8_char_constant)
2214
148
      Diag(BufferPtr, diag::warn_cxx14_compat_u8_character_literal);
2215
613k
  }
2216
2217
642k
  char C = getAndAdvanceChar(CurPtr, Result);
2218
642k
  if (C == '\'') {
2219
28
    if (!isLexingRawMode() && 
!LangOpts.AsmPreprocessor2
)
2220
0
      Diag(BufferPtr, diag::ext_empty_character);
2221
28
    FormTokenWithChars(Result, CurPtr, tok::unknown);
2222
28
    return true;
2223
28
  }
2224
2225
2.87M
  
while (642k
C != '\'') {
2226
    // Skip escaped characters.
2227
2.22M
    if (C == '\\')
2228
10.4k
      C = getAndAdvanceChar(CurPtr, Result);
2229
2230
2.22M
    if (C == '\n' || 
C == '\r'2.22M
|| // Newline.
2231
2.22M
        
(2.22M
C == 02.22M
&&
CurPtr-1 == BufferEnd19
)) { // End of file.
2232
54
      if (!isLexingRawMode() && 
!LangOpts.AsmPreprocessor7
)
2233
4
        Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 0;
2234
54
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2235
54
      return true;
2236
54
    }
2237
2238
2.22M
    if (C == 0) {
2239
14
      if (isCodeCompletionPoint(CurPtr-1)) {
2240
6
        PP->CodeCompleteNaturalLanguage();
2241
6
        FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2242
6
        cutOffLexing();
2243
6
        return true;
2244
6
      }
2245
2246
8
      NulCharacter = CurPtr-1;
2247
8
    }
2248
2.22M
    C = getAndAdvanceChar(CurPtr, Result);
2249
2.22M
  }
2250
2251
  // If we are in C++11, lex the optional ud-suffix.
2252
642k
  if (getLangOpts().CPlusPlus)
2253
168k
    CurPtr = LexUDSuffix(Result, CurPtr, false);
2254
2255
  // If a nul character existed in the character, warn about it.
2256
642k
  if (NulCharacter && 
!isLexingRawMode()8
)
2257
2
    Diag(NulCharacter, diag::null_in_char_or_string) << 0;
2258
2259
  // Update the location of token as well as BufferPtr.
2260
642k
  const char *TokStart = BufferPtr;
2261
642k
  FormTokenWithChars(Result, CurPtr, Kind);
2262
642k
  Result.setLiteralData(TokStart);
2263
642k
  return true;
2264
642k
}
2265
2266
/// SkipWhitespace - Efficiently skip over a series of whitespace characters.
2267
/// Update BufferPtr to point to the next non-whitespace character and return.
2268
///
2269
/// This method forms a token and returns true if KeepWhitespaceMode is enabled.
2270
bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr,
2271
196M
                           bool &TokAtPhysicalStartOfLine) {
2272
  // Whitespace - Skip it, then return the token after the whitespace.
2273
196M
  bool SawNewline = isVerticalWhitespace(CurPtr[-1]);
2274
2275
196M
  unsigned char Char = *CurPtr;
2276
2277
196M
  const char *lastNewLine = nullptr;
2278
214M
  auto setLastNewLine = [&](const char *Ptr) {
2279
214M
    lastNewLine = Ptr;
2280
214M
    if (!NewLinePtr)
2281
181M
      NewLinePtr = Ptr;
2282
214M
  };
2283
196M
  if (SawNewline)
2284
191M
    setLastNewLine(CurPtr - 1);
2285
2286
  // Skip consecutive spaces efficiently.
2287
219M
  while (true) {
2288
    // Skip horizontal whitespace very aggressively.
2289
514M
    while (isHorizontalWhitespace(Char))
2290
295M
      Char = *++CurPtr;
2291
2292
    // Otherwise if we have something other than whitespace, we're done.
2293
219M
    if (!isVerticalWhitespace(Char))
2294
196M
      break;
2295
2296
22.9M
    if (ParsingPreprocessorDirective) {
2297
      // End of preprocessor directive line, let LexTokenInternal handle this.
2298
1.39k
      BufferPtr = CurPtr;
2299
1.39k
      return false;
2300
1.39k
    }
2301
2302
    // OK, but handle newline.
2303
22.9M
    if (*CurPtr == '\n')
2304
22.9M
      setLastNewLine(CurPtr);
2305
22.9M
    SawNewline = true;
2306
22.9M
    Char = *++CurPtr;
2307
22.9M
  }
2308
2309
  // If the client wants us to return whitespace, return it now.
2310
196M
  if (isKeepWhitespaceMode()) {
2311
68.9k
    FormTokenWithChars(Result, CurPtr, tok::unknown);
2312
68.9k
    if (SawNewline) {
2313
68.2k
      IsAtStartOfLine = true;
2314
68.2k
      IsAtPhysicalStartOfLine = true;
2315
68.2k
    }
2316
    // FIXME: The next token will not have LeadingSpace set.
2317
68.9k
    return true;
2318
68.9k
  }
2319
2320
  // If this isn't immediately after a newline, there is leading space.
2321
196M
  char PrevChar = CurPtr[-1];
2322
196M
  bool HasLeadingSpace = !isVerticalWhitespace(PrevChar);
2323
2324
196M
  Result.setFlagValue(Token::LeadingSpace, HasLeadingSpace);
2325
196M
  if (SawNewline) {
2326
191M
    Result.setFlag(Token::StartOfLine);
2327
191M
    TokAtPhysicalStartOfLine = true;
2328
2329
191M
    if (NewLinePtr && 
lastNewLine191M
&&
NewLinePtr != lastNewLine191M
&&
PP28.1M
) {
2330
27.4M
      if (auto *Handler = PP->getEmptylineHandler())
2331
184
        Handler->HandleEmptyline(SourceRange(getSourceLocation(NewLinePtr + 1),
2332
184
                                             getSourceLocation(lastNewLine)));
2333
27.4M
    }
2334
191M
  }
2335
2336
196M
  BufferPtr = CurPtr;
2337
196M
  return false;
2338
196M
}
2339
2340
/// We have just read the // characters from input.  Skip until we find the
2341
/// newline character that terminates the comment.  Then update BufferPtr and
2342
/// return.
2343
///
2344
/// If we're in KeepCommentMode or any CommentHandler has inserted
2345
/// some tokens, this will store the first token and return true.
2346
bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
2347
45.9M
                            bool &TokAtPhysicalStartOfLine) {
2348
  // If Line comments aren't explicitly enabled for this language, emit an
2349
  // extension warning.
2350
45.9M
  if (!LangOpts.LineComment && 
!isLexingRawMode()7.15k
) {
2351
3.63k
    Diag(BufferPtr, diag::ext_line_comment);
2352
2353
    // Mark them enabled so we only emit one warning for this translation
2354
    // unit.
2355
3.63k
    LangOpts.LineComment = true;
2356
3.63k
  }
2357
2358
  // Scan over the body of the comment.  The common case, when scanning, is that
2359
  // the comment contains normal ascii characters with nothing interesting in
2360
  // them.  As such, optimize for this case with the inner loop.
2361
  //
2362
  // This loop terminates with CurPtr pointing at the newline (or end of buffer)
2363
  // character that ends the line comment.
2364
45.9M
  char C;
2365
46.0M
  while (true) {
2366
46.0M
    C = *CurPtr;
2367
    // Skip over characters in the fast loop.
2368
2.92G
    while (C != 0 &&                // Potentially EOF.
2369
2.92G
           
C != '\n'2.92G
&&
C != '\r'2.87G
) // Newline or DOS-style newline.
2370
2.87G
      C = *++CurPtr;
2371
2372
46.0M
    const char *NextLine = CurPtr;
2373
46.0M
    if (C != 0) {
2374
      // We found a newline, see if it's escaped.
2375
46.0M
      const char *EscapePtr = CurPtr-1;
2376
46.0M
      bool HasSpace = false;
2377
46.1M
      while (isHorizontalWhitespace(*EscapePtr)) { // Skip whitespace.
2378
17.9k
        --EscapePtr;
2379
17.9k
        HasSpace = true;
2380
17.9k
      }
2381
2382
46.0M
      if (*EscapePtr == '\\')
2383
        // Escaped newline.
2384
119k
        CurPtr = EscapePtr;
2385
45.9M
      else if (EscapePtr[0] == '/' && 
EscapePtr[-1] == '?'3.74M
&&
2386
45.9M
               
EscapePtr[-2] == '?'8
&&
LangOpts.Trigraphs8
)
2387
        // Trigraph-escaped newline.
2388
3
        CurPtr = EscapePtr-2;
2389
45.9M
      else
2390
45.9M
        break; // This is a newline, we're done.
2391
2392
      // If there was space between the backslash and newline, warn about it.
2393
119k
      if (HasSpace && 
!isLexingRawMode()8
)
2394
6
        Diag(EscapePtr, diag::backslash_newline_space);
2395
119k
    }
2396
2397
    // Otherwise, this is a hard case.  Fall back on getAndAdvanceChar to
2398
    // properly decode the character.  Read it in raw mode to avoid emitting
2399
    // diagnostics about things like trigraphs.  If we see an escaped newline,
2400
    // we'll handle it below.
2401
124k
    const char *OldPtr = CurPtr;
2402
124k
    bool OldRawMode = isLexingRawMode();
2403
124k
    LexingRawMode = true;
2404
124k
    C = getAndAdvanceChar(CurPtr, Result);
2405
124k
    LexingRawMode = OldRawMode;
2406
2407
    // If we only read only one character, then no special handling is needed.
2408
    // We're done and can skip forward to the newline.
2409
124k
    if (C != 0 && 
CurPtr == OldPtr+1119k
) {
2410
0
      CurPtr = NextLine;
2411
0
      break;
2412
0
    }
2413
2414
    // If we read multiple characters, and one of those characters was a \r or
2415
    // \n, then we had an escaped newline within the comment.  Emit diagnostic
2416
    // unless the next line is also a // comment.
2417
124k
    if (CurPtr != OldPtr + 1 && 
C != '/'119k
&&
2418
124k
        
(4.94k
CurPtr == BufferEnd + 14.94k
||
CurPtr[0] != '/'4.93k
)) {
2419
9.85k
      for (; OldPtr != CurPtr; 
++OldPtr4.93k
)
2420
9.85k
        if (OldPtr[0] == '\n' || 
OldPtr[0] == '\r'4.93k
) {
2421
          // Okay, we found a // comment that ends in a newline, if the next
2422
          // line is also a // comment, but has spaces, don't emit a diagnostic.
2423
4.92k
          if (isWhitespace(C)) {
2424
4.71k
            const char *ForwardPtr = CurPtr;
2425
63.6k
            while (isWhitespace(*ForwardPtr))  // Skip whitespace.
2426
58.9k
              ++ForwardPtr;
2427
4.71k
            if (ForwardPtr[0] == '/' && 
ForwardPtr[1] == '/'4.19k
)
2428
4.19k
              break;
2429
4.71k
          }
2430
2431
729
          if (!isLexingRawMode())
2432
659
            Diag(OldPtr-1, diag::ext_multi_line_line_comment);
2433
729
          break;
2434
4.92k
        }
2435
4.92k
    }
2436
2437
124k
    if (
C == '\r'124k
|| C == '\n' ||
CurPtr == BufferEnd + 1124k
) {
2438
5.15k
      --CurPtr;
2439
5.15k
      break;
2440
5.15k
    }
2441
2442
119k
    if (C == '\0' && 
isCodeCompletionPoint(CurPtr-1)9
) {
2443
8
      PP->CodeCompleteNaturalLanguage();
2444
8
      cutOffLexing();
2445
8
      return false;
2446
8
    }
2447
119k
  }
2448
2449
  // Found but did not consume the newline.  Notify comment handlers about the
2450
  // comment unless we're in a #if 0 block.
2451
45.9M
  if (PP && 
!isLexingRawMode()45.8M
&&
2452
45.9M
      PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
2453
42.3M
                                            getSourceLocation(CurPtr)))) {
2454
0
    BufferPtr = CurPtr;
2455
0
    return true; // A token has to be returned.
2456
0
  }
2457
2458
  // If we are returning comments as tokens, return this comment as a token.
2459
45.9M
  if (inKeepCommentMode())
2460
43.4k
    return SaveLineComment(Result, CurPtr);
2461
2462
  // If we are inside a preprocessor directive and we see the end of line,
2463
  // return immediately, so that the lexer can return this as an EOD token.
2464
45.9M
  if (ParsingPreprocessorDirective || 
CurPtr == BufferEnd45.1M
) {
2465
740k
    BufferPtr = CurPtr;
2466
740k
    return false;
2467
740k
  }
2468
2469
  // Otherwise, eat the \n character.  We don't care if this is a \n\r or
2470
  // \r\n sequence.  This is an efficiency hack (because we know the \n can't
2471
  // contribute to another token), it isn't needed for correctness.  Note that
2472
  // this is ok even in KeepWhitespaceMode, because we would have returned the
2473
  /// comment above in that mode.
2474
45.1M
  NewLinePtr = CurPtr++;
2475
2476
  // The next returned token is at the start of the line.
2477
45.1M
  Result.setFlag(Token::StartOfLine);
2478
45.1M
  TokAtPhysicalStartOfLine = true;
2479
  // No leading whitespace seen so far.
2480
45.1M
  Result.clearFlag(Token::LeadingSpace);
2481
45.1M
  BufferPtr = CurPtr;
2482
45.1M
  return false;
2483
45.9M
}
2484
2485
/// If in save-comment mode, package up this Line comment in an appropriate
2486
/// way and return it.
2487
43.4k
bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) {
2488
  // If we're not in a preprocessor directive, just return the // comment
2489
  // directly.
2490
43.4k
  FormTokenWithChars(Result, CurPtr, tok::comment);
2491
2492
43.4k
  if (!ParsingPreprocessorDirective || 
LexingRawMode2
)
2493
43.4k
    return true;
2494
2495
  // If this Line-style comment is in a macro definition, transmogrify it into
2496
  // a C-style block comment.
2497
2
  bool Invalid = false;
2498
2
  std::string Spelling = PP->getSpelling(Result, &Invalid);
2499
2
  if (Invalid)
2500
0
    return true;
2501
2502
2
  assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?");
2503
0
  Spelling[1] = '*';   // Change prefix to "/*".
2504
2
  Spelling += "*/";    // add suffix.
2505
2506
2
  Result.setKind(tok::comment);
2507
2
  PP->CreateString(Spelling, Result,
2508
2
                   Result.getLocation(), Result.getLocation());
2509
2
  return true;
2510
2
}
2511
2512
/// isBlockCommentEndOfEscapedNewLine - Return true if the specified newline
2513
/// character (either \\n or \\r) is part of an escaped newline sequence.  Issue
2514
/// a diagnostic if so.  We know that the newline is inside of a block comment.
2515
static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr,
2516
64.7k
                                                  Lexer *L) {
2517
64.7k
  assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');
2518
2519
  // Position of the first trigraph in the ending sequence.
2520
0
  const char *TrigraphPos = 0;
2521
  // Position of the first whitespace after a '\' in the ending sequence.
2522
64.7k
  const char *SpacePos = 0;
2523
2524
64.7k
  while (true) {
2525
    // Back up off the newline.
2526
64.7k
    --CurPtr;
2527
2528
    // If this is a two-character newline sequence, skip the other character.
2529
64.7k
    if (CurPtr[0] == '\n' || 
CurPtr[0] == '\r'5.69k
) {
2530
      // \n\n or \r\r -> not escaped newline.
2531
59.1k
      if (CurPtr[0] == CurPtr[1])
2532
59.1k
        return false;
2533
      // \n\r or \r\n -> skip the newline.
2534
0
      --CurPtr;
2535
0
    }
2536
2537
    // If we have horizontal whitespace, skip over it.  We allow whitespace
2538
    // between the slash and newline.
2539
5.74k
    
while (5.69k
isHorizontalWhitespace(*CurPtr) ||
*CurPtr == 05.69k
) {
2540
53
      SpacePos = CurPtr;
2541
53
      --CurPtr;
2542
53
    }
2543
2544
    // If we have a slash, this is an escaped newline.
2545
5.69k
    if (*CurPtr == '\\') {
2546
18
      --CurPtr;
2547
5.67k
    } else if (CurPtr[0] == '/' && 
CurPtr[-1] == '?'672
&&
CurPtr[-2] == '?'15
) {
2548
      // This is a trigraph encoding of a slash.
2549
15
      TrigraphPos = CurPtr - 2;
2550
15
      CurPtr -= 3;
2551
5.65k
    } else {
2552
5.65k
      return false;
2553
5.65k
    }
2554
2555
    // If the character preceding the escaped newline is a '*', then after line
2556
    // splicing we have a '*/' ending the comment.
2557
33
    if (*CurPtr == '*')
2558
18
      break;
2559
2560
15
    if (*CurPtr != '\n' && 
*CurPtr != '\r'0
)
2561
0
      return false;
2562
15
  }
2563
2564
18
  if (TrigraphPos) {
2565
    // If no trigraphs are enabled, warn that we ignored this trigraph and
2566
    // ignore this * character.
2567
10
    if (!L->getLangOpts().Trigraphs) {
2568
0
      if (!L->isLexingRawMode())
2569
0
        L->Diag(TrigraphPos, diag::trigraph_ignored_block_comment);
2570
0
      return false;
2571
0
    }
2572
10
    if (!L->isLexingRawMode())
2573
10
      L->Diag(TrigraphPos, diag::trigraph_ends_block_comment);
2574
10
  }
2575
2576
  // Warn about having an escaped newline between the */ characters.
2577
18
  if (!L->isLexingRawMode())
2578
15
    L->Diag(CurPtr + 1, diag::escaped_newline_block_comment_end);
2579
2580
  // If there was space between the backslash and newline, warn about it.
2581
18
  if (SpacePos && !L->isLexingRawMode())
2582
15
    L->Diag(SpacePos, diag::backslash_newline_space);
2583
2584
18
  return true;
2585
18
}
2586
2587
#ifdef __SSE2__
2588
#include <emmintrin.h>
2589
#elif __ALTIVEC__
2590
#include <altivec.h>
2591
#undef bool
2592
#endif
2593
2594
/// We have just read from input the / and * characters that started a comment.
2595
/// Read until we find the * and / characters that terminate the comment.
2596
/// Note that we don't bother decoding trigraphs or escaped newlines in block
2597
/// comments, because they cannot cause the comment to end.  The only thing
2598
/// that can happen is the comment could end with an escaped newline between
2599
/// the terminating * and /.
2600
///
2601
/// If we're in KeepCommentMode or any CommentHandler has inserted
2602
/// some tokens, this will store the first token and return true.
2603
bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
2604
32.7M
                             bool &TokAtPhysicalStartOfLine) {
2605
  // Scan one character past where we should, looking for a '/' character.  Once
2606
  // we find it, check to see if it was preceded by a *.  This common
2607
  // optimization helps people who like to put a lot of * characters in their
2608
  // comments.
2609
2610
  // The first character we get with newlines and trigraphs skipped to handle
2611
  // the degenerate /*/ case below correctly if the * has an escaped newline
2612
  // after it.
2613
32.7M
  unsigned CharSize;
2614
32.7M
  unsigned char C = getCharAndSize(CurPtr, CharSize);
2615
32.7M
  CurPtr += CharSize;
2616
32.7M
  if (C == 0 && 
CurPtr == BufferEnd+14
) {
2617
3
    if (!isLexingRawMode())
2618
0
      Diag(BufferPtr, diag::err_unterminated_block_comment);
2619
3
    --CurPtr;
2620
2621
    // KeepWhitespaceMode should return this broken comment as a token.  Since
2622
    // it isn't a well formed comment, just return it as an 'unknown' token.
2623
3
    if (isKeepWhitespaceMode()) {
2624
2
      FormTokenWithChars(Result, CurPtr, tok::unknown);
2625
2
      return true;
2626
2
    }
2627
2628
1
    BufferPtr = CurPtr;
2629
1
    return false;
2630
3
  }
2631
2632
  // Check to see if the first character after the '/*' is another /.  If so,
2633
  // then this slash does not end the block comment, it is part of it.
2634
32.7M
  if (C == '/')
2635
230
    C = *CurPtr++;
2636
2637
37.6M
  while (true) {
2638
    // Skip over all non-interesting characters until we find end of buffer or a
2639
    // (probably ending) '/' character.
2640
37.6M
    if (CurPtr + 24 < BufferEnd &&
2641
        // If there is a code-completion point avoid the fast scan because it
2642
        // doesn't check for '\0'.
2643
37.6M
        
!(37.3M
PP37.3M
&&
PP->getCodeCompletionFileLoc() == FileLoc37.3M
)) {
2644
      // While not aligned to a 16-byte boundary.
2645
293M
      while (C != '/' && 
((intptr_t)CurPtr & 0x0F) != 0291M
)
2646
256M
        C = *CurPtr++;
2647
2648
37.3M
      if (C == '/') 
goto FoundSlash2.46M
;
2649
2650
34.9M
#ifdef __SSE2__
2651
34.9M
      __m128i Slashes = _mm_set1_epi8('/');
2652
262M
      while (CurPtr+16 <= BufferEnd) {
2653
262M
        int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr,
2654
262M
                                    Slashes));
2655
262M
        if (cmp != 0) {
2656
          // Adjust the pointer to point directly after the first slash. It's
2657
          // not necessary to set C here, it will be overwritten at the end of
2658
          // the outer loop.
2659
34.8M
          CurPtr += llvm::countTrailingZeros<unsigned>(cmp) + 1;
2660
34.8M
          goto FoundSlash;
2661
34.8M
        }
2662
228M
        CurPtr += 16;
2663
228M
      }
2664
#elif __ALTIVEC__
2665
      __vector unsigned char Slashes = {
2666
        '/', '/', '/', '/',  '/', '/', '/', '/',
2667
        '/', '/', '/', '/',  '/', '/', '/', '/'
2668
      };
2669
      while (CurPtr + 16 <= BufferEnd &&
2670
             !vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes))
2671
        CurPtr += 16;
2672
#else
2673
      // Scan for '/' quickly.  Many block comments are very large.
2674
      while (CurPtr[0] != '/' &&
2675
             CurPtr[1] != '/' &&
2676
             CurPtr[2] != '/' &&
2677
             CurPtr[3] != '/' &&
2678
             CurPtr+4 < BufferEnd) {
2679
        CurPtr += 4;
2680
      }
2681
#endif
2682
2683
      // It has to be one of the bytes scanned, increment to it and read one.
2684
44.9k
      C = *CurPtr++;
2685
44.9k
    }
2686
2687
    // Loop to scan the remainder.
2688
4.29M
    
while (287k
C != '/' &&
C != '\0'4.00M
)
2689
4.00M
      C = *CurPtr++;
2690
2691
287k
    if (C == '/') {
2692
37.6M
  FoundSlash:
2693
37.6M
      if (CurPtr[-2] == '*')  // We found the final */.  We're done!
2694
32.7M
        break;
2695
2696
4.88M
      if ((CurPtr[-2] == '\n' || 
CurPtr[-2] == '\r'4.82M
)) {
2697
64.7k
        if (isEndOfBlockCommentWithEscapedNewLine(CurPtr-2, this)) {
2698
          // We found the final */, though it had an escaped newline between the
2699
          // * and /.  We're done!
2700
18
          break;
2701
18
        }
2702
64.7k
      }
2703
4.88M
      if (CurPtr[0] == '*' && 
CurPtr[1] != '/'229
) {
2704
        // If this is a /* inside of the comment, emit a warning.  Don't do this
2705
        // if this is a /*/, which will end the comment.  This misses cases with
2706
        // embedded escaped newlines, but oh well.
2707
10
        if (!isLexingRawMode())
2708
9
          Diag(CurPtr-1, diag::warn_nested_block_comment);
2709
10
      }
2710
4.88M
    } else 
if (14
C == 014
&&
CurPtr == BufferEnd+114
) {
2711
2
      if (!isLexingRawMode())
2712
2
        Diag(BufferPtr, diag::err_unterminated_block_comment);
2713
      // Note: the user probably forgot a */.  We could continue immediately
2714
      // after the /*, but this would involve lexing a lot of what really is the
2715
      // comment, which surely would confuse the parser.
2716
2
      --CurPtr;
2717
2718
      // KeepWhitespaceMode should return this broken comment as a token.  Since
2719
      // it isn't a well formed comment, just return it as an 'unknown' token.
2720
2
      if (isKeepWhitespaceMode()) {
2721
0
        FormTokenWithChars(Result, CurPtr, tok::unknown);
2722
0
        return true;
2723
0
      }
2724
2725
2
      BufferPtr = CurPtr;
2726
2
      return false;
2727
12
    } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
2728
9
      PP->CodeCompleteNaturalLanguage();
2729
9
      cutOffLexing();
2730
9
      return false;
2731
9
    }
2732
2733
4.88M
    C = *CurPtr++;
2734
4.88M
  }
2735
2736
  // Notify comment handlers about the comment unless we're in a #if 0 block.
2737
32.7M
  if (PP && 
!isLexingRawMode()32.7M
&&
2738
32.7M
      PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
2739
30.2M
                                            getSourceLocation(CurPtr)))) {
2740
0
    BufferPtr = CurPtr;
2741
0
    return true; // A token has to be returned.
2742
0
  }
2743
2744
  // If we are returning comments as tokens, return this comment as a token.
2745
32.7M
  if (inKeepCommentMode()) {
2746
2.60k
    FormTokenWithChars(Result, CurPtr, tok::comment);
2747
2.60k
    return true;
2748
2.60k
  }
2749
2750
  // It is common for the tokens immediately after a /**/ comment to be
2751
  // whitespace.  Instead of going through the big switch, handle it
2752
  // efficiently now.  This is safe even in KeepWhitespaceMode because we would
2753
  // have already returned above with the comment as a token.
2754
32.7M
  if (isHorizontalWhitespace(*CurPtr)) {
2755
79.2k
    SkipWhitespace(Result, CurPtr+1, TokAtPhysicalStartOfLine);
2756
79.2k
    return false;
2757
79.2k
  }
2758
2759
  // Otherwise, just return so that the next character will be lexed as a token.
2760
32.6M
  BufferPtr = CurPtr;
2761
32.6M
  Result.setFlag(Token::LeadingSpace);
2762
32.6M
  return false;
2763
32.7M
}
2764
2765
//===----------------------------------------------------------------------===//
2766
// Primary Lexing Entry Points
2767
//===----------------------------------------------------------------------===//
2768
2769
/// ReadToEndOfLine - Read the rest of the current preprocessor line as an
2770
/// uninterpreted string.  This switches the lexer out of directive mode.
2771
38.0k
void Lexer::ReadToEndOfLine(SmallVectorImpl<char> *Result) {
2772
38.0k
  assert(ParsingPreprocessorDirective && ParsingFilename == false &&
2773
38.0k
         "Must be in a preprocessing directive!");
2774
0
  Token Tmp;
2775
38.0k
  Tmp.startToken();
2776
2777
  // CurPtr - Cache BufferPtr in an automatic variable.
2778
38.0k
  const char *CurPtr = BufferPtr;
2779
738k
  while (true) {
2780
738k
    char Char = getAndAdvanceChar(CurPtr, Tmp);
2781
738k
    switch (Char) {
2782
700k
    default:
2783
700k
      if (Result)
2784
700k
        Result->push_back(Char);
2785
700k
      break;
2786
18
    case 0:  // Null.
2787
      // Found end of file?
2788
18
      if (CurPtr-1 != BufferEnd) {
2789
18
        if (isCodeCompletionPoint(CurPtr-1)) {
2790
18
          PP->CodeCompleteNaturalLanguage();
2791
18
          cutOffLexing();
2792
18
          return;
2793
18
        }
2794
2795
        // Nope, normal character, continue.
2796
0
        if (Result)
2797
0
          Result->push_back(Char);
2798
0
        break;
2799
18
      }
2800
      // FALL THROUGH.
2801
18
      
LLVM_FALLTHROUGH0
;0
2802
2
    case '\r':
2803
38.0k
    case '\n':
2804
      // Okay, we found the end of the line. First, back up past the \0, \r, \n.
2805
38.0k
      assert(CurPtr[-1] == Char && "Trigraphs for newline?");
2806
0
      BufferPtr = CurPtr-1;
2807
2808
      // Next, lex the character, which should handle the EOD transition.
2809
38.0k
      Lex(Tmp);
2810
38.0k
      if (Tmp.is(tok::code_completion)) {
2811
0
        if (PP)
2812
0
          PP->CodeCompleteNaturalLanguage();
2813
0
        Lex(Tmp);
2814
0
      }
2815
38.0k
      assert(Tmp.is(tok::eod) && "Unexpected token!");
2816
2817
      // Finally, we're done;
2818
0
      return;
2819
738k
    }
2820
738k
  }
2821
38.0k
}
2822
2823
/// LexEndOfFile - CurPtr points to the end of this file.  Handle this
2824
/// condition, reporting diagnostics and handling other edge cases as required.
2825
/// This returns true if Result contains a token, false if PP.Lex should be
2826
/// called again.
2827
1.58M
bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
2828
  // If we hit the end of the file while parsing a preprocessor directive,
2829
  // end the preprocessor directive first.  The next token returned will
2830
  // then be the end of file.
2831
1.58M
  if (ParsingPreprocessorDirective) {
2832
    // Done parsing the "line".
2833
1.09k
    ParsingPreprocessorDirective = false;
2834
    // Update the location of token as well as BufferPtr.
2835
1.09k
    FormTokenWithChars(Result, CurPtr, tok::eod);
2836
2837
    // Restore comment saving mode, in case it was disabled for directive.
2838
1.09k
    if (PP)
2839
1.09k
      resetExtendedTokenMode();
2840
1.09k
    return true;  // Have a token.
2841
1.09k
  }
2842
2843
  // If we are in raw mode, return this event as an EOF token.  Let the caller
2844
  // that put us in raw mode handle the event.
2845
1.58M
  if (isLexingRawMode()) {
2846
161k
    Result.startToken();
2847
161k
    BufferPtr = BufferEnd;
2848
161k
    FormTokenWithChars(Result, BufferEnd, tok::eof);
2849
161k
    return true;
2850
161k
  }
2851
2852
1.42M
  if (PP->isRecordingPreamble() && 
PP->isInPrimaryFile()257
) {
2853
91
    PP->setRecordedPreambleConditionalStack(ConditionalStack);
2854
    // If the preamble cuts off the end of a header guard, consider it guarded.
2855
    // The guard is valid for the preamble content itself, and for tools the
2856
    // most useful answer is "yes, this file has a header guard".
2857
91
    if (!ConditionalStack.empty())
2858
7
      MIOpt.ExitTopLevelConditional();
2859
91
    ConditionalStack.clear();
2860
91
  }
2861
2862
  // Issue diagnostics for unterminated #if and missing newline.
2863
2864
  // If we are in a #if directive, emit an error.
2865
1.42M
  while (!ConditionalStack.empty()) {
2866
8
    if (PP->getCodeCompletionFileLoc() != FileLoc)
2867
8
      PP->Diag(ConditionalStack.back().IfLoc,
2868
8
               diag::err_pp_unterminated_conditional);
2869
8
    ConditionalStack.pop_back();
2870
8
  }
2871
2872
1.42M
  SourceLocation EndLoc = getSourceLocation(BufferEnd);
2873
  // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue
2874
  // a pedwarn.
2875
1.42M
  if (CurPtr != BufferStart && 
(1.41M
CurPtr[-1] != '\n'1.41M
&&
CurPtr[-1] != '\r'26.6k
)) {
2876
26.6k
    DiagnosticsEngine &Diags = PP->getDiagnostics();
2877
26.6k
    unsigned DiagID;
2878
2879
26.6k
    if (LangOpts.CPlusPlus11) {
2880
      // C++11 [lex.phases] 2.2 p2
2881
      // Prefer the C++98 pedantic compatibility warning over the generic,
2882
      // non-extension, user-requested "missing newline at EOF" warning.
2883
19.8k
      if (!Diags.isIgnored(diag::warn_cxx98_compat_no_newline_eof, EndLoc)) {
2884
2
        DiagID = diag::warn_cxx98_compat_no_newline_eof;
2885
19.8k
      } else {
2886
19.8k
        DiagID = diag::warn_no_newline_eof;
2887
19.8k
      }
2888
19.8k
    } else {
2889
6.77k
      DiagID = diag::ext_no_newline_eof;
2890
6.77k
    }
2891
2892
26.6k
    Diag(BufferEnd, DiagID)
2893
26.6k
      << FixItHint::CreateInsertion(EndLoc, "\n");
2894
26.6k
  }
2895
2896
1.42M
  BufferPtr = CurPtr;
2897
2898
  // Finally, let the preprocessor handle this.
2899
1.42M
  return PP->HandleEndOfFile(Result, EndLoc, isPragmaLexer());
2900
1.58M
}
2901
2902
/// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from
2903
/// the specified lexer will return a tok::l_paren token, 0 if it is something
2904
/// else and 2 if there are no more tokens in the buffer controlled by the
2905
/// lexer.
2906
3.00M
unsigned Lexer::isNextPPTokenLParen() {
2907
3.00M
  assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?");
2908
2909
  // Switch to 'skipping' mode.  This will ensure that we can lex a token
2910
  // without emitting diagnostics, disables macro expansion, and will cause EOF
2911
  // to return an EOF token instead of popping the include stack.
2912
0
  LexingRawMode = true;
2913
2914
  // Save state that can be changed while lexing so that we can restore it.
2915
3.00M
  const char *TmpBufferPtr = BufferPtr;
2916
3.00M
  bool inPPDirectiveMode = ParsingPreprocessorDirective;
2917
3.00M
  bool atStartOfLine = IsAtStartOfLine;
2918
3.00M
  bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
2919
3.00M
  bool leadingSpace = HasLeadingSpace;
2920
2921
3.00M
  Token Tok;
2922
3.00M
  Lex(Tok);
2923
2924
  // Restore state that may have changed.
2925
3.00M
  BufferPtr = TmpBufferPtr;
2926
3.00M
  ParsingPreprocessorDirective = inPPDirectiveMode;
2927
3.00M
  HasLeadingSpace = leadingSpace;
2928
3.00M
  IsAtStartOfLine = atStartOfLine;
2929
3.00M
  IsAtPhysicalStartOfLine = atPhysicalStartOfLine;
2930
2931
  // Restore the lexer back to non-skipping mode.
2932
3.00M
  LexingRawMode = false;
2933
2934
3.00M
  if (Tok.is(tok::eof))
2935
3
    return 2;
2936
3.00M
  return Tok.is(tok::l_paren);
2937
3.00M
}
2938
2939
/// Find the end of a version control conflict marker.
2940
static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd,
2941
10
                                   ConflictMarkerKind CMK) {
2942
10
  const char *Terminator = CMK == CMK_Perforce ? 
"<<<<\n"5
:
">>>>>>>"5
;
2943
10
  size_t TermLen = CMK == CMK_Perforce ? 
55
:
75
;
2944
10
  auto RestOfBuffer = StringRef(CurPtr, BufferEnd - CurPtr).substr(TermLen);
2945
10
  size_t Pos = RestOfBuffer.find(Terminator);
2946
11
  while (Pos != StringRef::npos) {
2947
    // Must occur at start of line.
2948
8
    if (Pos == 0 ||
2949
8
        
(7
RestOfBuffer[Pos - 1] != '\r'7
&&
RestOfBuffer[Pos - 1] != '\n'7
)) {
2950
1
      RestOfBuffer = RestOfBuffer.substr(Pos+TermLen);
2951
1
      Pos = RestOfBuffer.find(Terminator);
2952
1
      continue;
2953
1
    }
2954
7
    return RestOfBuffer.data()+Pos;
2955
8
  }
2956
3
  return nullptr;
2957
10
}
2958
2959
/// IsStartOfConflictMarker - If the specified pointer is the start of a version
2960
/// control conflict marker like '<<<<<<<', recognize it as such, emit an error
2961
/// and recover nicely.  This returns true if it is a conflict marker and false
2962
/// if not.
2963
18.5k
bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
2964
  // Only a conflict marker if it starts at the beginning of a line.
2965
18.5k
  if (CurPtr != BufferStart &&
2966
18.5k
      CurPtr[-1] != '\n' && 
CurPtr[-1] != '\r'18.5k
)
2967
18.5k
    return false;
2968
2969
  // Check to see if we have <<<<<<< or >>>>.
2970
28
  if (!StringRef(CurPtr, BufferEnd - CurPtr).startswith("<<<<<<<") &&
2971
28
      
!StringRef(CurPtr, BufferEnd - CurPtr).startswith(">>>> ")19
)
2972
14
    return false;
2973
2974
  // If we have a situation where we don't care about conflict markers, ignore
2975
  // it.
2976
14
  if (CurrentConflictMarkerState || isLexingRawMode())
2977
9
    return false;
2978
2979
5
  ConflictMarkerKind Kind = *CurPtr == '<' ? 
CMK_Normal3
:
CMK_Perforce2
;
2980
2981
  // Check to see if there is an ending marker somewhere in the buffer at the
2982
  // start of a line to terminate this conflict marker.
2983
5
  if (FindConflictEnd(CurPtr, BufferEnd, Kind)) {
2984
    // We found a match.  We are really in a conflict marker.
2985
    // Diagnose this, and ignore to the end of line.
2986
4
    Diag(CurPtr, diag::err_conflict_marker);
2987
4
    CurrentConflictMarkerState = Kind;
2988
2989
    // Skip ahead to the end of line.  We know this exists because the
2990
    // end-of-conflict marker starts with \r or \n.
2991
76
    while (*CurPtr != '\r' && *CurPtr != '\n') {
2992
72
      assert(CurPtr != BufferEnd && "Didn't find end of line");
2993
0
      ++CurPtr;
2994
72
    }
2995
4
    BufferPtr = CurPtr;
2996
4
    return true;
2997
4
  }
2998
2999
  // No end of conflict marker found.
3000
1
  return false;
3001
5
}
3002
3003
/// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if
3004
/// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it
3005
/// is the end of a conflict marker.  Handle it by ignoring up until the end of
3006
/// the line.  This returns true if it is a conflict marker and false if not.
3007
18.6k
bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) {
3008
  // Only a conflict marker if it starts at the beginning of a line.
3009
18.6k
  if (CurPtr != BufferStart &&
3010
18.6k
      CurPtr[-1] != '\n' && 
CurPtr[-1] != '\r'18.5k
)
3011
18.5k
    return false;
3012
3013
  // If we have a situation where we don't care about conflict markers, ignore
3014
  // it.
3015
45
  if (!CurrentConflictMarkerState || 
isLexingRawMode()5
)
3016
40
    return false;
3017
3018
  // Check to see if we have the marker (4 characters in a row).
3019
20
  
for (unsigned i = 1; 5
i != 4;
++i15
)
3020
15
    if (CurPtr[i] != CurPtr[0])
3021
0
      return false;
3022
3023
  // If we do have it, search for the end of the conflict marker.  This could
3024
  // fail if it got skipped with a '#if 0' or something.  Note that CurPtr might
3025
  // be the end of conflict marker.
3026
5
  if (const char *End = FindConflictEnd(CurPtr, BufferEnd,
3027
5
                                        CurrentConflictMarkerState)) {
3028
3
    CurPtr = End;
3029
3030
    // Skip ahead to the end of line.
3031
37
    while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n')
3032
34
      ++CurPtr;
3033
3034
3
    BufferPtr = CurPtr;
3035
3036
    // No longer in the conflict marker.
3037
3
    CurrentConflictMarkerState = CMK_None;
3038
3
    return true;
3039
3
  }
3040
3041
2
  return false;
3042
5
}
3043
3044
static const char *findPlaceholderEnd(const char *CurPtr,
3045
43
                                      const char *BufferEnd) {
3046
43
  if (CurPtr == BufferEnd)
3047
0
    return nullptr;
3048
43
  BufferEnd -= 1; // Scan until the second last character.
3049
404
  for (; CurPtr != BufferEnd; 
++CurPtr361
) {
3050
404
    if (CurPtr[0] == '#' && 
CurPtr[1] == '>'43
)
3051
43
      return CurPtr + 2;
3052
404
  }
3053
0
  return nullptr;
3054
43
}
3055
3056
45
bool Lexer::lexEditorPlaceholder(Token &Result, const char *CurPtr) {
3057
45
  assert(CurPtr[-1] == '<' && CurPtr[0] == '#' && "Not a placeholder!");
3058
45
  if (!PP || !PP->getPreprocessorOpts().LexEditorPlaceholders || 
LexingRawMode44
)
3059
2
    return false;
3060
43
  const char *End = findPlaceholderEnd(CurPtr + 1, BufferEnd);
3061
43
  if (!End)
3062
0
    return false;
3063
43
  const char *Start = CurPtr - 1;
3064
43
  if (!LangOpts.AllowEditorPlaceholders)
3065
22
    Diag(Start, diag::err_placeholder_in_source);
3066
43
  Result.startToken();
3067
43
  FormTokenWithChars(Result, End, tok::raw_identifier);
3068
43
  Result.setRawIdentifierData(Start);
3069
43
  PP->LookUpIdentifierInfo(Result);
3070
43
  Result.setFlag(Token::IsEditorPlaceholder);
3071
43
  BufferPtr = End;
3072
43
  return true;
3073
43
}
3074
3075
616M
bool Lexer::isCodeCompletionPoint(const char *CurPtr) const {
3076
616M
  if (
PP616M
&& PP->isCodeCompletionEnabled()) {
3077
1.05M
    SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart);
3078
1.05M
    return Loc == PP->getCodeCompletionLoc();
3079
1.05M
  }
3080
3081
615M
  return false;
3082
616M
}
3083
3084
uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
3085
1.29k
                           Token *Result) {
3086
1.29k
  unsigned CharSize;
3087
1.29k
  char Kind = getCharAndSize(StartPtr, CharSize);
3088
3089
1.29k
  unsigned NumHexDigits;
3090
1.29k
  if (Kind == 'u')
3091
218
    NumHexDigits = 4;
3092
1.08k
  else if (Kind == 'U')
3093
36
    NumHexDigits = 8;
3094
1.04k
  else
3095
1.04k
    return 0;
3096
3097
254
  if (!LangOpts.CPlusPlus && 
!LangOpts.C99125
) {
3098
5
    if (Result && 
!isLexingRawMode()3
)
3099
3
      Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89);
3100
5
    return 0;
3101
5
  }
3102
3103
249
  const char *CurPtr = StartPtr + CharSize;
3104
249
  const char *KindLoc = &CurPtr[-1];
3105
3106
249
  uint32_t CodePoint = 0;
3107
1.32k
  for (unsigned i = 0; i < NumHexDigits; 
++i1.07k
) {
3108
1.09k
    char C = getCharAndSize(CurPtr, CharSize);
3109
3110
1.09k
    unsigned Value = llvm::hexDigitValue(C);
3111
1.09k
    if (Value == -1U) {
3112
21
      if (Result && 
!isLexingRawMode()18
) {
3113
18
        if (i == 0) {
3114
6
          Diag(BufferPtr, diag::warn_ucn_escape_no_digits)
3115
6
            << StringRef(KindLoc, 1);
3116
12
        } else {
3117
12
          Diag(BufferPtr, diag::warn_ucn_escape_incomplete);
3118
3119
          // If the user wrote \U1234, suggest a fixit to \u.
3120
12
          if (i == 4 && 
NumHexDigits == 83
) {
3121
3
            CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1);
3122
3
            Diag(KindLoc, diag::note_ucn_four_not_eight)
3123
3
              << FixItHint::CreateReplacement(URange, "u");
3124
3
          }
3125
12
        }
3126
18
      }
3127
3128
21
      return 0;
3129
21
    }
3130
3131
1.07k
    CodePoint <<= 4;
3132
1.07k
    CodePoint += Value;
3133
3134
1.07k
    CurPtr += CharSize;
3135
1.07k
  }
3136
3137
228
  if (Result) {
3138
108
    Result->setFlag(Token::HasUCN);
3139
108
    if (CurPtr - StartPtr == (ptrdiff_t)NumHexDigits + 2)
3140
0
      StartPtr = CurPtr;
3141
108
    else
3142
712
      
while (108
StartPtr != CurPtr)
3143
604
        (void)getAndAdvanceChar(StartPtr, *Result);
3144
120
  } else {
3145
120
    StartPtr = CurPtr;
3146
120
  }
3147
3148
  // Don't apply C family restrictions to UCNs in assembly mode
3149
228
  if (LangOpts.AsmPreprocessor)
3150
2
    return CodePoint;
3151
3152
  // C99 6.4.3p2: A universal character name shall not specify a character whose
3153
  //   short identifier is less than 00A0 other than 0024 ($), 0040 (@), or
3154
  //   0060 (`), nor one in the range D800 through DFFF inclusive.)
3155
  // C++11 [lex.charset]p2: If the hexadecimal value for a
3156
  //   universal-character-name corresponds to a surrogate code point (in the
3157
  //   range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally,
3158
  //   if the hexadecimal value for a universal-character-name outside the
3159
  //   c-char-sequence, s-char-sequence, or r-char-sequence of a character or
3160
  //   string literal corresponds to a control character (in either of the
3161
  //   ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the
3162
  //   basic source character set, the program is ill-formed.
3163
226
  if (CodePoint < 0xA0) {
3164
27
    if (CodePoint == 0x24 || 
CodePoint == 0x4024
||
CodePoint == 0x6024
)
3165
3
      return CodePoint;
3166
3167
    // We don't use isLexingRawMode() here because we need to warn about bad
3168
    // UCNs even when skipping preprocessing tokens in a #if block.
3169
24
    if (Result && 
PP23
) {
3170
23
      if (CodePoint < 0x20 || 
CodePoint >= 0x7F15
)
3171
16
        Diag(BufferPtr, diag::err_ucn_control_character);
3172
7
      else {
3173
7
        char C = static_cast<char>(CodePoint);
3174
7
        Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1);
3175
7
      }
3176
23
    }
3177
3178
24
    return 0;
3179
199
  } else if (CodePoint >= 0xD800 && 
CodePoint <= 0xDFFF22
) {
3180
    // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't.
3181
    // We don't use isLexingRawMode() here because we need to diagnose bad
3182
    // UCNs even when skipping preprocessing tokens in a #if block.
3183
4
    if (Result && PP) {
3184
4
      if (LangOpts.CPlusPlus && 
!LangOpts.CPlusPlus112
)
3185
1
        Diag(BufferPtr, diag::warn_ucn_escape_surrogate);
3186
3
      else
3187
3
        Diag(BufferPtr, diag::err_ucn_escape_invalid);
3188
4
    }
3189
4
    return 0;
3190
4
  }
3191
3192
195
  return CodePoint;
3193
226
}
3194
3195
bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C,
3196
208
                                   const char *CurPtr) {
3197
208
  if (!isLexingRawMode() && 
!PP->isPreprocessedOutput()152
&&
3198
208
      
isUnicodeWhitespace(C)124
) {
3199
6
    Diag(BufferPtr, diag::ext_unicode_whitespace)
3200
6
      << makeCharRange(*this, BufferPtr, CurPtr);
3201
3202
6
    Result.setFlag(Token::LeadingSpace);
3203
6
    return true;
3204
6
  }
3205
202
  return false;
3206
208
}
3207
3208
202
bool Lexer::LexUnicode(Token &Result, uint32_t C, const char *CurPtr) {
3209
202
  if (isAllowedInitiallyIDChar(C, LangOpts)) {
3210
124
    if (!isLexingRawMode() && 
!ParsingPreprocessorDirective97
&&
3211
124
        
!PP->isPreprocessedOutput()82
) {
3212
73
      maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C,
3213
73
                                makeCharRange(*this, BufferPtr, CurPtr),
3214
73
                                /*IsFirst=*/true);
3215
73
      maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), C,
3216
73
                                 makeCharRange(*this, BufferPtr, CurPtr));
3217
73
    }
3218
3219
124
    MIOpt.ReadToken();
3220
124
    return LexIdentifier(Result, CurPtr);
3221
124
  }
3222
3223
78
  if (!isLexingRawMode() && 
!ParsingPreprocessorDirective49
&&
3224
78
      
!PP->isPreprocessedOutput()37
&&
!isASCII(*BufferPtr)20
&&
3225
78
      
!isAllowedInitiallyIDChar(C, LangOpts)13
&&
!isUnicodeWhitespace(C)13
) {
3226
    // Non-ASCII characters tend to creep into source code unintentionally.
3227
    // Instead of letting the parser complain about the unknown token,
3228
    // just drop the character.
3229
    // Note that we can /only/ do this when the non-ASCII character is actually
3230
    // spelled as Unicode, not written as a UCN. The standard requires that
3231
    // we not throw away any possible preprocessor tokens, but there's a
3232
    // loophole in the mapping of Unicode characters to basic character set
3233
    // characters that allows us to map these particular characters to, say,
3234
    // whitespace.
3235
13
    diagnoseInvalidUnicodeCodepointInIdentifier(
3236
13
        PP->getDiagnostics(), LangOpts, C,
3237
13
        makeCharRange(*this, BufferPtr, CurPtr), /*IsStart*/ true);
3238
13
    BufferPtr = CurPtr;
3239
13
    return false;
3240
13
  }
3241
3242
  // Otherwise, we have an explicit UCN or a character that's unlikely to show
3243
  // up by accident.
3244
65
  MIOpt.ReadToken();
3245
65
  FormTokenWithChars(Result, CurPtr, tok::unknown);
3246
65
  return true;
3247
78
}
3248
3249
36.7M
void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) {
3250
36.7M
  IsAtStartOfLine = Result.isAtStartOfLine();
3251
36.7M
  HasLeadingSpace = Result.hasLeadingSpace();
3252
36.7M
  HasLeadingEmptyMacro = Result.hasLeadingEmptyMacro();
3253
  // Note that this doesn't affect IsAtPhysicalStartOfLine.
3254
36.7M
}
3255
3256
2.43G
bool Lexer::Lex(Token &Result) {
3257
  // Start a new token.
3258
2.43G
  Result.startToken();
3259
3260
  // Set up misc whitespace flags for LexTokenInternal.
3261
2.43G
  if (IsAtStartOfLine) {
3262
133M
    Result.setFlag(Token::StartOfLine);
3263
133M
    IsAtStartOfLine = false;
3264
133M
  }
3265
3266
2.43G
  if (HasLeadingSpace) {
3267
557k
    Result.setFlag(Token::LeadingSpace);
3268
557k
    HasLeadingSpace = false;
3269
557k
  }
3270
3271
2.43G
  if (HasLeadingEmptyMacro) {
3272
774k
    Result.setFlag(Token::LeadingEmptyMacro);
3273
774k
    HasLeadingEmptyMacro = false;
3274
774k
  }
3275
3276
2.43G
  bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
3277
2.43G
  IsAtPhysicalStartOfLine = false;
3278
2.43G
  bool isRawLex = isLexingRawMode();
3279
2.43G
  (void) isRawLex;
3280
2.43G
  bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine);
3281
  // (After the LexTokenInternal call, the lexer might be destroyed.)
3282
2.43G
  assert((returnedToken || !isRawLex) && "Raw lex must succeed");
3283
0
  return returnedToken;
3284
2.43G
}
3285
3286
/// LexTokenInternal - This implements a simple C family lexer.  It is an
3287
/// extremely performance critical piece of code.  This assumes that the buffer
3288
/// has a null character at the end of the file.  This returns a preprocessing
3289
/// token, not a normal token, as such, it is an internal interface.  It assumes
3290
/// that the Flags of result have been cleared before calling this.
3291
2.43G
bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) {
3292
2.66G
LexNextToken:
3293
  // New token, can't need cleaning yet.
3294
2.66G
  Result.clearFlag(Token::NeedsCleaning);
3295
2.66G
  Result.setIdentifierInfo(nullptr);
3296
3297
  // CurPtr - Cache BufferPtr in an automatic variable.
3298
2.66G
  const char *CurPtr = BufferPtr;
3299
3300
  // Small amounts of horizontal whitespace is very common between tokens.
3301
2.66G
  if (isHorizontalWhitespace(*CurPtr)) {
3302
923M
    do {
3303
923M
      ++CurPtr;
3304
923M
    } while (isHorizontalWhitespace(*CurPtr));
3305
3306
    // If we are keeping whitespace and other tokens, just return what we just
3307
    // skipped.  The next lexer invocation will return the token after the
3308
    // whitespace.
3309
695M
    if (isKeepWhitespaceMode()) {
3310
281k
      FormTokenWithChars(Result, CurPtr, tok::unknown);
3311
      // FIXME: The next token will not have LeadingSpace set.
3312
281k
      return true;
3313
281k
    }
3314
3315
695M
    BufferPtr = CurPtr;
3316
695M
    Result.setFlag(Token::LeadingSpace);
3317
695M
  }
3318
3319
2.66G
  unsigned SizeTmp, SizeTmp2;   // Temporaries for use in cases below.
3320
3321
  // Read a character, advancing over it.
3322
2.66G
  char Char = getAndAdvanceChar(CurPtr, Result);
3323
2.66G
  tok::TokenKind Kind;
3324
3325
2.66G
  if (!isVerticalWhitespace(Char))
3326
2.39G
    NewLinePtr = nullptr;
3327
3328
2.66G
  switch (Char) {
3329
1.58M
  case 0:  // Null.
3330
    // Found end of file?
3331
1.58M
    if (CurPtr-1 == BufferEnd)
3332
1.58M
      return LexEndOfFile(Result, CurPtr-1);
3333
3334
    // Check if we are performing code completion.
3335
1.12k
    if (isCodeCompletionPoint(CurPtr-1)) {
3336
      // Return the code-completion token.
3337
1.11k
      Result.startToken();
3338
1.11k
      FormTokenWithChars(Result, CurPtr, tok::code_completion);
3339
1.11k
      return true;
3340
1.11k
    }
3341
3342
5
    if (!isLexingRawMode())
3343
2
      Diag(CurPtr-1, diag::null_in_file);
3344
5
    Result.setFlag(Token::LeadingSpace);
3345
5
    if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3346
0
      return true; // KeepWhitespaceMode
3347
3348
    // We know the lexer hasn't changed, so just try again with this lexer.
3349
    // (We manually eliminate the tail call to avoid recursion.)
3350
5
    goto LexNextToken;
3351
3352
5
  case 26:  // DOS & CP/M EOF: "^Z".
3353
    // If we're in Microsoft extensions mode, treat this as end of file.
3354
1
    if (LangOpts.MicrosoftExt) {
3355
1
      if (!isLexingRawMode())
3356
1
        Diag(CurPtr-1, diag::ext_ctrl_z_eof_microsoft);
3357
1
      return LexEndOfFile(Result, CurPtr-1);
3358
1
    }
3359
3360
    // If Microsoft extensions are disabled, this is just random garbage.
3361
0
    Kind = tok::unknown;
3362
0
    break;
3363
3364
2.55k
  case '\r':
3365
2.55k
    if (CurPtr[0] == '\n')
3366
2.54k
      (void)getAndAdvanceChar(CurPtr, Result);
3367
2.55k
    LLVM_FALLTHROUGH;
3368
266M
  case '\n':
3369
    // If we are inside a preprocessor directive and we see the end of line,
3370
    // we know we are done with the directive, so return an EOD token.
3371
266M
    if (ParsingPreprocessorDirective) {
3372
      // Done parsing the "line".
3373
77.4M
      ParsingPreprocessorDirective = false;
3374
3375
      // Restore comment saving mode, in case it was disabled for directive.
3376
77.4M
      if (PP)
3377
77.4M
        resetExtendedTokenMode();
3378
3379
      // Since we consumed a newline, we are back at the start of a line.
3380
77.4M
      IsAtStartOfLine = true;
3381
77.4M
      IsAtPhysicalStartOfLine = true;
3382
77.4M
      NewLinePtr = CurPtr - 1;
3383
3384
77.4M
      Kind = tok::eod;
3385
77.4M
      break;
3386
77.4M
    }
3387
3388
    // No leading whitespace seen so far.
3389
189M
    Result.clearFlag(Token::LeadingSpace);
3390
3391
189M
    if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3392
68.2k
      return true; // KeepWhitespaceMode
3393
3394
    // We only saw whitespace, so just try again with this lexer.
3395
    // (We manually eliminate the tail call to avoid recursion.)
3396
189M
    goto LexNextToken;
3397
189M
  case ' ':
3398
5.33M
  case '\t':
3399
5.33M
  case '\f':
3400
5.33M
  case '\v':
3401
7.11M
  SkipHorizontalWhitespace:
3402
7.11M
    Result.setFlag(Token::LeadingSpace);
3403
7.11M
    if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3404
746
      return true; // KeepWhitespaceMode
3405
3406
53.0M
  SkipIgnoredUnits:
3407
53.0M
    CurPtr = BufferPtr;
3408
3409
    // If the next token is obviously a // or /* */ comment, skip it efficiently
3410
    // too (without going through the big switch stmt).
3411
53.0M
    if (CurPtr[0] == '/' && 
CurPtr[1] == '/'41.8M
&&
!inKeepCommentMode()41.7M
&&
3412
53.0M
        
LangOpts.LineComment41.7M
&&
3413
53.0M
        
(41.7M
LangOpts.CPlusPlus41.7M
||
!LangOpts.TraditionalCPP7.97M
)) {
3414
41.7M
      if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
3415
0
        return true; // There is a token to return.
3416
41.7M
      goto SkipIgnoredUnits;
3417
41.7M
    } else 
if (11.2M
CurPtr[0] == '/'11.2M
&&
CurPtr[1] == '*'7.39k
&&
!inKeepCommentMode()4.71k
) {
3418
4.71k
      if (SkipBlockComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
3419
0
        return true; // There is a token to return.
3420
4.71k
      goto SkipIgnoredUnits;
3421
11.2M
    } else if (isHorizontalWhitespace(*CurPtr)) {
3422
1.78M
      goto SkipHorizontalWhitespace;
3423
1.78M
    }
3424
    // We only saw whitespace, so just try again with this lexer.
3425
    // (We manually eliminate the tail call to avoid recursion.)
3426
9.47M
    goto LexNextToken;
3427
3428
  // C99 6.4.4.1: Integer Constants.
3429
  // C99 6.4.4.2: Floating Constants.
3430
49.2M
  
case '0': 7.79M
case '1': 30.4M
case '2': 40.4M
case '3': 45.3M
case '4':
3431
60.0M
  
case '5': 51.1M
case '6': 53.9M
case '7': 55.2M
case '8': 57.3M
case '9':
3432
    // Notify MIOpt that we read a non-whitespace/non-comment token.
3433
60.0M
    MIOpt.ReadToken();
3434
60.0M
    return LexNumericConstant(Result, CurPtr);
3435
3436
22.7M
  case 'u':   // Identifier (uber) or C11/C++11 UTF-8 or UTF-16 string literal
3437
    // Notify MIOpt that we read a non-whitespace/non-comment token.
3438
22.7M
    MIOpt.ReadToken();
3439
3440
22.7M
    if (LangOpts.CPlusPlus11 || 
LangOpts.C1115.7M
) {
3441
22.0M
      Char = getCharAndSize(CurPtr, SizeTmp);
3442
3443
      // UTF-16 string literal
3444
22.0M
      if (Char == '"')
3445
156
        return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3446
156
                                tok::utf16_string_literal);
3447
3448
      // UTF-16 character constant
3449
22.0M
      if (Char == '\'')
3450
127
        return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3451
127
                               tok::utf16_char_constant);
3452
3453
      // UTF-16 raw string literal
3454
22.0M
      if (Char == 'R' && 
LangOpts.CPlusPlus1163
&&
3455
22.0M
          
getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"'55
)
3456
53
        return LexRawStringLiteral(Result,
3457
53
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3458
53
                                           SizeTmp2, Result),
3459
53
                               tok::utf16_string_literal);
3460
3461
22.0M
      if (Char == '8') {
3462
4.69k
        char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2);
3463
3464
        // UTF-8 string literal
3465
4.69k
        if (Char2 == '"')
3466
329
          return LexStringLiteral(Result,
3467
329
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3468
329
                                           SizeTmp2, Result),
3469
329
                               tok::utf8_string_literal);
3470
4.37k
        if (Char2 == '\'' && 
LangOpts.CPlusPlus17175
)
3471
163
          return LexCharConstant(
3472
163
              Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3473
163
                                  SizeTmp2, Result),
3474
163
              tok::utf8_char_constant);
3475
3476
4.20k
        if (Char2 == 'R' && 
LangOpts.CPlusPlus1135
) {
3477
27
          unsigned SizeTmp3;
3478
27
          char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
3479
          // UTF-8 raw string literal
3480
27
          if (Char3 == '"') {
3481
25
            return LexRawStringLiteral(Result,
3482
25
                   ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3483
25
                                           SizeTmp2, Result),
3484
25
                               SizeTmp3, Result),
3485
25
                   tok::utf8_string_literal);
3486
25
          }
3487
27
        }
3488
4.20k
      }
3489
22.0M
    }
3490
3491
    // treat u like the start of an identifier.
3492
22.7M
    return LexIdentifier(Result, CurPtr);
3493
3494
1.96M
  case 'U':   // Identifier (Uber) or C11/C++11 UTF-32 string literal
3495
    // Notify MIOpt that we read a non-whitespace/non-comment token.
3496
1.96M
    MIOpt.ReadToken();
3497
3498
1.96M
    if (LangOpts.CPlusPlus11 || 
LangOpts.C111.29M
) {
3499
1.89M
      Char = getCharAndSize(CurPtr, SizeTmp);
3500
3501
      // UTF-32 string literal
3502
1.89M
      if (Char == '"')
3503
153
        return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3504
153
                                tok::utf32_string_literal);
3505
3506
      // UTF-32 character constant
3507
1.89M
      if (Char == '\'')
3508
117
        return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3509
117
                               tok::utf32_char_constant);
3510
3511
      // UTF-32 raw string literal
3512
1.89M
      if (Char == 'R' && 
LangOpts.CPlusPlus1131.3k
&&
3513
1.89M
          
getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"'3.76k
)
3514
27
        return LexRawStringLiteral(Result,
3515
27
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3516
27
                                           SizeTmp2, Result),
3517
27
                               tok::utf32_string_literal);
3518
1.89M
    }
3519
3520
    // treat U like the start of an identifier.
3521
1.96M
    return LexIdentifier(Result, CurPtr);
3522
3523
696k
  case 'R': // Identifier or C++0x raw string literal
3524
    // Notify MIOpt that we read a non-whitespace/non-comment token.
3525
696k
    MIOpt.ReadToken();
3526
3527
696k
    if (LangOpts.CPlusPlus11) {
3528
186k
      Char = getCharAndSize(CurPtr, SizeTmp);
3529
3530
186k
      if (Char == '"')
3531
406
        return LexRawStringLiteral(Result,
3532
406
                                   ConsumeChar(CurPtr, SizeTmp, Result),
3533
406
                                   tok::string_literal);
3534
186k
    }
3535
3536
    // treat R like the start of an identifier.
3537
696k
    return LexIdentifier(Result, CurPtr);
3538
3539
672k
  case 'L':   // Identifier (Loony) or wide literal (L'x' or L"xyz").
3540
    // Notify MIOpt that we read a non-whitespace/non-comment token.
3541
672k
    MIOpt.ReadToken();
3542
672k
    Char = getCharAndSize(CurPtr, SizeTmp);
3543
3544
    // Wide string literal.
3545
672k
    if (Char == '"')
3546
1.56k
      return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3547
1.56k
                              tok::wide_string_literal);
3548
3549
    // Wide raw string literal.
3550
670k
    if (LangOpts.CPlusPlus11 && 
Char == 'R'356k
&&
3551
670k
        
getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"'88
)
3552
19
      return LexRawStringLiteral(Result,
3553
19
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3554
19
                                           SizeTmp2, Result),
3555
19
                               tok::wide_string_literal);
3556
3557
    // Wide character constant.
3558
670k
    if (Char == '\'')
3559
1.26k
      return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3560
1.26k
                             tok::wide_char_constant);
3561
    // FALL THROUGH, treating L like the start of an identifier.
3562
670k
    
LLVM_FALLTHROUGH669k
;669k
3563
3564
  // C99 6.4.2: Identifiers.
3565
17.6M
  
case 'A': 4.31M
case 'B': 5.44M
case 'C': 13.8M
case 'D': 15.4M
case 'E': 16.1M
case 'F': 17.1M
case 'G':
3566
26.8M
  
case 'H': 18.1M
case 'I': 20.5M
case 'J': 20.5M
case 'K': /*'L'*/21.0M
case 'M': 22.6M
case 'N':
3567
32.7M
  
case 'O': 28.1M
case 'P': 29.1M
case 'Q': /*'R'*/29.2M
case 'S': 31.3M
case 'T': /*'U'*/
3568
33.9M
  
case 'V': 33.2M
case 'W': 33.5M
case 'X': 33.8M
case 'Y': 33.9M
case 'Z':
3569
177M
  
case 'a': 42.5M
case 'b': 46.6M
case 'c': 77.5M
case 'd': 147M
case 'e': 167M
case 'f': 176M
case 'g':
3570
251M
  
case 'h': 179M
case 'i': 230M
case 'j': 230M
case 'k': 235M
case 'l': 239M
case 'm': 249M
case 'n':
3571
570M
  
case 'o': 396M
case 'p': 403M
case 'q': 403M
case 'r': 410M
case 's': 555M
case 't': /*'u'*/
3572
678M
  
case 'v': 674M
case 'w': 676M
case 'x': 678M
case 'y': 678M
case 'z':
3573
1.00G
  case '_':
3574
    // Notify MIOpt that we read a non-whitespace/non-comment token.
3575
1.00G
    MIOpt.ReadToken();
3576
1.00G
    return LexIdentifier(Result, CurPtr);
3577
3578
57.5k
  case '$':   // $ in identifiers.
3579
57.5k
    if (LangOpts.DollarIdents) {
3580
57.4k
      if (!isLexingRawMode())
3581
56.5k
        Diag(CurPtr-1, diag::ext_dollar_in_identifier);
3582
      // Notify MIOpt that we read a non-whitespace/non-comment token.
3583
57.4k
      MIOpt.ReadToken();
3584
57.4k
      return LexIdentifier(Result, CurPtr);
3585
57.4k
    }
3586
3587
13
    Kind = tok::unknown;
3588
13
    break;
3589
3590
  // C99 6.4.4: Character Constants.
3591
641k
  case '\'':
3592
    // Notify MIOpt that we read a non-whitespace/non-comment token.
3593
641k
    MIOpt.ReadToken();
3594
641k
    return LexCharConstant(Result, CurPtr, tok::char_constant);
3595
3596
  // C99 6.4.5: String Literals.
3597
13.3M
  case '"':
3598
    // Notify MIOpt that we read a non-whitespace/non-comment token.
3599
13.3M
    MIOpt.ReadToken();
3600
13.3M
    return LexStringLiteral(Result, CurPtr,
3601
13.3M
                            ParsingFilename ? 
tok::header_name68.1k
3602
13.3M
                                            : 
tok::string_literal13.2M
);
3603
3604
  // C99 6.4.6: Punctuators.
3605
191k
  case '?':
3606
191k
    Kind = tok::question;
3607
191k
    break;
3608
2.57M
  case '[':
3609
2.57M
    Kind = tok::l_square;
3610
2.57M
    break;
3611
2.58M
  case ']':
3612
2.58M
    Kind = tok::r_square;
3613
2.58M
    break;
3614
339M
  case '(':
3615
339M
    Kind = tok::l_paren;
3616
339M
    break;
3617
382M
  case ')':
3618
382M
    Kind = tok::r_paren;
3619
382M
    break;
3620
11.8M
  case '{':
3621
11.8M
    Kind = tok::l_brace;
3622
11.8M
    break;
3623
11.8M
  case '}':
3624
11.8M
    Kind = tok::r_brace;
3625
11.8M
    break;
3626
5.89M
  case '.':
3627
5.89M
    Char = getCharAndSize(CurPtr, SizeTmp);
3628
5.89M
    if (Char >= '0' && 
Char <= '9'3.49M
) {
3629
      // Notify MIOpt that we read a non-whitespace/non-comment token.
3630
1.24k
      MIOpt.ReadToken();
3631
3632
1.24k
      return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
3633
5.88M
    } else if (LangOpts.CPlusPlus && 
Char == '*'3.80M
) {
3634
82.7k
      Kind = tok::periodstar;
3635
82.7k
      CurPtr += SizeTmp;
3636
5.80M
    } else if (Char == '.' &&
3637
5.80M
               
getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.'2.12M
) {
3638
2.12M
      Kind = tok::ellipsis;
3639
2.12M
      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3640
2.12M
                           SizeTmp2, Result);
3641
3.68M
    } else {
3642
3.68M
      Kind = tok::period;
3643
3.68M
    }
3644
5.88M
    break;
3645
5.88M
  case '&':
3646
4.59M
    Char = getCharAndSize(CurPtr, SizeTmp);
3647
4.59M
    if (Char == '&') {
3648
1.85M
      Kind = tok::ampamp;
3649
1.85M
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3650
2.73M
    } else if (Char == '=') {
3651
19.8k
      Kind = tok::ampequal;
3652
19.8k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3653
2.71M
    } else {
3654
2.71M
      Kind = tok::amp;
3655
2.71M
    }
3656
4.59M
    break;
3657
34.2M
  case '*':
3658
34.2M
    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
3659
9.24k
      Kind = tok::starequal;
3660
9.24k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3661
34.2M
    } else {
3662
34.2M
      Kind = tok::star;
3663
34.2M
    }
3664
34.2M
    break;
3665
1.75M
  case '+':
3666
1.75M
    Char = getCharAndSize(CurPtr, SizeTmp);
3667
1.75M
    if (Char == '+') {
3668
610k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3669
610k
      Kind = tok::plusplus;
3670
1.14M
    } else if (Char == '=') {
3671
113k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3672
113k
      Kind = tok::plusequal;
3673
1.03M
    } else {
3674
1.03M
      Kind = tok::plus;
3675
1.03M
    }
3676
1.75M
    break;
3677
4.07M
  case '-':
3678
4.07M
    Char = getCharAndSize(CurPtr, SizeTmp);
3679
4.07M
    if (Char == '-') {      // --
3680
116k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3681
116k
      Kind = tok::minusminus;
3682
3.95M
    } else if (Char == '>' && 
LangOpts.CPlusPlus880k
&&
3683
3.95M
               
getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*'778k
) { // C++ ->*
3684
2.13k
      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3685
2.13k
                           SizeTmp2, Result);
3686
2.13k
      Kind = tok::arrowstar;
3687
3.95M
    } else if (Char == '>') {   // ->
3688
878k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3689
878k
      Kind = tok::arrow;
3690
3.07M
    } else if (Char == '=') {   // -=
3691
57.8k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3692
57.8k
      Kind = tok::minusequal;
3693
3.01M
    } else {
3694
3.01M
      Kind = tok::minus;
3695
3.01M
    }
3696
4.07M
    break;
3697
223k
  case '~':
3698
223k
    Kind = tok::tilde;
3699
223k
    break;
3700
1.80M
  case '!':
3701
1.80M
    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
3702
287k
      Kind = tok::exclaimequal;
3703
287k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3704
1.51M
    } else {
3705
1.51M
      Kind = tok::exclaim;
3706
1.51M
    }
3707
1.80M
    break;
3708
37.4M
  case '/':
3709
    // 6.4.9: Comments
3710
37.4M
    Char = getCharAndSize(CurPtr, SizeTmp);
3711
37.4M
    if (Char == '/') {         // Line comment.
3712
      // Even if Line comments are disabled (e.g. in C89 mode), we generally
3713
      // want to lex this as a comment.  There is one problem with this though,
3714
      // that in one particular corner case, this can change the behavior of the
3715
      // resultant program.  For example, In  "foo //**/ bar", C89 would lex
3716
      // this as "foo / bar" and languages with Line comments would lex it as
3717
      // "foo".  Check to see if the character after the second slash is a '*'.
3718
      // If so, we will lex that as a "/" instead of the start of a comment.
3719
      // However, we never do this if we are just preprocessing.
3720
4.18M
      bool TreatAsComment = LangOpts.LineComment &&
3721
4.18M
                            
(4.17M
LangOpts.CPlusPlus4.17M
||
!LangOpts.TraditionalCPP1.53M
);
3722
4.18M
      if (!TreatAsComment)
3723
7.24k
        if (!(PP && 
PP->isPreprocessedOutput()6.96k
))
3724
7.15k
          TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*';
3725
3726
4.18M
      if (TreatAsComment) {
3727
4.18M
        if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3728
4.18M
                            TokAtPhysicalStartOfLine))
3729
43.4k
          return true; // There is a token to return.
3730
3731
        // It is common for the tokens immediately after a // comment to be
3732
        // whitespace (indentation for the next line).  Instead of going through
3733
        // the big switch, handle it efficiently now.
3734
4.14M
        goto SkipIgnoredUnits;
3735
4.18M
      }
3736
4.18M
    }
3737
3738
33.3M
    if (Char == '*') {  // /**/ comment.
3739
32.7M
      if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3740
32.7M
                           TokAtPhysicalStartOfLine))
3741
2.60k
        return true; // There is a token to return.
3742
3743
      // We only saw whitespace, so just try again with this lexer.
3744
      // (We manually eliminate the tail call to avoid recursion.)
3745
32.7M
      goto LexNextToken;
3746
32.7M
    }
3747
3748
593k
    if (Char == '=') {
3749
5.78k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3750
5.78k
      Kind = tok::slashequal;
3751
587k
    } else {
3752
587k
      Kind = tok::slash;
3753
587k
    }
3754
593k
    break;
3755
34.9k
  case '%':
3756
34.9k
    Char = getCharAndSize(CurPtr, SizeTmp);
3757
34.9k
    if (Char == '=') {
3758
3.31k
      Kind = tok::percentequal;
3759
3.31k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3760
31.6k
    } else if (LangOpts.Digraphs && 
Char == '>'27.6k
) {
3761
10
      Kind = tok::r_brace;                             // '%>' -> '}'
3762
10
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3763
31.6k
    } else if (LangOpts.Digraphs && 
Char == ':'27.6k
) {
3764
15
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3765
15
      Char = getCharAndSize(CurPtr, SizeTmp);
3766
15
      if (Char == '%' && 
getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':'0
) {
3767
0
        Kind = tok::hashhash;                          // '%:%:' -> '##'
3768
0
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3769
0
                             SizeTmp2, Result);
3770
15
      } else if (Char == '@' && 
LangOpts.MicrosoftExt0
) {// %:@ -> #@ -> Charize
3771
0
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3772
0
        if (!isLexingRawMode())
3773
0
          Diag(BufferPtr, diag::ext_charize_microsoft);
3774
0
        Kind = tok::hashat;
3775
15
      } else {                                         // '%:' -> '#'
3776
        // We parsed a # character.  If this occurs at the start of the line,
3777
        // it's actually the start of a preprocessing directive.  Callback to
3778
        // the preprocessor to handle it.
3779
        // TODO: -fpreprocessed mode??
3780
15
        if (TokAtPhysicalStartOfLine && !LexingRawMode && 
!Is_PragmaLexer12
)
3781
12
          goto HandleDirective;
3782
3783
3
        Kind = tok::hash;
3784
3
      }
3785
31.5k
    } else {
3786
31.5k
      Kind = tok::percent;
3787
31.5k
    }
3788
34.9k
    break;
3789
14.0M
  case '<':
3790
14.0M
    Char = getCharAndSize(CurPtr, SizeTmp);
3791
14.0M
    if (ParsingFilename) {
3792
1.55M
      return LexAngledStringLiteral(Result, CurPtr);
3793
12.4M
    } else if (Char == '<') {
3794
512k
      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
3795
512k
      if (After == '=') {
3796
3.15k
        Kind = tok::lesslessequal;
3797
3.15k
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3798
3.15k
                             SizeTmp2, Result);
3799
508k
      } else if (After == '<' && 
IsStartOfConflictMarker(CurPtr-1)241
) {
3800
        // If this is actually a '<<<<<<<' version control conflict marker,
3801
        // recognize it as such and recover nicely.
3802
2
        goto LexNextToken;
3803
508k
      } else if (After == '<' && 
HandleEndOfConflictMarker(CurPtr-1)239
) {
3804
        // If this is '<<<<' and we're in a Perforce-style conflict marker,
3805
        // ignore it.
3806
0
        goto LexNextToken;
3807
508k
      } else if (LangOpts.CUDA && 
After == '<'162
) {
3808
139
        Kind = tok::lesslessless;
3809
139
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3810
139
                             SizeTmp2, Result);
3811
508k
      } else {
3812
508k
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3813
508k
        Kind = tok::lessless;
3814
508k
      }
3815
11.9M
    } else if (Char == '=') {
3816
161k
      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
3817
161k
      if (After == '>') {
3818
5.42k
        if (getLangOpts().CPlusPlus20) {
3819
937
          if (!isLexingRawMode())
3820
875
            Diag(BufferPtr, diag::warn_cxx17_compat_spaceship);
3821
937
          CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3822
937
                               SizeTmp2, Result);
3823
937
          Kind = tok::spaceship;
3824
937
          break;
3825
937
        }
3826
        // Suggest adding a space between the '<=' and the '>' to avoid a
3827
        // change in semantics if this turns up in C++ <=17 mode.
3828
4.48k
        if (getLangOpts().CPlusPlus && 
!isLexingRawMode()4.48k
) {
3829
12
          Diag(BufferPtr, diag::warn_cxx20_compat_spaceship)
3830
12
            << FixItHint::CreateInsertion(
3831
12
                   getSourceLocation(CurPtr + SizeTmp, SizeTmp2), " ");
3832
12
        }
3833
4.48k
      }
3834
160k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3835
160k
      Kind = tok::lessequal;
3836
11.7M
    } else if (LangOpts.Digraphs && 
Char == ':'11.3M
) { // '<:' -> '['
3837
106
      if (LangOpts.CPlusPlus11 &&
3838
106
          
getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':'75
) {
3839
        // C++0x [lex.pptoken]p3:
3840
        //  Otherwise, if the next three characters are <:: and the subsequent
3841
        //  character is neither : nor >, the < is treated as a preprocessor
3842
        //  token by itself and not as the first character of the alternative
3843
        //  token <:.
3844
60
        unsigned SizeTmp3;
3845
60
        char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
3846
60
        if (After != ':' && 
After != '>'59
) {
3847
58
          Kind = tok::less;
3848
58
          if (!isLexingRawMode())
3849
55
            Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon);
3850
58
          break;
3851
58
        }
3852
60
      }
3853
3854
48
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3855
48
      Kind = tok::l_square;
3856
11.7M
    } else if (LangOpts.Digraphs && 
Char == '%'11.3M
) { // '<%' -> '{'
3857
9
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3858
9
      Kind = tok::l_brace;
3859
11.7M
    } else if (Char == '#' && /*Not a trigraph*/ 
SizeTmp == 147
&&
3860
11.7M
               
lexEditorPlaceholder(Result, CurPtr)45
) {
3861
43
      return true;
3862
11.7M
    } else {
3863
11.7M
      Kind = tok::less;
3864
11.7M
    }
3865
12.4M
    break;
3866
12.4M
  case '>':
3867
12.3M
    Char = getCharAndSize(CurPtr, SizeTmp);
3868
12.3M
    if (Char == '=') {
3869
538k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3870
538k
      Kind = tok::greaterequal;
3871
11.8M
    } else if (Char == '>') {
3872
285k
      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
3873
285k
      if (After == '=') {
3874
2.09k
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3875
2.09k
                             SizeTmp2, Result);
3876
2.09k
        Kind = tok::greatergreaterequal;
3877
283k
      } else if (After == '>' && 
IsStartOfConflictMarker(CurPtr-1)18.3k
) {
3878
        // If this is actually a '>>>>' conflict marker, recognize it as such
3879
        // and recover nicely.
3880
2
        goto LexNextToken;
3881
283k
      } else if (After == '>' && 
HandleEndOfConflictMarker(CurPtr-1)18.3k
) {
3882
        // If this is '>>>>>>>' and we're in a conflict marker, ignore it.
3883
0
        goto LexNextToken;
3884
283k
      } else if (LangOpts.CUDA && 
After == '>'171
) {
3885
151
        Kind = tok::greatergreatergreater;
3886
151
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3887
151
                             SizeTmp2, Result);
3888
283k
      } else {
3889
283k
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3890
283k
        Kind = tok::greatergreater;
3891
283k
      }
3892
11.5M
    } else {
3893
11.5M
      Kind = tok::greater;
3894
11.5M
    }
3895
12.3M
    break;
3896
12.3M
  case '^':
3897
118k
    Char = getCharAndSize(CurPtr, SizeTmp);
3898
118k
    if (Char == '=') {
3899
10.1k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3900
10.1k
      Kind = tok::caretequal;
3901
108k
    } else if (LangOpts.OpenCL && 
Char == '^'383
) {
3902
2
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3903
2
      Kind = tok::caretcaret;
3904
108k
    } else {
3905
108k
      Kind = tok::caret;
3906
108k
    }
3907
118k
    break;
3908
1.03M
  case '|':
3909
1.03M
    Char = getCharAndSize(CurPtr, SizeTmp);
3910
1.03M
    if (Char == '=') {
3911
32.8k
      Kind = tok::pipeequal;
3912
32.8k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3913
1.00M
    } else if (Char == '|') {
3914
      // If this is '|||||||' and we're in a conflict marker, ignore it.
3915
685k
      if (CurPtr[1] == '|' && 
HandleEndOfConflictMarker(CurPtr-1)19
)
3916
1
        goto LexNextToken;
3917
685k
      Kind = tok::pipepipe;
3918
685k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3919
685k
    } else {
3920
316k
      Kind = tok::pipe;
3921
316k
    }
3922
1.03M
    break;
3923
7.77M
  case ':':
3924
7.77M
    Char = getCharAndSize(CurPtr, SizeTmp);
3925
7.77M
    if (LangOpts.Digraphs && 
Char == '>'7.50M
) {
3926
21
      Kind = tok::r_square; // ':>' -> ']'
3927
21
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3928
7.77M
    } else if ((LangOpts.CPlusPlus ||
3929
7.77M
                
LangOpts.DoubleSquareBracketAttributes1.09M
) &&
3930
7.77M
               
Char == ':'6.68M
) {
3931
4.67M
      Kind = tok::coloncolon;
3932
4.67M
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3933
4.67M
    } else {
3934
3.10M
      Kind = tok::colon;
3935
3.10M
    }
3936
7.77M
    break;
3937
69.6M
  case ';':
3938
69.6M
    Kind = tok::semi;
3939
69.6M
    break;
3940
20.5M
  case '=':
3941
20.5M
    Char = getCharAndSize(CurPtr, SizeTmp);
3942
20.5M
    if (Char == '=') {
3943
      // If this is '====' and we're in a conflict marker, ignore it.
3944
648k
      if (CurPtr[1] == '=' && 
HandleEndOfConflictMarker(CurPtr-1)38
)
3945
2
        goto LexNextToken;
3946
3947
648k
      Kind = tok::equalequal;
3948
648k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3949
19.9M
    } else {
3950
19.9M
      Kind = tok::equal;
3951
19.9M
    }
3952
20.5M
    break;
3953
234M
  case ',':
3954
234M
    Kind = tok::comma;
3955
234M
    break;
3956
90.0M
  case '#':
3957
90.0M
    Char = getCharAndSize(CurPtr, SizeTmp);
3958
90.0M
    if (Char == '#') {
3959
246k
      Kind = tok::hashhash;
3960
246k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3961
89.7M
    } else if (Char == '@' && 
LangOpts.MicrosoftExt3
) { // #@ -> Charize
3962
3
      Kind = tok::hashat;
3963
3
      if (!isLexingRawMode())
3964
3
        Diag(BufferPtr, diag::ext_charize_microsoft);
3965
3
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3966
89.7M
    } else {
3967
      // We parsed a # character.  If this occurs at the start of the line,
3968
      // it's actually the start of a preprocessing directive.  Callback to
3969
      // the preprocessor to handle it.
3970
      // TODO: -fpreprocessed mode??
3971
89.7M
      if (TokAtPhysicalStartOfLine && 
!LexingRawMode89.6M
&&
!Is_PragmaLexer67.2M
)
3972
67.2M
        goto HandleDirective;
3973
3974
22.4M
      Kind = tok::hash;
3975
22.4M
    }
3976
22.7M
    break;
3977
3978
22.7M
  case '@':
3979
    // Objective C support.
3980
752k
    if (CurPtr[-1] == '@' && LangOpts.ObjC)
3981
751k
      Kind = tok::at;
3982
1.13k
    else
3983
1.13k
      Kind = tok::unknown;
3984
752k
    break;
3985
3986
  // UCNs (C99 6.4.3, C++11 [lex.charset]p2)
3987
1.15k
  case '\\':
3988
1.15k
    if (!LangOpts.AsmPreprocessor) {
3989
1.15k
      if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) {
3990
81
        if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
3991
0
          if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3992
0
            return true; // KeepWhitespaceMode
3993
3994
          // We only saw whitespace, so just try again with this lexer.
3995
          // (We manually eliminate the tail call to avoid recursion.)
3996
0
          goto LexNextToken;
3997
0
        }
3998
3999
81
        return LexUnicode(Result, CodePoint, CurPtr);
4000
81
      }
4001
1.15k
    }
4002
4003
1.07k
    Kind = tok::unknown;
4004
1.07k
    break;
4005
4006
358
  default: {
4007
358
    if (isASCII(Char)) {
4008
165
      Kind = tok::unknown;
4009
165
      break;
4010
165
    }
4011
4012
193
    llvm::UTF32 CodePoint;
4013
4014
    // We can't just reset CurPtr to BufferPtr because BufferPtr may point to
4015
    // an escaped newline.
4016
193
    --CurPtr;
4017
193
    llvm::ConversionResult Status =
4018
193
        llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr,
4019
193
                                  (const llvm::UTF8 *)BufferEnd,
4020
193
                                  &CodePoint,
4021
193
                                  llvm::strictConversion);
4022
193
    if (Status == llvm::conversionOK) {
4023
127
      if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
4024
6
        if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
4025
0
          return true; // KeepWhitespaceMode
4026
4027
        // We only saw whitespace, so just try again with this lexer.
4028
        // (We manually eliminate the tail call to avoid recursion.)
4029
6
        goto LexNextToken;
4030
6
      }
4031
121
      return LexUnicode(Result, CodePoint, CurPtr);
4032
127
    }
4033
4034
66
    if (isLexingRawMode() || 
ParsingPreprocessorDirective4
||
4035
66
        
PP->isPreprocessedOutput()2
) {
4036
65
      ++CurPtr;
4037
65
      Kind = tok::unknown;
4038
65
      break;
4039
65
    }
4040
4041
    // Non-ASCII characters tend to creep into source code unintentionally.
4042
    // Instead of letting the parser complain about the unknown token,
4043
    // just diagnose the invalid UTF-8, then drop the character.
4044
1
    Diag(CurPtr, diag::err_invalid_utf8);
4045
4046
1
    BufferPtr = CurPtr+1;
4047
    // We're pretending the character didn't exist, so just try again with
4048
    // this lexer.
4049
    // (We manually eliminate the tail call to avoid recursion.)
4050
1
    goto LexNextToken;
4051
66
  }
4052
2.66G
  }
4053
4054
  // Notify MIOpt that we read a non-whitespace/non-comment token.
4055
1.26G
  MIOpt.ReadToken();
4056
4057
  // Update the location of token as well as BufferPtr.
4058
1.26G
  FormTokenWithChars(Result, CurPtr, Kind);
4059
1.26G
  return true;
4060
4061
67.2M
HandleDirective:
4062
  // We parsed a # character and it's the start of a preprocessing directive.
4063
4064
67.2M
  FormTokenWithChars(Result, CurPtr, tok::hash);
4065
67.2M
  PP->HandleDirective(Result);
4066
4067
67.2M
  if (PP->hadModuleLoaderFatalFailure()) {
4068
    // With a fatal failure in the module loader, we abort parsing.
4069
2
    assert(Result.is(tok::eof) && "Preprocessor did not set tok:eof");
4070
0
    return true;
4071
2
  }
4072
4073
  // We parsed the directive; lex a token with the new state.
4074
67.2M
  return false;
4075
67.2M
}