Coverage Report

Created: 2020-09-22 08:39

/Users/buildslave/jenkins/workspace/coverage/llvm-project/clang/lib/Lex/Lexer.cpp
Line
Count
Source (jump to first uncovered line)
1
//===- Lexer.cpp - C Language Family Lexer --------------------------------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
//  This file implements the Lexer and Token interfaces.
10
//
11
//===----------------------------------------------------------------------===//
12
13
#include "clang/Lex/Lexer.h"
14
#include "UnicodeCharSets.h"
15
#include "clang/Basic/CharInfo.h"
16
#include "clang/Basic/IdentifierTable.h"
17
#include "clang/Basic/LangOptions.h"
18
#include "clang/Basic/SourceLocation.h"
19
#include "clang/Basic/SourceManager.h"
20
#include "clang/Basic/TokenKinds.h"
21
#include "clang/Lex/LexDiagnostic.h"
22
#include "clang/Lex/LiteralSupport.h"
23
#include "clang/Lex/MultipleIncludeOpt.h"
24
#include "clang/Lex/Preprocessor.h"
25
#include "clang/Lex/PreprocessorOptions.h"
26
#include "clang/Lex/Token.h"
27
#include "clang/Basic/Diagnostic.h"
28
#include "clang/Basic/LLVM.h"
29
#include "clang/Basic/TokenKinds.h"
30
#include "llvm/ADT/None.h"
31
#include "llvm/ADT/Optional.h"
32
#include "llvm/ADT/STLExtras.h"
33
#include "llvm/ADT/StringExtras.h"
34
#include "llvm/ADT/StringSwitch.h"
35
#include "llvm/ADT/StringRef.h"
36
#include "llvm/Support/Compiler.h"
37
#include "llvm/Support/ConvertUTF.h"
38
#include "llvm/Support/MathExtras.h"
39
#include "llvm/Support/MemoryBuffer.h"
40
#include "llvm/Support/NativeFormatting.h"
41
#include "llvm/Support/UnicodeCharRanges.h"
42
#include <algorithm>
43
#include <cassert>
44
#include <cstddef>
45
#include <cstdint>
46
#include <cstring>
47
#include <string>
48
#include <tuple>
49
#include <utility>
50
51
using namespace clang;
52
53
//===----------------------------------------------------------------------===//
54
// Token Class Implementation
55
//===----------------------------------------------------------------------===//
56
57
/// isObjCAtKeyword - Return true if we have an ObjC keyword identifier.
58
540k
bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const {
59
540k
  if (isAnnotation())
60
2
    return false;
61
540k
  if (IdentifierInfo *II = getIdentifierInfo())
62
530k
    return II->getObjCKeywordID() == objcKey;
63
10.3k
  return false;
64
10.3k
}
65
66
/// getObjCKeywordID - Return the ObjC keyword kind.
67
1.14M
tok::ObjCKeywordKind Token::getObjCKeywordID() const {
68
1.14M
  if (isAnnotation())
69
1
    return tok::objc_not_keyword;
70
1.14M
  IdentifierInfo *specId = getIdentifierInfo();
71
1.03M
  return specId ? specId->getObjCKeywordID() : 
tok::objc_not_keyword106k
;
72
1.14M
}
73
74
//===----------------------------------------------------------------------===//
75
// Lexer Class Implementation
76
//===----------------------------------------------------------------------===//
77
78
0
void Lexer::anchor() {}
79
80
void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
81
43.3M
                      const char *BufEnd) {
82
43.3M
  BufferStart = BufStart;
83
43.3M
  BufferPtr = BufPtr;
84
43.3M
  BufferEnd = BufEnd;
85
86
43.3M
  assert(BufEnd[0] == 0 &&
87
43.3M
         "We assume that the input buffer has a null character at the end"
88
43.3M
         " to simplify lexing!");
89
90
  // Check whether we have a BOM in the beginning of the buffer. If yes - act
91
  // accordingly. Right now we support only UTF-8 with and without BOM, so, just
92
  // skip the UTF-8 BOM if it's present.
93
43.3M
  if (BufferStart == BufferPtr) {
94
    // Determine the size of the BOM.
95
1.82M
    StringRef Buf(BufferStart, BufferEnd - BufferStart);
96
1.82M
    size_t BOMLength = llvm::StringSwitch<size_t>(Buf)
97
1.82M
      .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM
98
1.82M
      .Default(0);
99
100
    // Skip the BOM.
101
1.82M
    BufferPtr += BOMLength;
102
1.82M
  }
103
104
43.3M
  Is_PragmaLexer = false;
105
43.3M
  CurrentConflictMarkerState = CMK_None;
106
107
  // Start of the file is a start of line.
108
43.3M
  IsAtStartOfLine = true;
109
43.3M
  IsAtPhysicalStartOfLine = true;
110
111
43.3M
  HasLeadingSpace = false;
112
43.3M
  HasLeadingEmptyMacro = false;
113
114
  // We are not after parsing a #.
115
43.3M
  ParsingPreprocessorDirective = false;
116
117
  // We are not after parsing #include.
118
43.3M
  ParsingFilename = false;
119
120
  // We are not in raw mode.  Raw mode disables diagnostics and interpretation
121
  // of tokens (e.g. identifiers, thus disabling macro expansion).  It is used
122
  // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block
123
  // or otherwise skipping over tokens.
124
43.3M
  LexingRawMode = false;
125
126
  // Default to not keeping comments.
127
43.3M
  ExtendedTokenMode = 0;
128
129
43.3M
  NewLinePtr = nullptr;
130
43.3M
}
131
132
/// Lexer constructor - Create a new lexer object for the specified buffer
133
/// with the specified preprocessor managing the lexing process.  This lexer
134
/// assumes that the associated file buffer and Preprocessor objects will
135
/// outlive it, so it doesn't take ownership of either of them.
136
Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *InputFile, Preprocessor &PP)
137
    : PreprocessorLexer(&PP, FID),
138
      FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
139
1.69M
      LangOpts(PP.getLangOpts()) {
140
1.69M
  InitLexer(InputFile->getBufferStart(), InputFile->getBufferStart(),
141
1.69M
            InputFile->getBufferEnd());
142
143
1.69M
  resetExtendedTokenMode();
144
1.69M
}
145
146
/// Lexer constructor - Create a new raw lexer object.  This object is only
147
/// suitable for calls to 'LexFromRawLexer'.  This lexer assumes that the text
148
/// range will outlive it, so it doesn't take ownership of it.
149
Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts,
150
             const char *BufStart, const char *BufPtr, const char *BufEnd)
151
41.6M
    : FileLoc(fileloc), LangOpts(langOpts) {
152
41.6M
  InitLexer(BufStart, BufPtr, BufEnd);
153
154
  // We *are* in raw mode.
155
41.6M
  LexingRawMode = true;
156
41.6M
}
157
158
/// Lexer constructor - Create a new raw lexer object.  This object is only
159
/// suitable for calls to 'LexFromRawLexer'.  This lexer assumes that the text
160
/// range will outlive it, so it doesn't take ownership of it.
161
Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *FromFile,
162
             const SourceManager &SM, const LangOptions &langOpts)
163
    : Lexer(SM.getLocForStartOfFile(FID), langOpts, FromFile->getBufferStart(),
164
58.4k
            FromFile->getBufferStart(), FromFile->getBufferEnd()) {}
Unexecuted instantiation: clang::Lexer::Lexer(clang::FileID, llvm::MemoryBuffer const*, clang::SourceManager const&, clang::LangOptions const&)
clang::Lexer::Lexer(clang::FileID, llvm::MemoryBuffer const*, clang::SourceManager const&, clang::LangOptions const&)
Line
Count
Source
164
58.4k
            FromFile->getBufferStart(), FromFile->getBufferEnd()) {}
165
166
174M
void Lexer::resetExtendedTokenMode() {
167
174M
  assert(PP && "Cannot reset token mode without a preprocessor");
168
174M
  if (LangOpts.TraditionalCPP)
169
1.05k
    SetKeepWhitespaceMode(true);
170
174M
  else
171
174M
    SetCommentRetentionState(PP->getCommentRetentionState());
172
174M
}
173
174
/// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
175
/// _Pragma expansion.  This has a variety of magic semantics that this method
176
/// sets up.  It returns a new'd Lexer that must be delete'd when done.
177
///
178
/// On entrance to this routine, TokStartLoc is a macro location which has a
179
/// spelling loc that indicates the bytes to be lexed for the token and an
180
/// expansion location that indicates where all lexed tokens should be
181
/// "expanded from".
182
///
183
/// TODO: It would really be nice to make _Pragma just be a wrapper around a
184
/// normal lexer that remaps tokens as they fly by.  This would require making
185
/// Preprocessor::Lex virtual.  Given that, we could just dump in a magic lexer
186
/// interface that could handle this stuff.  This would pull GetMappedTokenLoc
187
/// out of the critical path of the lexer!
188
///
189
Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc,
190
                                 SourceLocation ExpansionLocStart,
191
                                 SourceLocation ExpansionLocEnd,
192
620k
                                 unsigned TokLen, Preprocessor &PP) {
193
620k
  SourceManager &SM = PP.getSourceManager();
194
195
  // Create the lexer as if we were going to lex the file normally.
196
620k
  FileID SpellingFID = SM.getFileID(SpellingLoc);
197
620k
  const llvm::MemoryBuffer *InputFile = SM.getBuffer(SpellingFID);
198
620k
  Lexer *L = new Lexer(SpellingFID, InputFile, PP);
199
200
  // Now that the lexer is created, change the start/end locations so that we
201
  // just lex the subsection of the file that we want.  This is lexing from a
202
  // scratch buffer.
203
620k
  const char *StrData = SM.getCharacterData(SpellingLoc);
204
205
620k
  L->BufferPtr = StrData;
206
620k
  L->BufferEnd = StrData+TokLen;
207
620k
  assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!");
208
209
  // Set the SourceLocation with the remapping information.  This ensures that
210
  // GetMappedTokenLoc will remap the tokens as they are lexed.
211
620k
  L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID),
212
620k
                                     ExpansionLocStart,
213
620k
                                     ExpansionLocEnd, TokLen);
214
215
  // Ensure that the lexer thinks it is inside a directive, so that end \n will
216
  // return an EOD token.
217
620k
  L->ParsingPreprocessorDirective = true;
218
219
  // This lexer really is for _Pragma.
220
620k
  L->Is_PragmaLexer = true;
221
620k
  return L;
222
620k
}
223
224
20
bool Lexer::skipOver(unsigned NumBytes) {
225
20
  IsAtPhysicalStartOfLine = true;
226
20
  IsAtStartOfLine = true;
227
20
  if ((BufferPtr + NumBytes) > BufferEnd)
228
0
    return true;
229
20
  BufferPtr += NumBytes;
230
20
  return false;
231
20
}
232
233
483
template <typename T> static void StringifyImpl(T &Str, char Quote) {
234
483
  typename T::size_type i = 0, e = Str.size();
235
41.5k
  while (i < e) {
236
41.0k
    if (Str[i] == '\\' || 
Str[i] == Quote41.0k
) {
237
222
      Str.insert(Str.begin() + i, '\\');
238
222
      i += 2;
239
222
      ++e;
240
40.8k
    } else if (Str[i] == '\n' || 
Str[i] == '\r'40.7k
) {
241
      // Replace '\r\n' and '\n\r' to '\\' followed by 'n'.
242
17
      if ((i < e - 1) && (Str[i + 1] == '\n' || 
Str[i + 1] == '\r'13
) &&
243
4
          Str[i] != Str[i + 1]) {
244
0
        Str[i] = '\\';
245
0
        Str[i + 1] = 'n';
246
17
      } else {
247
        // Replace '\n' and '\r' to '\\' followed by 'n'.
248
17
        Str[i] = '\\';
249
17
        Str.insert(Str.begin() + i + 1, 'n');
250
17
        ++e;
251
17
      }
252
17
      i += 2;
253
17
    } else
254
40.7k
      ++i;
255
41.0k
  }
256
483
}
Lexer.cpp:void StringifyImpl<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >&, char)
Line
Count
Source
233
109
template <typename T> static void StringifyImpl(T &Str, char Quote) {
234
109
  typename T::size_type i = 0, e = Str.size();
235
2.72k
  while (i < e) {
236
2.61k
    if (Str[i] == '\\' || 
Str[i] == Quote2.60k
) {
237
211
      Str.insert(Str.begin() + i, '\\');
238
211
      i += 2;
239
211
      ++e;
240
2.40k
    } else if (Str[i] == '\n' || 
Str[i] == '\r'2.39k
) {
241
      // Replace '\r\n' and '\n\r' to '\\' followed by 'n'.
242
9
      if ((i < e - 1) && (Str[i + 1] == '\n' || 
Str[i + 1] == '\r'7
) &&
243
2
          Str[i] != Str[i + 1]) {
244
0
        Str[i] = '\\';
245
0
        Str[i + 1] = 'n';
246
9
      } else {
247
        // Replace '\n' and '\r' to '\\' followed by 'n'.
248
9
        Str[i] = '\\';
249
9
        Str.insert(Str.begin() + i + 1, 'n');
250
9
        ++e;
251
9
      }
252
9
      i += 2;
253
9
    } else
254
2.39k
      ++i;
255
2.61k
  }
256
109
}
Lexer.cpp:void StringifyImpl<llvm::SmallVectorImpl<char> >(llvm::SmallVectorImpl<char>&, char)
Line
Count
Source
233
374
template <typename T> static void StringifyImpl(T &Str, char Quote) {
234
374
  typename T::size_type i = 0, e = Str.size();
235
38.7k
  while (i < e) {
236
38.4k
    if (Str[i] == '\\' || 
Str[i] == Quote38.4k
) {
237
11
      Str.insert(Str.begin() + i, '\\');
238
11
      i += 2;
239
11
      ++e;
240
38.4k
    } else if (Str[i] == '\n' || 
Str[i] == '\r'38.3k
) {
241
      // Replace '\r\n' and '\n\r' to '\\' followed by 'n'.
242
8
      if ((i < e - 1) && (Str[i + 1] == '\n' || 
Str[i + 1] == '\r'6
) &&
243
2
          Str[i] != Str[i + 1]) {
244
0
        Str[i] = '\\';
245
0
        Str[i + 1] = 'n';
246
8
      } else {
247
        // Replace '\n' and '\r' to '\\' followed by 'n'.
248
8
        Str[i] = '\\';
249
8
        Str.insert(Str.begin() + i + 1, 'n');
250
8
        ++e;
251
8
      }
252
8
      i += 2;
253
8
    } else
254
38.3k
      ++i;
255
38.4k
  }
256
374
}
257
258
109
std::string Lexer::Stringify(StringRef Str, bool Charify) {
259
109
  std::string Result = std::string(Str);
260
109
  char Quote = Charify ? 
'\''0
: '"';
261
109
  StringifyImpl(Result, Quote);
262
109
  return Result;
263
109
}
264
265
374
void Lexer::Stringify(SmallVectorImpl<char> &Str) { StringifyImpl(Str, '"'); }
266
267
//===----------------------------------------------------------------------===//
268
// Token Spelling
269
//===----------------------------------------------------------------------===//
270
271
/// Slow case of getSpelling. Extract the characters comprising the
272
/// spelling of this token from the provided input buffer.
273
static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
274
10.5k
                              const LangOptions &LangOpts, char *Spelling) {
275
10.5k
  assert(Tok.needsCleaning() && "getSpellingSlow called on simple token");
276
277
10.5k
  size_t Length = 0;
278
10.5k
  const char *BufEnd = BufPtr + Tok.getLength();
279
280
10.5k
  if (tok::isStringLiteral(Tok.getKind())) {
281
    // Munch the encoding-prefix and opening double-quote.
282
368
    while (BufPtr < BufEnd) {
283
368
      unsigned Size;
284
368
      Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
285
368
      BufPtr += Size;
286
287
368
      if (Spelling[Length - 1] == '"')
288
345
        break;
289
368
    }
290
291
    // Raw string literals need special handling; trigraph expansion and line
292
    // splicing do not occur within their d-char-sequence nor within their
293
    // r-char-sequence.
294
345
    if (Length >= 2 &&
295
11
        Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') {
296
      // Search backwards from the end of the token to find the matching closing
297
      // quote.
298
11
      const char *RawEnd = BufEnd;
299
20
      do --RawEnd; while (*RawEnd != '"');
300
11
      size_t RawLength = RawEnd - BufPtr + 1;
301
302
      // Everything between the quotes is included verbatim in the spelling.
303
11
      memcpy(Spelling + Length, BufPtr, RawLength);
304
11
      Length += RawLength;
305
11
      BufPtr += RawLength;
306
307
      // The rest of the token is lexed normally.
308
11
    }
309
345
  }
310
311
131k
  while (BufPtr < BufEnd) {
312
120k
    unsigned Size;
313
120k
    Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
314
120k
    BufPtr += Size;
315
120k
  }
316
317
10.5k
  assert(Length < Tok.getLength() &&
318
10.5k
         "NeedsCleaning flag set on token that didn't need cleaning!");
319
10.5k
  return Length;
320
10.5k
}
321
322
/// getSpelling() - Return the 'spelling' of this token.  The spelling of a
323
/// token are the characters used to represent the token in the source file
324
/// after trigraph expansion and escaped-newline folding.  In particular, this
325
/// wants to get the true, uncanonicalized, spelling of things like digraphs
326
/// UCNs, etc.
327
StringRef Lexer::getSpelling(SourceLocation loc,
328
                             SmallVectorImpl<char> &buffer,
329
                             const SourceManager &SM,
330
                             const LangOptions &options,
331
1.32k
                             bool *invalid) {
332
  // Break down the source location.
333
1.32k
  std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc);
334
335
  // Try to the load the file buffer.
336
1.32k
  bool invalidTemp = false;
337
1.32k
  StringRef file = SM.getBufferData(locInfo.first, &invalidTemp);
338
1.32k
  if (invalidTemp) {
339
0
    if (invalid) *invalid = true;
340
0
    return {};
341
0
  }
342
343
1.32k
  const char *tokenBegin = file.data() + locInfo.second;
344
345
  // Lex from the start of the given location.
346
1.32k
  Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options,
347
1.32k
              file.begin(), tokenBegin, file.end());
348
1.32k
  Token token;
349
1.32k
  lexer.LexFromRawLexer(token);
350
351
1.32k
  unsigned length = token.getLength();
352
353
  // Common case:  no need for cleaning.
354
1.32k
  if (!token.needsCleaning())
355
1.32k
    return StringRef(tokenBegin, length);
356
357
  // Hard case, we need to relex the characters into the string.
358
1
  buffer.resize(length);
359
1
  buffer.resize(getSpellingSlow(token, tokenBegin, options, buffer.data()));
360
1
  return StringRef(buffer.data(), buffer.size());
361
1
}
362
363
/// getSpelling() - Return the 'spelling' of this token.  The spelling of a
364
/// token are the characters used to represent the token in the source file
365
/// after trigraph expansion and escaped-newline folding.  In particular, this
366
/// wants to get the true, uncanonicalized, spelling of things like digraphs
367
/// UCNs, etc.
368
std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr,
369
2.46M
                               const LangOptions &LangOpts, bool *Invalid) {
370
2.46M
  assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
371
372
2.46M
  bool CharDataInvalid = false;
373
2.46M
  const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(),
374
2.46M
                                                    &CharDataInvalid);
375
2.46M
  if (Invalid)
376
105
    *Invalid = CharDataInvalid;
377
2.46M
  if (CharDataInvalid)
378
0
    return {};
379
380
  // If this token contains nothing interesting, return it directly.
381
2.46M
  if (!Tok.needsCleaning())
382
2.46M
    return std::string(TokStart, TokStart + Tok.getLength());
383
384
5
  std::string Result;
385
5
  Result.resize(Tok.getLength());
386
5
  Result.resize(getSpellingSlow(Tok, TokStart, LangOpts, &*Result.begin()));
387
5
  return Result;
388
5
}
389
390
/// getSpelling - This method is used to get the spelling of a token into a
391
/// preallocated buffer, instead of as an std::string.  The caller is required
392
/// to allocate enough space for the token, which is guaranteed to be at least
393
/// Tok.getLength() bytes long.  The actual length of the token is returned.
394
///
395
/// Note that this method may do two possible things: it may either fill in
396
/// the buffer specified with characters, or it may *change the input pointer*
397
/// to point to a constant buffer with the data already in it (avoiding a
398
/// copy).  The caller is not allowed to modify the returned buffer pointer
399
/// if an internal buffer is returned.
400
unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
401
                            const SourceManager &SourceMgr,
402
61.4M
                            const LangOptions &LangOpts, bool *Invalid) {
403
61.4M
  assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
404
405
61.4M
  const char *TokStart = nullptr;
406
  // NOTE: this has to be checked *before* testing for an IdentifierInfo.
407
61.4M
  if (Tok.is(tok::raw_identifier))
408
2.50M
    TokStart = Tok.getRawIdentifier().data();
409
58.9M
  else if (!Tok.hasUCN()) {
410
58.9M
    if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
411
      // Just return the string from the identifier table, which is very quick.
412
31.9M
      Buffer = II->getNameStart();
413
31.9M
      return II->getLength();
414
31.9M
    }
415
29.5M
  }
416
417
  // NOTE: this can be checked even after testing for an IdentifierInfo.
418
29.5M
  if (Tok.isLiteral())
419
26.4M
    TokStart = Tok.getLiteralData();
420
421
29.5M
  if (!TokStart) {
422
    // Compute the start of the token in the input lexer buffer.
423
696k
    bool CharDataInvalid = false;
424
696k
    TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid);
425
696k
    if (Invalid)
426
204k
      *Invalid = CharDataInvalid;
427
696k
    if (CharDataInvalid) {
428
0
      Buffer = "";
429
0
      return 0;
430
0
    }
431
29.5M
  }
432
433
  // If this token contains nothing interesting, return it directly.
434
29.5M
  if (!Tok.needsCleaning()) {
435
29.5M
    Buffer = TokStart;
436
29.5M
    return Tok.getLength();
437
29.5M
  }
438
439
  // Otherwise, hard case, relex the characters into the string.
440
10.5k
  return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer));
441
10.5k
}
442
443
/// MeasureTokenLength - Relex the token at the specified location and return
444
/// its length in bytes in the input file.  If the token needs cleaning (e.g.
445
/// includes a trigraph or an escaped newline) then this count includes bytes
446
/// that are part of that.
447
unsigned Lexer::MeasureTokenLength(SourceLocation Loc,
448
                                   const SourceManager &SM,
449
40.7M
                                   const LangOptions &LangOpts) {
450
40.7M
  Token TheTok;
451
40.7M
  if (getRawToken(Loc, TheTok, SM, LangOpts))
452
1.47k
    return 0;
453
40.7M
  return TheTok.getLength();
454
40.7M
}
455
456
/// Relex the token at the specified location.
457
/// \returns true if there was a failure, false on success.
458
bool Lexer::getRawToken(SourceLocation Loc, Token &Result,
459
                        const SourceManager &SM,
460
                        const LangOptions &LangOpts,
461
40.7M
                        bool IgnoreWhiteSpace) {
462
  // TODO: this could be special cased for common tokens like identifiers, ')',
463
  // etc to make this faster, if it mattered.  Just look at StrData[0] to handle
464
  // all obviously single-char tokens.  This could use
465
  // Lexer::isObviouslySimpleCharacter for example to handle identifiers or
466
  // something.
467
468
  // If this comes from a macro expansion, we really do want the macro name, not
469
  // the token this macro expanded to.
470
40.7M
  Loc = SM.getExpansionLoc(Loc);
471
40.7M
  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
472
40.7M
  bool Invalid = false;
473
40.7M
  StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
474
40.7M
  if (Invalid)
475
16
    return true;
476
477
40.7M
  const char *StrData = Buffer.data()+LocInfo.second;
478
479
40.7M
  if (!IgnoreWhiteSpace && 
isWhitespace(StrData[0])40.7M
)
480
1.46k
    return true;
481
482
  // Create a lexer starting at the beginning of this token.
483
40.7M
  Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts,
484
40.7M
                 Buffer.begin(), StrData, Buffer.end());
485
40.7M
  TheLexer.SetCommentRetentionState(true);
486
40.7M
  TheLexer.LexFromRawLexer(Result);
487
40.7M
  return false;
488
40.7M
}
489
490
/// Returns the pointer that points to the beginning of line that contains
491
/// the given offset, or null if the offset if invalid.
492
10.7k
static const char *findBeginningOfLine(StringRef Buffer, unsigned Offset) {
493
10.7k
  const char *BufStart = Buffer.data();
494
10.7k
  if (Offset >= Buffer.size())
495
6
    return nullptr;
496
497
10.6k
  const char *LexStart = BufStart + Offset;
498
284k
  for (; LexStart != BufStart; 
--LexStart273k
) {
499
283k
    if (isVerticalWhitespace(LexStart[0]) &&
500
10.3k
        !Lexer::isNewLineEscaped(BufStart, LexStart)) {
501
      // LexStart should point at first character of logical line.
502
10.3k
      ++LexStart;
503
10.3k
      break;
504
10.3k
    }
505
283k
  }
506
10.6k
  return LexStart;
507
10.6k
}
508
509
static SourceLocation getBeginningOfFileToken(SourceLocation Loc,
510
                                              const SourceManager &SM,
511
10.5k
                                              const LangOptions &LangOpts) {
512
10.5k
  assert(Loc.isFileID());
513
10.5k
  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
514
10.5k
  if (LocInfo.first.isInvalid())
515
0
    return Loc;
516
517
10.5k
  bool Invalid = false;
518
10.5k
  StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
519
10.5k
  if (Invalid)
520
0
    return Loc;
521
522
  // Back up from the current location until we hit the beginning of a line
523
  // (or the buffer). We'll relex from that point.
524
10.5k
  const char *StrData = Buffer.data() + LocInfo.second;
525
10.5k
  const char *LexStart = findBeginningOfLine(Buffer, LocInfo.second);
526
10.5k
  if (!LexStart || 
LexStart == StrData10.5k
)
527
255
    return Loc;
528
529
  // Create a lexer starting at the beginning of this token.
530
10.2k
  SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second);
531
10.2k
  Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart,
532
10.2k
                 Buffer.end());
533
10.2k
  TheLexer.SetCommentRetentionState(true);
534
535
  // Lex tokens until we find the token that contains the source location.
536
10.2k
  Token TheTok;
537
16.6k
  do {
538
16.6k
    TheLexer.LexFromRawLexer(TheTok);
539
540
16.6k
    if (TheLexer.getBufferLocation() > StrData) {
541
      // Lexing this token has taken the lexer past the source location we're
542
      // looking for. If the current token encompasses our source location,
543
      // return the beginning of that token.
544
10.2k
      if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData)
545
9.79k
        return TheTok.getLocation();
546
547
      // We ended up skipping over the source location entirely, which means
548
      // that it points into whitespace. We're done here.
549
492
      break;
550
492
    }
551
6.41k
  } while (TheTok.getKind() != tok::eof);
552
553
  // We've passed our source location; just return the original source location.
554
492
  return Loc;
555
10.2k
}
556
557
SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc,
558
                                          const SourceManager &SM,
559
10.5k
                                          const LangOptions &LangOpts) {
560
10.5k
  if (Loc.isFileID())
561
10.5k
    return getBeginningOfFileToken(Loc, SM, LangOpts);
562
563
20
  if (!SM.isMacroArgExpansion(Loc))
564
0
    return Loc;
565
566
20
  SourceLocation FileLoc = SM.getSpellingLoc(Loc);
567
20
  SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts);
568
20
  std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc);
569
20
  std::pair<FileID, unsigned> BeginFileLocInfo =
570
20
      SM.getDecomposedLoc(BeginFileLoc);
571
20
  assert(FileLocInfo.first == BeginFileLocInfo.first &&
572
20
         FileLocInfo.second >= BeginFileLocInfo.second);
573
20
  return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);
574
20
}
575
576
namespace {
577
578
enum PreambleDirectiveKind {
579
  PDK_Skipped,
580
  PDK_Unknown
581
};
582
583
} // namespace
584
585
PreambleBounds Lexer::ComputePreamble(StringRef Buffer,
586
                                      const LangOptions &LangOpts,
587
512
                                      unsigned MaxLines) {
588
  // Create a lexer starting at the beginning of the file. Note that we use a
589
  // "fake" file source location at offset 1 so that the lexer will track our
590
  // position within the file.
591
512
  const unsigned StartOffset = 1;
592
512
  SourceLocation FileLoc = SourceLocation::getFromRawEncoding(StartOffset);
593
512
  Lexer TheLexer(FileLoc, LangOpts, Buffer.begin(), Buffer.begin(),
594
512
                 Buffer.end());
595
512
  TheLexer.SetCommentRetentionState(true);
596
597
512
  bool InPreprocessorDirective = false;
598
512
  Token TheTok;
599
512
  SourceLocation ActiveCommentLoc;
600
601
512
  unsigned MaxLineOffset = 0;
602
512
  if (MaxLines) {
603
84
    const char *CurPtr = Buffer.begin();
604
84
    unsigned CurLine = 0;
605
12.3k
    while (CurPtr != Buffer.end()) {
606
12.3k
      char ch = *CurPtr++;
607
12.3k
      if (ch == '\n') {
608
642
        ++CurLine;
609
642
        if (CurLine == MaxLines)
610
83
          break;
611
642
      }
612
12.3k
    }
613
84
    if (CurPtr != Buffer.end())
614
78
      MaxLineOffset = CurPtr - Buffer.begin();
615
84
  }
616
617
3.90k
  do {
618
3.90k
    TheLexer.LexFromRawLexer(TheTok);
619
620
3.90k
    if (InPreprocessorDirective) {
621
      // If we've hit the end of the file, we're done.
622
2.82k
      if (TheTok.getKind() == tok::eof) {
623
14
        break;
624
14
      }
625
626
      // If we haven't hit the end of the preprocessor directive, skip this
627
      // token.
628
2.81k
      if (!TheTok.isAtStartOfLine())
629
1.89k
        continue;
630
631
      // We've passed the end of the preprocessor directive, and will look
632
      // at this token again below.
633
916
      InPreprocessorDirective = false;
634
916
    }
635
636
    // Keep track of the # of lines in the preamble.
637
1.99k
    if (TheTok.isAtStartOfLine()) {
638
1.97k
      unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset;
639
640
      // If we were asked to limit the number of lines in the preamble,
641
      // and we're about to exceed that limit, we're done.
642
1.97k
      if (MaxLineOffset && 
TokOffset >= MaxLineOffset360
)
643
18
        break;
644
1.97k
    }
645
646
    // Comments are okay; skip over them.
647
1.97k
    if (TheTok.getKind() == tok::comment) {
648
569
      if (ActiveCommentLoc.isInvalid())
649
223
        ActiveCommentLoc = TheTok.getLocation();
650
569
      continue;
651
569
    }
652
653
1.41k
    if (TheTok.isAtStartOfLine() && 
TheTok.getKind() == tok::hash1.39k
) {
654
      // This is the start of a preprocessor directive.
655
930
      Token HashTok = TheTok;
656
930
      InPreprocessorDirective = true;
657
930
      ActiveCommentLoc = SourceLocation();
658
659
      // Figure out which directive this is. Since we're lexing raw tokens,
660
      // we don't have an identifier table available. Instead, just look at
661
      // the raw identifier to recognize and categorize preprocessor directives.
662
930
      TheLexer.LexFromRawLexer(TheTok);
663
930
      if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) {
664
930
        StringRef Keyword = TheTok.getRawIdentifier();
665
930
        PreambleDirectiveKind PDK
666
930
          = llvm::StringSwitch<PreambleDirectiveKind>(Keyword)
667
930
              .Case("include", PDK_Skipped)
668
930
              .Case("__include_macros", PDK_Skipped)
669
930
              .Case("define", PDK_Skipped)
670
930
              .Case("undef", PDK_Skipped)
671
930
              .Case("line", PDK_Skipped)
672
930
              .Case("error", PDK_Skipped)
673
930
              .Case("pragma", PDK_Skipped)
674
930
              .Case("import", PDK_Skipped)
675
930
              .Case("include_next", PDK_Skipped)
676
930
              .Case("warning", PDK_Skipped)
677
930
              .Case("ident", PDK_Skipped)
678
930
              .Case("sccs", PDK_Skipped)
679
930
              .Case("assert", PDK_Skipped)
680
930
              .Case("unassert", PDK_Skipped)
681
930
              .Case("if", PDK_Skipped)
682
930
              .Case("ifdef", PDK_Skipped)
683
930
              .Case("ifndef", PDK_Skipped)
684
930
              .Case("elif", PDK_Skipped)
685
930
              .Case("else", PDK_Skipped)
686
930
              .Case("endif", PDK_Skipped)
687
930
              .Default(PDK_Unknown);
688
689
930
        switch (PDK) {
690
930
        case PDK_Skipped:
691
930
          continue;
692
693
0
        case PDK_Unknown:
694
          // We don't know what this directive is; stop at the '#'.
695
0
          break;
696
0
        }
697
0
      }
698
699
      // We only end up here if we didn't recognize the preprocessor
700
      // directive or it was one that can't occur in the preamble at this
701
      // point. Roll back the current token to the location of the '#'.
702
0
      TheTok = HashTok;
703
0
    }
704
705
    // We hit a token that we don't recognize as being in the
706
    // "preprocessing only" part of the file, so we're no longer in
707
    // the preamble.
708
480
    break;
709
3.39k
  } while (true);
710
711
512
  SourceLocation End;
712
512
  if (ActiveCommentLoc.isValid())
713
80
    End = ActiveCommentLoc; // don't truncate a decl comment.
714
432
  else
715
432
    End = TheTok.getLocation();
716
717
512
  return PreambleBounds(End.getRawEncoding() - FileLoc.getRawEncoding(),
718
512
                        TheTok.isAtStartOfLine());
719
512
}
720
721
unsigned Lexer::getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo,
722
                                     const SourceManager &SM,
723
73.0k
                                     const LangOptions &LangOpts) {
724
  // Figure out how many physical characters away the specified expansion
725
  // character is.  This needs to take into consideration newlines and
726
  // trigraphs.
727
73.0k
  bool Invalid = false;
728
73.0k
  const char *TokPtr = SM.getCharacterData(TokStart, &Invalid);
729
730
  // If they request the first char of the token, we're trivially done.
731
73.0k
  if (Invalid || (CharNo == 0 && 
Lexer::isObviouslySimpleCharacter(*TokPtr)2.90k
))
732
2.90k
    return 0;
733
734
70.1k
  unsigned PhysOffset = 0;
735
736
  // The usual case is that tokens don't contain anything interesting.  Skip
737
  // over the uninteresting characters.  If a token only consists of simple
738
  // chars, this method is extremely fast.
739
776k
  while (Lexer::isObviouslySimpleCharacter(*TokPtr)) {
740
774k
    if (CharNo == 0)
741
68.6k
      return PhysOffset;
742
706k
    ++TokPtr;
743
706k
    --CharNo;
744
706k
    ++PhysOffset;
745
706k
  }
746
747
  // If we have a character that may be a trigraph or escaped newline, use a
748
  // lexer to parse it correctly.
749
9.97k
  
for (; 1.53k
CharNo;
--CharNo8.44k
) {
750
8.44k
    unsigned Size;
751
8.44k
    Lexer::getCharAndSizeNoWarn(TokPtr, Size, LangOpts);
752
8.44k
    TokPtr += Size;
753
8.44k
    PhysOffset += Size;
754
8.44k
  }
755
756
  // Final detail: if we end up on an escaped newline, we want to return the
757
  // location of the actual byte of the token.  For example foo\<newline>bar
758
  // advanced by 3 should return the location of b, not of \\.  One compounding
759
  // detail of this is that the escape may be made by a trigraph.
760
1.53k
  if (!Lexer::isObviouslySimpleCharacter(*TokPtr))
761
1.11k
    PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr;
762
763
1.53k
  return PhysOffset;
764
70.1k
}
765
766
/// Computes the source location just past the end of the
767
/// token at this source location.
768
///
769
/// This routine can be used to produce a source location that
770
/// points just past the end of the token referenced by \p Loc, and
771
/// is generally used when a diagnostic needs to point just after a
772
/// token where it expected something different that it received. If
773
/// the returned source location would not be meaningful (e.g., if
774
/// it points into a macro), this routine returns an invalid
775
/// source location.
776
///
777
/// \param Offset an offset from the end of the token, where the source
778
/// location should refer to. The default offset (0) produces a source
779
/// location pointing just past the end of the token; an offset of 1 produces
780
/// a source location pointing to the last character in the token, etc.
781
SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
782
                                          const SourceManager &SM,
783
4.42M
                                          const LangOptions &LangOpts) {
784
4.42M
  if (Loc.isInvalid())
785
88
    return {};
786
787
4.42M
  if (Loc.isMacroID()) {
788
1.42k
    if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
789
1.22k
      return {}; // Points inside the macro expansion.
790
4.42M
  }
791
792
4.42M
  unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
793
4.42M
  if (Len > Offset)
794
4.42M
    Len = Len - Offset;
795
360
  else
796
360
    return Loc;
797
798
4.42M
  return Loc.getLocWithOffset(Len);
799
4.42M
}
800
801
/// Returns true if the given MacroID location points at the first
802
/// token of the macro expansion.
803
bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc,
804
                                      const SourceManager &SM,
805
                                      const LangOptions &LangOpts,
806
41.7M
                                      SourceLocation *MacroBegin) {
807
41.7M
  assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");
808
809
41.7M
  SourceLocation expansionLoc;
810
41.7M
  if (!SM.isAtStartOfImmediateMacroExpansion(loc, &expansionLoc))
811
12.4M
    return false;
812
813
29.2M
  if (expansionLoc.isFileID()) {
814
    // No other macro expansions, this is the first.
815
7.06M
    if (MacroBegin)
816
190
      *MacroBegin = expansionLoc;
817
7.06M
    return true;
818
7.06M
  }
819
820
22.1M
  return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts, MacroBegin);
821
22.1M
}
822
823
/// Returns true if the given MacroID location points at the last
824
/// token of the macro expansion.
825
bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc,
826
                                    const SourceManager &SM,
827
                                    const LangOptions &LangOpts,
828
35.9M
                                    SourceLocation *MacroEnd) {
829
35.9M
  assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");
830
831
35.9M
  SourceLocation spellLoc = SM.getSpellingLoc(loc);
832
35.9M
  unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts);
833
35.9M
  if (tokLen == 0)
834
0
    return false;
835
836
35.9M
  SourceLocation afterLoc = loc.getLocWithOffset(tokLen);
837
35.9M
  SourceLocation expansionLoc;
838
35.9M
  if (!SM.isAtEndOfImmediateMacroExpansion(afterLoc, &expansionLoc))
839
5.17M
    return false;
840
841
30.7M
  if (expansionLoc.isFileID()) {
842
    // No other macro expansions.
843
14.3M
    if (MacroEnd)
844
289
      *MacroEnd = expansionLoc;
845
14.3M
    return true;
846
14.3M
  }
847
848
16.3M
  return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts, MacroEnd);
849
16.3M
}
850
851
static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range,
852
                                             const SourceManager &SM,
853
4.31M
                                             const LangOptions &LangOpts) {
854
4.31M
  SourceLocation Begin = Range.getBegin();
855
4.31M
  SourceLocation End = Range.getEnd();
856
4.31M
  assert(Begin.isFileID() && End.isFileID());
857
4.31M
  if (Range.isTokenRange()) {
858
4.31M
    End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts);
859
4.31M
    if (End.isInvalid())
860
0
      return {};
861
4.31M
  }
862
863
  // Break down the source locations.
864
4.31M
  FileID FID;
865
4.31M
  unsigned BeginOffs;
866
4.31M
  std::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin);
867
4.31M
  if (FID.isInvalid())
868
0
    return {};
869
870
4.31M
  unsigned EndOffs;
871
4.31M
  if (!SM.isInFileID(End, FID, &EndOffs) ||
872
4.31M
      BeginOffs > EndOffs)
873
0
    return {};
874
875
4.31M
  return CharSourceRange::getCharRange(Begin, End);
876
4.31M
}
877
878
CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range,
879
                                         const SourceManager &SM,
880
4.31M
                                         const LangOptions &LangOpts) {
881
4.31M
  SourceLocation Begin = Range.getBegin();
882
4.31M
  SourceLocation End = Range.getEnd();
883
4.31M
  if (Begin.isInvalid() || 
End.isInvalid()4.31M
)
884
6
    return {};
885
886
4.31M
  if (Begin.isFileID() && 
End.isFileID()4.31M
)
887
4.31M
    return makeRangeFromFileLocs(Range, SM, LangOpts);
888
889
286
  if (Begin.isMacroID() && 
End.isFileID()279
) {
890
67
    if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin))
891
1
      return {};
892
66
    Range.setBegin(Begin);
893
66
    return makeRangeFromFileLocs(Range, SM, LangOpts);
894
66
  }
895
896
219
  if (Begin.isFileID() && 
End.isMacroID()7
) {
897
7
    if ((Range.isTokenRange() && !isAtEndOfMacroExpansion(End, SM, LangOpts,
898
7
                                                          &End)) ||
899
7
        (Range.isCharRange() && !isAtStartOfMacroExpansion(End, SM, LangOpts,
900
0
                                                           &End)))
901
0
      return {};
902
7
    Range.setEnd(End);
903
7
    return makeRangeFromFileLocs(Range, SM, LangOpts);
904
7
  }
905
906
212
  assert(Begin.isMacroID() && End.isMacroID());
907
212
  SourceLocation MacroBegin, MacroEnd;
908
212
  if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) &&
909
98
      ((Range.isTokenRange() && isAtEndOfMacroExpansion(End, SM, LangOpts,
910
96
                                                        &MacroEnd)) ||
911
19
       (Range.isCharRange() && isAtStartOfMacroExpansion(End, SM, LangOpts,
912
79
                                                         &MacroEnd)))) {
913
79
    Range.setBegin(MacroBegin);
914
79
    Range.setEnd(MacroEnd);
915
79
    return makeRangeFromFileLocs(Range, SM, LangOpts);
916
79
  }
917
918
133
  bool Invalid = false;
919
133
  const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(SM.getFileID(Begin),
920
133
                                                        &Invalid);
921
133
  if (Invalid)
922
0
    return {};
923
924
133
  if (BeginEntry.getExpansion().isMacroArgExpansion()) {
925
113
    const SrcMgr::SLocEntry &EndEntry = SM.getSLocEntry(SM.getFileID(End),
926
113
                                                        &Invalid);
927
113
    if (Invalid)
928
0
      return {};
929
930
113
    if (EndEntry.getExpansion().isMacroArgExpansion() &&
931
112
        BeginEntry.getExpansion().getExpansionLocStart() ==
932
111
            EndEntry.getExpansion().getExpansionLocStart()) {
933
111
      Range.setBegin(SM.getImmediateSpellingLoc(Begin));
934
111
      Range.setEnd(SM.getImmediateSpellingLoc(End));
935
111
      return makeFileCharRange(Range, SM, LangOpts);
936
111
    }
937
22
  }
938
939
22
  return {};
940
22
}
941
942
StringRef Lexer::getSourceText(CharSourceRange Range,
943
                               const SourceManager &SM,
944
                               const LangOptions &LangOpts,
945
4.31M
                               bool *Invalid) {
946
4.31M
  Range = makeFileCharRange(Range, SM, LangOpts);
947
4.31M
  if (Range.isInvalid()) {
948
9
    if (Invalid) 
*Invalid = true2
;
949
9
    return {};
950
9
  }
951
952
  // Break down the source location.
953
4.31M
  std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin());
954
4.31M
  if (beginInfo.first.isInvalid()) {
955
0
    if (Invalid) *Invalid = true;
956
0
    return {};
957
0
  }
958
959
4.31M
  unsigned EndOffs;
960
4.31M
  if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) ||
961
4.31M
      beginInfo.second > EndOffs) {
962
0
    if (Invalid) *Invalid = true;
963
0
    return {};
964
0
  }
965
966
  // Try to the load the file buffer.
967
4.31M
  bool invalidTemp = false;
968
4.31M
  StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp);
969
4.31M
  if (invalidTemp) {
970
0
    if (Invalid) *Invalid = true;
971
0
    return {};
972
0
  }
973
974
4.31M
  if (Invalid) 
*Invalid = false251
;
975
4.31M
  return file.substr(beginInfo.second, EndOffs - beginInfo.second);
976
4.31M
}
977
978
StringRef Lexer::getImmediateMacroName(SourceLocation Loc,
979
                                       const SourceManager &SM,
980
6.25k
                                       const LangOptions &LangOpts) {
981
6.25k
  assert(Loc.isMacroID() && "Only reasonable to call this on macros");
982
983
  // Find the location of the immediate macro expansion.
984
20.5k
  while (true) {
985
20.5k
    FileID FID = SM.getFileID(Loc);
986
20.5k
    const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID);
987
20.5k
    const SrcMgr::ExpansionInfo &Expansion = E->getExpansion();
988
20.5k
    Loc = Expansion.getExpansionLocStart();
989
20.5k
    if (!Expansion.isMacroArgExpansion())
990
4.09k
      break;
991
992
    // For macro arguments we need to check that the argument did not come
993
    // from an inner macro, e.g: "MAC1( MAC2(foo) )"
994
995
    // Loc points to the argument id of the macro definition, move to the
996
    // macro expansion.
997
16.4k
    Loc = SM.getImmediateExpansionRange(Loc).getBegin();
998
16.4k
    SourceLocation SpellLoc = Expansion.getSpellingLoc();
999
16.4k
    if (SpellLoc.isFileID())
1000
2.16k
      break; // No inner macro.
1001
1002
    // If spelling location resides in the same FileID as macro expansion
1003
    // location, it means there is no inner macro.
1004
14.3k
    FileID MacroFID = SM.getFileID(Loc);
1005
14.3k
    if (SM.isInFileID(SpellLoc, MacroFID))
1006
1
      break;
1007
1008
    // Argument came from inner macro.
1009
14.3k
    Loc = SpellLoc;
1010
14.3k
  }
1011
1012
  // Find the spelling location of the start of the non-argument expansion
1013
  // range. This is where the macro name was spelled in order to begin
1014
  // expanding this macro.
1015
6.25k
  Loc = SM.getSpellingLoc(Loc);
1016
1017
  // Dig out the buffer where the macro name was spelled and the extents of the
1018
  // name so that we can render it into the expansion note.
1019
6.25k
  std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
1020
6.25k
  unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
1021
6.25k
  StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
1022
6.25k
  return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
1023
6.25k
}
1024
1025
StringRef Lexer::getImmediateMacroNameForDiagnostics(
1026
638
    SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) {
1027
638
  assert(Loc.isMacroID() && "Only reasonable to call this on macros");
1028
  // Walk past macro argument expansions.
1029
638
  while (SM.isMacroArgExpansion(Loc))
1030
0
    Loc = SM.getImmediateExpansionRange(Loc).getBegin();
1031
1032
  // If the macro's spelling has no FileID, then it's actually a token paste
1033
  // or stringization (or similar) and not a macro at all.
1034
638
  if (!SM.getFileEntryForID(SM.getFileID(SM.getSpellingLoc(Loc))))
1035
50
    return {};
1036
1037
  // Find the spelling location of the start of the non-argument expansion
1038
  // range. This is where the macro name was spelled in order to begin
1039
  // expanding this macro.
1040
588
  Loc = SM.getSpellingLoc(SM.getImmediateExpansionRange(Loc).getBegin());
1041
1042
  // Dig out the buffer where the macro name was spelled and the extents of the
1043
  // name so that we can render it into the expansion note.
1044
588
  std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
1045
588
  unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
1046
588
  StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
1047
588
  return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
1048
588
}
1049
1050
1.39k
bool Lexer::isIdentifierBodyChar(char c, const LangOptions &LangOpts) {
1051
1.39k
  return isIdentifierBody(c, LangOpts.DollarIdents);
1052
1.39k
}
1053
1054
10.4k
bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) {
1055
10.4k
  assert(isVerticalWhitespace(Str[0]));
1056
10.4k
  if (Str - 1 < BufferStart)
1057
2
    return false;
1058
1059
10.3k
  if ((Str[0] == '\n' && 
Str[-1] == '\r'10.3k
) ||
1060
10.3k
      (Str[0] == '\r' && 
Str[-1] == '\n'18
)) {
1061
18
    if (Str - 2 < BufferStart)
1062
2
      return false;
1063
16
    --Str;
1064
16
  }
1065
10.3k
  --Str;
1066
1067
  // Rewind to first non-space character:
1068
10.5k
  while (Str > BufferStart && 
isHorizontalWhitespace(*Str)10.4k
)
1069
106
    --Str;
1070
1071
10.3k
  return *Str == '\\';
1072
10.3k
}
1073
1074
StringRef Lexer::getIndentationForLine(SourceLocation Loc,
1075
163
                                       const SourceManager &SM) {
1076
163
  if (Loc.isInvalid() || Loc.isMacroID())
1077
0
    return {};
1078
163
  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
1079
163
  if (LocInfo.first.isInvalid())
1080
0
    return {};
1081
163
  bool Invalid = false;
1082
163
  StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
1083
163
  if (Invalid)
1084
0
    return {};
1085
163
  const char *Line = findBeginningOfLine(Buffer, LocInfo.second);
1086
163
  if (!Line)
1087
0
    return {};
1088
163
  StringRef Rest = Buffer.substr(Line - Buffer.data());
1089
163
  size_t NumWhitespaceChars = Rest.find_first_not_of(" \t");
1090
163
  return NumWhitespaceChars == StringRef::npos
1091
0
             ? ""
1092
163
             : Rest.take_front(NumWhitespaceChars);
1093
163
}
1094
1095
//===----------------------------------------------------------------------===//
1096
// Diagnostics forwarding code.
1097
//===----------------------------------------------------------------------===//
1098
1099
/// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the
1100
/// lexer buffer was all expanded at a single point, perform the mapping.
1101
/// This is currently only used for _Pragma implementation, so it is the slow
1102
/// path of the hot getSourceLocation method.  Do not allow it to be inlined.
1103
static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc(
1104
    Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen);
1105
static SourceLocation GetMappedTokenLoc(Preprocessor &PP,
1106
                                        SourceLocation FileLoc,
1107
2.60M
                                        unsigned CharNo, unsigned TokLen) {
1108
2.60M
  assert(FileLoc.isMacroID() && "Must be a macro expansion");
1109
1110
  // Otherwise, we're lexing "mapped tokens".  This is used for things like
1111
  // _Pragma handling.  Combine the expansion location of FileLoc with the
1112
  // spelling location.
1113
2.60M
  SourceManager &SM = PP.getSourceManager();
1114
1115
  // Create a new SLoc which is expanded from Expansion(FileLoc) but whose
1116
  // characters come from spelling(FileLoc)+Offset.
1117
2.60M
  SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc);
1118
2.60M
  SpellingLoc = SpellingLoc.getLocWithOffset(CharNo);
1119
1120
  // Figure out the expansion loc range, which is the range covered by the
1121
  // original _Pragma(...) sequence.
1122
2.60M
  CharSourceRange II = SM.getImmediateExpansionRange(FileLoc);
1123
1124
2.60M
  return SM.createExpansionLoc(SpellingLoc, II.getBegin(), II.getEnd(), TokLen);
1125
2.60M
}
1126
1127
/// getSourceLocation - Return a source location identifier for the specified
1128
/// offset in the current file.
1129
SourceLocation Lexer::getSourceLocation(const char *Loc,
1130
2.08G
                                        unsigned TokLen) const {
1131
2.08G
  assert(Loc >= BufferStart && Loc <= BufferEnd &&
1132
2.08G
         "Location out of range for this buffer!");
1133
1134
  // In the normal case, we're just lexing from a simple file buffer, return
1135
  // the file id from FileLoc with the offset specified.
1136
2.08G
  unsigned CharNo = Loc-BufferStart;
1137
2.08G
  if (FileLoc.isFileID())
1138
2.08G
    return FileLoc.getLocWithOffset(CharNo);
1139
1140
  // Otherwise, this is the _Pragma lexer case, which pretends that all of the
1141
  // tokens are lexed from where the _Pragma was defined.
1142
2.60M
  assert(PP && "This doesn't work on raw lexers");
1143
2.60M
  return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen);
1144
2.60M
}
1145
1146
/// Diag - Forwarding function for diagnostics.  This translate a source
1147
/// position in the current buffer into a SourceLocation object for rendering.
1148
65.9k
DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const {
1149
65.9k
  return PP->Diag(getSourceLocation(Loc), DiagID);
1150
65.9k
}
1151
1152
//===----------------------------------------------------------------------===//
1153
// Trigraph and Escaped Newline Handling Code.
1154
//===----------------------------------------------------------------------===//
1155
1156
/// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair,
1157
/// return the decoded trigraph letter it corresponds to, or '\0' if nothing.
1158
256
static char GetTrigraphCharForLetter(char Letter) {
1159
256
  switch (Letter) {
1160
121
  default:   return 0;
1161
23
  case '=':  return '#';
1162
31
  case ')':  return ']';
1163
30
  case '(':  return '[';
1164
6
  case '!':  return '|';
1165
1
  case '\'': return '^';
1166
4
  case '>':  return '}';
1167
34
  case '/':  return '\\';
1168
3
  case '<':  return '{';
1169
3
  case '-':  return '~';
1170
256
  }
1171
256
}
1172
1173
/// DecodeTrigraphChar - If the specified character is a legal trigraph when
1174
/// prefixed with ??, emit a trigraph warning.  If trigraphs are enabled,
1175
/// return the result character.  Finally, emit a warning about trigraph use
1176
/// whether trigraphs are enabled or not.
1177
228
static char DecodeTrigraphChar(const char *CP, Lexer *L) {
1178
228
  char Res = GetTrigraphCharForLetter(*CP);
1179
228
  if (!Res || 
!L107
)
return Res134
;
1180
1181
94
  if (!L->getLangOpts().Trigraphs) {
1182
37
    if (!L->isLexingRawMode())
1183
26
      L->Diag(CP-2, diag::trigraph_ignored);
1184
37
    return 0;
1185
37
  }
1186
1187
57
  if (!L->isLexingRawMode())
1188
46
    L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1);
1189
57
  return Res;
1190
57
}
1191
1192
/// getEscapedNewLineSize - Return the size of the specified escaped newline,
1193
/// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a
1194
/// trigraph equivalent on entry to this function.
1195
11.0M
unsigned Lexer::getEscapedNewLineSize(const char *Ptr) {
1196
11.0M
  unsigned Size = 0;
1197
11.0M
  while (isWhitespace(Ptr[Size])) {
1198
11.0M
    ++Size;
1199
1200
11.0M
    if (Ptr[Size-1] != '\n' && 
Ptr[Size-1] != '\r'324
)
1201
205
      continue;
1202
1203
    // If this is a \r\n or \n\r, skip the other half.
1204
11.0M
    if ((Ptr[Size] == '\r' || 
Ptr[Size] == '\n'11.0M
) &&
1205
152
        Ptr[Size-1] != Ptr[Size])
1206
119
      ++Size;
1207
1208
11.0M
    return Size;
1209
11.0M
  }
1210
1211
  // Not an escaped newline, must be a \t or something else.
1212
1.13k
  return 0;
1213
11.0M
}
1214
1215
/// SkipEscapedNewLines - If P points to an escaped newline (or a series of
1216
/// them), skip over them and return the first non-escaped-newline found,
1217
/// otherwise return P.
1218
1.11k
const char *Lexer::SkipEscapedNewLines(const char *P) {
1219
1.13k
  while (true) {
1220
1.13k
    const char *AfterEscape;
1221
1.13k
    if (*P == '\\') {
1222
1.11k
      AfterEscape = P+1;
1223
14
    } else if (*P == '?') {
1224
      // If not a trigraph for escape, bail out.
1225
1
      if (P[1] != '?' || 
P[2] != '/'0
)
1226
1
        return P;
1227
      // FIXME: Take LangOpts into account; the language might not
1228
      // support trigraphs.
1229
0
      AfterEscape = P+3;
1230
13
    } else {
1231
13
      return P;
1232
13
    }
1233
1234
1.11k
    unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape);
1235
1.11k
    if (NewLineSize == 0) 
return P1.10k
;
1236
13
    P = AfterEscape+NewLineSize;
1237
13
  }
1238
1.11k
}
1239
1240
Optional<Token> Lexer::findNextToken(SourceLocation Loc,
1241
                                     const SourceManager &SM,
1242
497
                                     const LangOptions &LangOpts) {
1243
497
  if (Loc.isMacroID()) {
1244
2
    if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
1245
2
      return None;
1246
495
  }
1247
495
  Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts);
1248
1249
  // Break down the source location.
1250
495
  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
1251
1252
  // Try to load the file buffer.
1253
495
  bool InvalidTemp = false;
1254
495
  StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp);
1255
495
  if (InvalidTemp)
1256
0
    return None;
1257
1258
495
  const char *TokenBegin = File.data() + LocInfo.second;
1259
1260
  // Lex from the start of the given location.
1261
495
  Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(),
1262
495
                                      TokenBegin, File.end());
1263
  // Find the token.
1264
495
  Token Tok;
1265
495
  lexer.LexFromRawLexer(Tok);
1266
495
  return Tok;
1267
495
}
1268
1269
/// Checks that the given token is the first token that occurs after the
1270
/// given location (this excludes comments and whitespace). Returns the location
1271
/// immediately after the specified token. If the token is not found or the
1272
/// location is inside a macro, the returned source location will be invalid.
1273
SourceLocation Lexer::findLocationAfterToken(
1274
    SourceLocation Loc, tok::TokenKind TKind, const SourceManager &SM,
1275
473
    const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine) {
1276
473
  Optional<Token> Tok = findNextToken(Loc, SM, LangOpts);
1277
473
  if (!Tok || 
Tok->isNot(TKind)471
)
1278
58
    return {};
1279
415
  SourceLocation TokenLoc = Tok->getLocation();
1280
1281
  // Calculate how much whitespace needs to be skipped if any.
1282
415
  unsigned NumWhitespaceChars = 0;
1283
415
  if (SkipTrailingWhitespaceAndNewLine) {
1284
235
    const char *TokenEnd = SM.getCharacterData(TokenLoc) + Tok->getLength();
1285
235
    unsigned char C = *TokenEnd;
1286
463
    while (isHorizontalWhitespace(C)) {
1287
228
      C = *(++TokenEnd);
1288
228
      NumWhitespaceChars++;
1289
228
    }
1290
1291
    // Skip \r, \n, \r\n, or \n\r
1292
235
    if (C == '\n' || 
C == '\r'132
) {
1293
104
      char PrevC = C;
1294
104
      C = *(++TokenEnd);
1295
104
      NumWhitespaceChars++;
1296
104
      if ((C == '\n' || 
C == '\r'103
) &&
C != PrevC1
)
1297
1
        NumWhitespaceChars++;
1298
104
    }
1299
235
  }
1300
1301
415
  return TokenLoc.getLocWithOffset(Tok->getLength() + NumWhitespaceChars);
1302
415
}
1303
1304
/// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
1305
/// get its size, and return it.  This is tricky in several cases:
1306
///   1. If currently at the start of a trigraph, we warn about the trigraph,
1307
///      then either return the trigraph (skipping 3 chars) or the '?',
1308
///      depending on whether trigraphs are enabled or not.
1309
///   2. If this is an escaped newline (potentially with whitespace between
1310
///      the backslash and newline), implicitly skip the newline and return
1311
///      the char after it.
1312
///
1313
/// This handles the slow/uncommon case of the getCharAndSize method.  Here we
1314
/// know that we can accumulate into Size, and that we have already incremented
1315
/// Ptr by Size bytes.
1316
///
1317
/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
1318
/// be updated to match.
1319
char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
1320
23.1M
                               Token *Tok) {
1321
  // If we have a slash, look for an escaped newline.
1322
23.1M
  if (Ptr[0] == '\\') {
1323
11.8M
    ++Size;
1324
11.8M
    ++Ptr;
1325
11.8M
Slash:
1326
    // Common case, backslash-char where the char is not whitespace.
1327
11.8M
    if (!isWhitespace(Ptr[0])) 
return '\\'719k
;
1328
1329
    // See if we have optional whitespace characters between the slash and
1330
    // newline.
1331
11.0M
    if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
1332
      // Remember that this token needs to be cleaned.
1333
11.0M
      if (Tok) 
Tok->setFlag(Token::NeedsCleaning)11.0M
;
1334
1335
      // Warn if there was whitespace between the backslash and newline.
1336
11.0M
      if (Ptr[0] != '\n' && 
Ptr[0] != '\r'140
&&
Tok23
&&
!isLexingRawMode()16
)
1337
8
        Diag(Ptr, diag::backslash_newline_space);
1338
1339
      // Found backslash<whitespace><newline>.  Parse the char after it.
1340
11.0M
      Size += EscapedNewLineSize;
1341
11.0M
      Ptr  += EscapedNewLineSize;
1342
1343
      // Use slow version to accumulate a correct size field.
1344
11.0M
      return getCharAndSizeSlow(Ptr, Size, Tok);
1345
11.0M
    }
1346
1347
    // Otherwise, this is not an escaped newline, just return the slash.
1348
29
    return '\\';
1349
29
  }
1350
1351
  // If this is a trigraph, process it.
1352
11.3M
  if (Ptr[0] == '?' && 
Ptr[1] == '?'304k
) {
1353
    // If this is actually a legal trigraph (not something like "??x"), emit
1354
    // a trigraph warning.  If so, and if trigraphs are enabled, return it.
1355
228
    if (char C = DecodeTrigraphChar(Ptr+2, Tok ? this : nullptr)) {
1356
      // Remember that this token needs to be cleaned.
1357
70
      if (Tok) 
Tok->setFlag(Token::NeedsCleaning)57
;
1358
1359
70
      Ptr += 3;
1360
70
      Size += 3;
1361
70
      if (C == '\\') 
goto Slash18
;
1362
52
      return C;
1363
52
    }
1364
228
  }
1365
1366
  // If this is neither, return a single character.
1367
11.3M
  ++Size;
1368
11.3M
  return *Ptr;
1369
11.3M
}
1370
1371
/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
1372
/// getCharAndSizeNoWarn method.  Here we know that we can accumulate into Size,
1373
/// and that we have already incremented Ptr by Size bytes.
1374
///
1375
/// NOTE: When this method is updated, getCharAndSizeSlow (above) should
1376
/// be updated to match.
1377
char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
1378
22.6k
                                     const LangOptions &LangOpts) {
1379
  // If we have a slash, look for an escaped newline.
1380
22.6k
  if (Ptr[0] == '\\') {
1381
12.0k
    ++Size;
1382
12.0k
    ++Ptr;
1383
12.0k
Slash:
1384
    // Common case, backslash-char where the char is not whitespace.
1385
12.0k
    if (!isWhitespace(Ptr[0])) 
return '\\'1.50k
;
1386
1387
    // See if we have optional whitespace characters followed by a newline.
1388
10.5k
    if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
1389
      // Found backslash<whitespace><newline>.  Parse the char after it.
1390
10.5k
      Size += EscapedNewLineSize;
1391
10.5k
      Ptr  += EscapedNewLineSize;
1392
1393
      // Use slow version to accumulate a correct size field.
1394
10.5k
      return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts);
1395
10.5k
    }
1396
1397
    // Otherwise, this is not an escaped newline, just return the slash.
1398
0
    return '\\';
1399
0
  }
1400
1401
  // If this is a trigraph, process it.
1402
10.6k
  if (LangOpts.Trigraphs && 
Ptr[0] == '?'4.92k
&&
Ptr[1] == '?'28
) {
1403
    // If this is actually a legal trigraph (not something like "??x"), return
1404
    // it.
1405
28
    if (char C = GetTrigraphCharForLetter(Ptr[2])) {
1406
28
      Ptr += 3;
1407
28
      Size += 3;
1408
28
      if (C == '\\') 
goto Slash8
;
1409
20
      return C;
1410
20
    }
1411
28
  }
1412
1413
  // If this is neither, return a single character.
1414
10.5k
  ++Size;
1415
10.5k
  return *Ptr;
1416
10.5k
}
1417
1418
//===----------------------------------------------------------------------===//
1419
// Helper methods for lexing.
1420
//===----------------------------------------------------------------------===//
1421
1422
/// Routine that indiscriminately sets the offset into the source file.
1423
418
void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) {
1424
418
  BufferPtr = BufferStart + Offset;
1425
418
  if (BufferPtr > BufferEnd)
1426
0
    BufferPtr = BufferEnd;
1427
  // FIXME: What exactly does the StartOfLine bit mean?  There are two
1428
  // possible meanings for the "start" of the line: the first token on the
1429
  // unexpanded line, or the first token on the expanded line.
1430
418
  IsAtStartOfLine = StartOfLine;
1431
418
  IsAtPhysicalStartOfLine = StartOfLine;
1432
418
}
1433
1434
695
static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts) {
1435
695
  if (LangOpts.AsmPreprocessor) {
1436
4
    return false;
1437
691
  } else if (LangOpts.DollarIdents && '$' == C) {
1438
3
    return true;
1439
688
  } else if (LangOpts.CPlusPlus11 || 
LangOpts.C11425
) {
1440
517
    static const llvm::sys::UnicodeCharSet C11AllowedIDChars(
1441
517
        C11AllowedIDCharRanges);
1442
517
    return C11AllowedIDChars.contains(C);
1443
171
  } else if (LangOpts.CPlusPlus) {
1444
53
    static const llvm::sys::UnicodeCharSet CXX03AllowedIDChars(
1445
53
        CXX03AllowedIDCharRanges);
1446
53
    return CXX03AllowedIDChars.contains(C);
1447
118
  } else {
1448
118
    static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
1449
118
        C99AllowedIDCharRanges);
1450
118
    return C99AllowedIDChars.contains(C);
1451
118
  }
1452
695
}
1453
1454
125
static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts) {
1455
125
  assert(isAllowedIDChar(C, LangOpts));
1456
125
  if (LangOpts.AsmPreprocessor) {
1457
0
    return false;
1458
125
  } else if (LangOpts.CPlusPlus11 || 
LangOpts.C1175
) {
1459
99
    static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars(
1460
99
        C11DisallowedInitialIDCharRanges);
1461
99
    return !C11DisallowedInitialIDChars.contains(C);
1462
26
  } else if (LangOpts.CPlusPlus) {
1463
6
    return true;
1464
20
  } else {
1465
20
    static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
1466
20
        C99DisallowedInitialIDCharRanges);
1467
20
    return !C99DisallowedInitialIDChars.contains(C);
1468
20
  }
1469
125
}
1470
1471
static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin,
1472
625
                                            const char *End) {
1473
625
  return CharSourceRange::getCharRange(L.getSourceLocation(Begin),
1474
625
                                       L.getSourceLocation(End));
1475
625
}
1476
1477
static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C,
1478
357
                                      CharSourceRange Range, bool IsFirst) {
1479
  // Check C99 compatibility.
1480
357
  if (!Diags.isIgnored(diag::warn_c99_compat_unicode_id, Range.getBegin())) {
1481
12
    enum {
1482
12
      CannotAppearInIdentifier = 0,
1483
12
      CannotStartIdentifier
1484
12
    };
1485
1486
12
    static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
1487
12
        C99AllowedIDCharRanges);
1488
12
    static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
1489
12
        C99DisallowedInitialIDCharRanges);
1490
12
    if (!C99AllowedIDChars.contains(C)) {
1491
5
      Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
1492
5
        << Range
1493
5
        << CannotAppearInIdentifier;
1494
7
    } else if (IsFirst && 
C99DisallowedInitialIDChars.contains(C)3
) {
1495
2
      Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
1496
2
        << Range
1497
2
        << CannotStartIdentifier;
1498
2
    }
1499
12
  }
1500
1501
  // Check C++98 compatibility.
1502
357
  if (!Diags.isIgnored(diag::warn_cxx98_compat_unicode_id, Range.getBegin())) {
1503
12
    static const llvm::sys::UnicodeCharSet CXX03AllowedIDChars(
1504
12
        CXX03AllowedIDCharRanges);
1505
12
    if (!CXX03AllowedIDChars.contains(C)) {
1506
5
      Diags.Report(Range.getBegin(), diag::warn_cxx98_compat_unicode_id)
1507
5
        << Range;
1508
5
    }
1509
12
  }
1510
357
}
1511
1512
/// After encountering UTF-8 character C and interpreting it as an identifier
1513
/// character, check whether it's a homoglyph for a common non-identifier
1514
/// source character that is unlikely to be an intentional identifier
1515
/// character and warn if so.
1516
static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C,
1517
237
                                       CharSourceRange Range) {
1518
  // FIXME: Handle Unicode quotation marks (smart quotes, fullwidth quotes).
1519
237
  struct HomoglyphPair {
1520
237
    uint32_t Character;
1521
237
    char LooksLike;
1522
1.39k
    bool operator<(HomoglyphPair R) const { return Character < R.Character; }
1523
237
  };
1524
237
  static constexpr HomoglyphPair SortedHomoglyphs[] = {
1525
237
    {U'\u00ad', 0},   // SOFT HYPHEN
1526
237
    {U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK
1527
237
    {U'\u037e', ';'}, // GREEK QUESTION MARK
1528
237
    {U'\u200b', 0},   // ZERO WIDTH SPACE
1529
237
    {U'\u200c', 0},   // ZERO WIDTH NON-JOINER
1530
237
    {U'\u200d', 0},   // ZERO WIDTH JOINER
1531
237
    {U'\u2060', 0},   // WORD JOINER
1532
237
    {U'\u2061', 0},   // FUNCTION APPLICATION
1533
237
    {U'\u2062', 0},   // INVISIBLE TIMES
1534
237
    {U'\u2063', 0},   // INVISIBLE SEPARATOR
1535
237
    {U'\u2064', 0},   // INVISIBLE PLUS
1536
237
    {U'\u2212', '-'}, // MINUS SIGN
1537
237
    {U'\u2215', '/'}, // DIVISION SLASH
1538
237
    {U'\u2216', '\\'}, // SET MINUS
1539
237
    {U'\u2217', '*'}, // ASTERISK OPERATOR
1540
237
    {U'\u2223', '|'}, // DIVIDES
1541
237
    {U'\u2227', '^'}, // LOGICAL AND
1542
237
    {U'\u2236', ':'}, // RATIO
1543
237
    {U'\u223c', '~'}, // TILDE OPERATOR
1544
237
    {U'\ua789', ':'}, // MODIFIER LETTER COLON
1545
237
    {U'\ufeff', 0},   // ZERO WIDTH NO-BREAK SPACE
1546
237
    {U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK
1547
237
    {U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN
1548
237
    {U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN
1549
237
    {U'\uff05', '%'}, // FULLWIDTH PERCENT SIGN
1550
237
    {U'\uff06', '&'}, // FULLWIDTH AMPERSAND
1551
237
    {U'\uff08', '('}, // FULLWIDTH LEFT PARENTHESIS
1552
237
    {U'\uff09', ')'}, // FULLWIDTH RIGHT PARENTHESIS
1553
237
    {U'\uff0a', '*'}, // FULLWIDTH ASTERISK
1554
237
    {U'\uff0b', '+'}, // FULLWIDTH ASTERISK
1555
237
    {U'\uff0c', ','}, // FULLWIDTH COMMA
1556
237
    {U'\uff0d', '-'}, // FULLWIDTH HYPHEN-MINUS
1557
237
    {U'\uff0e', '.'}, // FULLWIDTH FULL STOP
1558
237
    {U'\uff0f', '/'}, // FULLWIDTH SOLIDUS
1559
237
    {U'\uff1a', ':'}, // FULLWIDTH COLON
1560
237
    {U'\uff1b', ';'}, // FULLWIDTH SEMICOLON
1561
237
    {U'\uff1c', '<'}, // FULLWIDTH LESS-THAN SIGN
1562
237
    {U'\uff1d', '='}, // FULLWIDTH EQUALS SIGN
1563
237
    {U'\uff1e', '>'}, // FULLWIDTH GREATER-THAN SIGN
1564
237
    {U'\uff1f', '?'}, // FULLWIDTH QUESTION MARK
1565
237
    {U'\uff20', '@'}, // FULLWIDTH COMMERCIAL AT
1566
237
    {U'\uff3b', '['}, // FULLWIDTH LEFT SQUARE BRACKET
1567
237
    {U'\uff3c', '\\'}, // FULLWIDTH REVERSE SOLIDUS
1568
237
    {U'\uff3d', ']'}, // FULLWIDTH RIGHT SQUARE BRACKET
1569
237
    {U'\uff3e', '^'}, // FULLWIDTH CIRCUMFLEX ACCENT
1570
237
    {U'\uff5b', '{'}, // FULLWIDTH LEFT CURLY BRACKET
1571
237
    {U'\uff5c', '|'}, // FULLWIDTH VERTICAL LINE
1572
237
    {U'\uff5d', '}'}, // FULLWIDTH RIGHT CURLY BRACKET
1573
237
    {U'\uff5e', '~'}, // FULLWIDTH TILDE
1574
237
    {0, 0}
1575
237
  };
1576
237
  auto Homoglyph =
1577
237
      std::lower_bound(std::begin(SortedHomoglyphs),
1578
237
                       std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'});
1579
237
  if (Homoglyph->Character == C) {
1580
59
    llvm::SmallString<5> CharBuf;
1581
59
    {
1582
59
      llvm::raw_svector_ostream CharOS(CharBuf);
1583
59
      llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4);
1584
59
    }
1585
59
    if (Homoglyph->LooksLike) {
1586
48
      const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};
1587
48
      Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)
1588
48
          << Range << CharBuf << LooksLikeStr;
1589
11
    } else {
1590
11
      Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width)
1591
11
          << Range << CharBuf;
1592
11
    }
1593
59
  }
1594
237
}
1595
1596
bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
1597
177
                                    Token &Result) {
1598
177
  const char *UCNPtr = CurPtr + Size;
1599
177
  uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/nullptr);
1600
177
  if (CodePoint == 0 || 
!isAllowedIDChar(CodePoint, LangOpts)146
)
1601
57
    return false;
1602
1603
120
  if (!isLexingRawMode())
1604
120
    maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
1605
120
                              makeCharRange(*this, CurPtr, UCNPtr),
1606
120
                              /*IsFirst=*/false);
1607
1608
120
  Result.setFlag(Token::HasUCN);
1609
120
  if ((UCNPtr - CurPtr ==  6 && 
CurPtr[1] == 'u'94
) ||
1610
26
      (UCNPtr - CurPtr == 10 && 
CurPtr[1] == 'U'17
))
1611
105
    CurPtr = UCNPtr;
1612
15
  else
1613
105
    
while (15
CurPtr != UCNPtr)
1614
90
      (void)getAndAdvanceChar(CurPtr, Result);
1615
120
  return true;
1616
120
}
1617
1618
226
bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) {
1619
226
  const char *UnicodePtr = CurPtr;
1620
226
  llvm::UTF32 CodePoint;
1621
226
  llvm::ConversionResult Result =
1622
226
      llvm::convertUTF8Sequence((const llvm::UTF8 **)&UnicodePtr,
1623
226
                                (const llvm::UTF8 *)BufferEnd,
1624
226
                                &CodePoint,
1625
226
                                llvm::strictConversion);
1626
226
  if (Result != llvm::conversionOK ||
1627
211
      !isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts))
1628
45
    return false;
1629
1630
181
  if (!isLexingRawMode()) {
1631
155
    maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
1632
155
                              makeCharRange(*this, CurPtr, UnicodePtr),
1633
155
                              /*IsFirst=*/false);
1634
155
    maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint,
1635
155
                               makeCharRange(*this, CurPtr, UnicodePtr));
1636
155
  }
1637
1638
181
  CurPtr = UnicodePtr;
1639
181
  return true;
1640
181
}
1641
1642
832M
bool Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
1643
  // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
1644
832M
  unsigned Size;
1645
832M
  unsigned char C = *CurPtr++;
1646
9.19G
  while (isIdentifierBody(C))
1647
8.36G
    C = *CurPtr++;
1648
1649
832M
  --CurPtr;   // Back up over the skipped character.
1650
1651
  // Fast path, no $,\,? in identifier found.  '\' might be an escaped newline
1652
  // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN.
1653
  //
1654
  // TODO: Could merge these checks into an InfoTable flag to make the
1655
  // comparison cheaper
1656
832M
  if (
isASCII(C)832M
&& C != '\\' &&
C != '?'832M
&&
1657
832M
      (C != '$' || 
!LangOpts.DollarIdents2.55k
)) {
1658
832M
FinishIdentifier:
1659
832M
    const char *IdStart = BufferPtr;
1660
832M
    FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
1661
832M
    Result.setRawIdentifierData(IdStart);
1662
1663
    // If we are in raw mode, return this identifier raw.  There is no need to
1664
    // look up identifier information or attempt to macro expand it.
1665
832M
    if (LexingRawMode)
1666
335M
      return true;
1667
1668
    // Fill in Result.IdentifierInfo and update the token kind,
1669
    // looking up the identifier in the identifier table.
1670
496M
    IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);
1671
    // Note that we have to call PP->LookUpIdentifierInfo() even for code
1672
    // completion, it writes IdentifierInfo into Result, and callers rely on it.
1673
1674
    // If the completion point is at the end of an identifier, we want to treat
1675
    // the identifier as incomplete even if it resolves to a macro or a keyword.
1676
    // This allows e.g. 'class^' to complete to 'classifier'.
1677
496M
    if (isCodeCompletionPoint(CurPtr)) {
1678
      // Return the code-completion token.
1679
85
      Result.setKind(tok::code_completion);
1680
      // Skip the code-completion char and all immediate identifier characters.
1681
      // This ensures we get consistent behavior when completing at any point in
1682
      // an identifier (i.e. at the start, in the middle, at the end). Note that
1683
      // only simple cases (i.e. [a-zA-Z0-9_]) are supported to keep the code
1684
      // simpler.
1685
85
      assert(*CurPtr == 0 && "Completion character must be 0");
1686
85
      ++CurPtr;
1687
      // Note that code completion token is not added as a separate character
1688
      // when the completion point is at the end of the buffer. Therefore, we need
1689
      // to check if the buffer has ended.
1690
85
      if (CurPtr < BufferEnd) {
1691
131
        while (isIdentifierBody(*CurPtr))
1692
47
          ++CurPtr;
1693
84
      }
1694
85
      BufferPtr = CurPtr;
1695
85
      return true;
1696
85
    }
1697
1698
    // Finally, now that we know we have an identifier, pass this off to the
1699
    // preprocessor, which may macro expand it or something.
1700
496M
    if (II->isHandleIdentifierCase())
1701
43.5M
      return PP->HandleIdentifier(Result);
1702
1703
452M
    return true;
1704
452M
  }
1705
1706
  // Otherwise, $,\,? in identifier found.  Enter slower path.
1707
1708
981
  C = getCharAndSize(CurPtr, Size);
1709
10.8k
  while (
true7.22k
) {
1710
10.8k
    if (C == '$') {
1711
      // If we hit a $ and they are not supported in identifiers, we are done.
1712
2.93k
      if (!LangOpts.DollarIdents) 
goto FinishIdentifier0
;
1713
1714
      // Otherwise, emit a diagnostic and continue.
1715
2.93k
      if (!isLexingRawMode())
1716
2.93k
        Diag(CurPtr, diag::ext_dollar_in_identifier);
1717
2.93k
      CurPtr = ConsumeChar(CurPtr, Size, Result);
1718
2.93k
      C = getCharAndSize(CurPtr, Size);
1719
2.93k
      continue;
1720
7.92k
    } else if (C == '\\' && 
tryConsumeIdentifierUCN(CurPtr, Size, Result)147
) {
1721
98
      C = getCharAndSize(CurPtr, Size);
1722
98
      continue;
1723
7.82k
    } else if (!isASCII(C) && 
tryConsumeIdentifierUTF8Char(CurPtr)195
) {
1724
163
      C = getCharAndSize(CurPtr, Size);
1725
163
      continue;
1726
7.66k
    } else if (!isIdentifierBody(C)) {
1727
4.61k
      goto FinishIdentifier;
1728
4.61k
    }
1729
1730
    // Otherwise, this character is good, consume it.
1731
3.04k
    CurPtr = ConsumeChar(CurPtr, Size, Result);
1732
1733
3.04k
    C = getCharAndSize(CurPtr, Size);
1734
33.7k
    while (isIdentifierBody(C)) {
1735
30.6k
      CurPtr = ConsumeChar(CurPtr, Size, Result);
1736
30.6k
      C = getCharAndSize(CurPtr, Size);
1737
30.6k
    }
1738
3.04k
  }
1739
981
}
1740
1741
/// isHexaLiteral - Return true if Start points to a hex constant.
1742
/// in microsoft mode (where this is supposed to be several different tokens).
1743
125k
bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) {
1744
125k
  unsigned Size;
1745
125k
  char C1 = Lexer::getCharAndSizeNoWarn(Start, Size, LangOpts);
1746
125k
  if (C1 != '0')
1747
123k
    return false;
1748
2.16k
  char C2 = Lexer::getCharAndSizeNoWarn(Start + Size, Size, LangOpts);
1749
2.16k
  return (C2 == 'x' || 
C2 == 'X'15
);
1750
2.16k
}
1751
1752
/// LexNumericConstant - Lex the remainder of a integer or floating point
1753
/// constant. From[-1] is the first character lexed.  Return the end of the
1754
/// constant.
1755
84.2M
bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
1756
84.2M
  unsigned Size;
1757
84.2M
  char C = getCharAndSize(CurPtr, Size);
1758
84.2M
  char PrevCh = 0;
1759
284M
  while (isPreprocessingNumberBody(C)) {
1760
199M
    CurPtr = ConsumeChar(CurPtr, Size, Result);
1761
199M
    PrevCh = C;
1762
199M
    C = getCharAndSize(CurPtr, Size);
1763
199M
  }
1764
1765
  // If we fell out, check for a sign, due to 1e+12.  If we have one, continue.
1766
84.2M
  if ((C == '-' || 
C == '+'83.5M
) &&
(1.08M
PrevCh == 'E'1.08M
||
PrevCh == 'e'1.08M
)) {
1767
    // If we are in Microsoft mode, don't continue if the constant is hex.
1768
    // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1
1769
953k
    if (!LangOpts.MicrosoftExt || 
!isHexaLiteral(BufferPtr, LangOpts)123k
)
1770
953k
      return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
1771
83.3M
  }
1772
1773
  // If we have a hex FP constant, continue.
1774
83.3M
  if ((C == '-' || 
C == '+'83.3M
) &&
(134k
PrevCh == 'P'134k
||
PrevCh == 'p'134k
)) {
1775
    // Outside C99 and C++17, we accept hexadecimal floating point numbers as a
1776
    // not-quite-conforming extension. Only do so if this looks like it's
1777
    // actually meant to be a hexfloat, and not if it has a ud-suffix.
1778
10.1k
    bool IsHexFloat = true;
1779
10.1k
    if (!LangOpts.C99) {
1780
2.14k
      if (!isHexaLiteral(BufferPtr, LangOpts))
1781
9
        IsHexFloat = false;
1782
2.13k
      else if (!getLangOpts().CPlusPlus17 &&
1783
1.99k
               std::find(BufferPtr, CurPtr, '_') != CurPtr)
1784
3
        IsHexFloat = false;
1785
2.14k
    }
1786
10.1k
    if (IsHexFloat)
1787
10.1k
      return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
1788
83.3M
  }
1789
1790
  // If we have a digit separator, continue.
1791
83.3M
  if (C == '\'' && 
getLangOpts().CPlusPlus14335
) {
1792
319
    unsigned NextSize;
1793
319
    char Next = getCharAndSizeNoWarn(CurPtr + Size, NextSize, getLangOpts());
1794
319
    if (isIdentifierBody(Next)) {
1795
309
      if (!isLexingRawMode())
1796
265
        Diag(CurPtr, diag::warn_cxx11_compat_digit_separator);
1797
309
      CurPtr = ConsumeChar(CurPtr, Size, Result);
1798
309
      CurPtr = ConsumeChar(CurPtr, NextSize, Result);
1799
309
      return LexNumericConstant(Result, CurPtr);
1800
309
    }
1801
83.3M
  }
1802
1803
  // If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue.
1804
83.3M
  if (C == '\\' && 
tryConsumeIdentifierUCN(CurPtr, Size, Result)5
)
1805
4
    return LexNumericConstant(Result, CurPtr);
1806
83.3M
  if (!isASCII(C) && 
tryConsumeIdentifierUTF8Char(CurPtr)15
)
1807
4
    return LexNumericConstant(Result, CurPtr);
1808
1809
  // Update the location of token as well as BufferPtr.
1810
83.3M
  const char *TokStart = BufferPtr;
1811
83.3M
  FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
1812
83.3M
  Result.setLiteralData(TokStart);
1813
83.3M
  return true;
1814
83.3M
}
1815
1816
/// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes
1817
/// in C++11, or warn on a ud-suffix in C++98.
1818
const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
1819
9.38M
                               bool IsStringLiteral) {
1820
9.38M
  assert(getLangOpts().CPlusPlus);
1821
1822
  // Maximally munch an identifier.
1823
9.38M
  unsigned Size;
1824
9.38M
  char C = getCharAndSize(CurPtr, Size);
1825
9.38M
  bool Consumed = false;
1826
1827
9.38M
  if (!isIdentifierHead(C)) {
1828
9.37M
    if (C == '\\' && 
tryConsumeIdentifierUCN(CurPtr, Size, Result)7
)
1829
0
      Consumed = true;
1830
9.37M
    else if (!isASCII(C) && 
tryConsumeIdentifierUTF8Char(CurPtr)0
)
1831
0
      Consumed = true;
1832
9.37M
    else
1833
9.37M
      return CurPtr;
1834
11.7k
  }
1835
1836
11.7k
  if (!getLangOpts().CPlusPlus11) {
1837
17
    if (!isLexingRawMode())
1838
8
      Diag(CurPtr,
1839
3
           C == '_' ? diag::warn_cxx11_compat_user_defined_literal
1840
5
                    : diag::warn_cxx11_compat_reserved_user_defined_literal)
1841
8
        << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
1842
17
    return CurPtr;
1843
17
  }
1844
1845
  // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix
1846
  // that does not start with an underscore is ill-formed. As a conforming
1847
  // extension, we treat all such suffixes as if they had whitespace before
1848
  // them. We assume a suffix beginning with a UCN or UTF-8 character is more
1849
  // likely to be a ud-suffix than a macro, however, and accept that.
1850
11.7k
  
if (11.7k
!Consumed11.7k
) {
1851
11.7k
    bool IsUDSuffix = false;
1852
11.7k
    if (C == '_')
1853
268
      IsUDSuffix = true;
1854
11.4k
    else if (IsStringLiteral && 
getLangOpts().CPlusPlus1411.4k
) {
1855
      // In C++1y, we need to look ahead a few characters to see if this is a
1856
      // valid suffix for a string literal or a numeric literal (this could be
1857
      // the 'operator""if' defining a numeric literal operator).
1858
565
      const unsigned MaxStandardSuffixLength = 3;
1859
565
      char Buffer[MaxStandardSuffixLength] = { C };
1860
565
      unsigned Consumed = Size;
1861
565
      unsigned Chars = 1;
1862
971
      while (true) {
1863
971
        unsigned NextSize;
1864
971
        char Next = getCharAndSizeNoWarn(CurPtr + Consumed, NextSize,
1865
971
                                         getLangOpts());
1866
971
        if (!isIdentifierBody(Next)) {
1867
          // End of suffix. Check whether this is on the allowed list.
1868
561
          const StringRef CompleteSuffix(Buffer, Chars);
1869
561
          IsUDSuffix = StringLiteralParser::isValidUDSuffix(getLangOpts(),
1870
561
                                                            CompleteSuffix);
1871
561
          break;
1872
561
        }
1873
1874
410
        if (Chars == MaxStandardSuffixLength)
1875
          // Too long: can't be a standard suffix.
1876
4
          break;
1877
1878
406
        Buffer[Chars++] = Next;
1879
406
        Consumed += NextSize;
1880
406
      }
1881
565
    }
1882
1883
11.7k
    if (!IsUDSuffix) {
1884
10.9k
      if (!isLexingRawMode())
1885
9
        Diag(CurPtr, getLangOpts().MSVCCompat
1886
0
                         ? diag::ext_ms_reserved_user_defined_literal
1887
9
                         : diag::ext_reserved_user_defined_literal)
1888
9
          << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
1889
10.9k
      return CurPtr;
1890
10.9k
    }
1891
1892
761
    CurPtr = ConsumeChar(CurPtr, Size, Result);
1893
761
  }
1894
1895
746
  Result.setFlag(Token::HasUDSuffix);
1896
1.83k
  while (
true1.82k
) {
1897
1.83k
    C = getCharAndSize(CurPtr, Size);
1898
1.83k
    if (isIdentifierBody(C)) 
{ CurPtr = ConsumeChar(CurPtr, Size, Result); }1.04k
1899
793
    else if (C == '\\' && 
tryConsumeIdentifierUCN(CurPtr, Size, Result)18
)
{}18
1900
775
    else if (!isASCII(C) && 
tryConsumeIdentifierUTF8Char(CurPtr)16
)
{}14
1901
761
    else break;
1902
1.83k
  }
1903
1904
746
  return CurPtr;
1905
11.7k
}
1906
1907
/// LexStringLiteral - Lex the remainder of a string literal, after having lexed
1908
/// either " or L" or u8" or u" or U".
1909
bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
1910
14.0M
                             tok::TokenKind Kind) {
1911
14.0M
  const char *AfterQuote = CurPtr;
1912
  // Does this string contain the \0 character?
1913
14.0M
  const char *NulCharacter = nullptr;
1914
1915
14.0M
  if (!isLexingRawMode() &&
1916
10.1M
      (Kind == tok::utf8_string_literal ||
1917
10.1M
       Kind == tok::utf16_string_literal ||
1918
10.1M
       Kind == tok::utf32_string_literal))
1919
642
    Diag(BufferPtr, getLangOpts().CPlusPlus
1920
566
           ? diag::warn_cxx98_compat_unicode_literal
1921
76
           : diag::warn_c99_compat_unicode_literal);
1922
1923
14.0M
  char C = getAndAdvanceChar(CurPtr, Result);
1924
187M
  while (C != '"') {
1925
    // Skip escaped characters.  Escaped newlines will already be processed by
1926
    // getAndAdvanceChar.
1927
173M
    if (C == '\\')
1928
699k
      C = getAndAdvanceChar(CurPtr, Result);
1929
1930
173M
    if (C == '\n' || 
C == '\r'173M
|| // Newline.
1931
173M
        (C == 0 && 
CurPtr-1 == BufferEnd43
)) { // End of file.
1932
68
      if (!isLexingRawMode() && 
!LangOpts.AsmPreprocessor7
)
1933
5
        Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 1;
1934
68
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
1935
68
      return true;
1936
68
    }
1937
1938
173M
    if (C == 0) {
1939
21
      if (isCodeCompletionPoint(CurPtr-1)) {
1940
12
        if (ParsingFilename)
1941
5
          codeCompleteIncludedFile(AfterQuote, CurPtr - 1, /*IsAngled=*/false);
1942
7
        else
1943
7
          PP->CodeCompleteNaturalLanguage();
1944
12
        FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
1945
12
        cutOffLexing();
1946
12
        return true;
1947
12
      }
1948
1949
9
      NulCharacter = CurPtr-1;
1950
9
    }
1951
173M
    C = getAndAdvanceChar(CurPtr, Result);
1952
173M
  }
1953
1954
  // If we are in C++11, lex the optional ud-suffix.
1955
14.0M
  if (getLangOpts().CPlusPlus)
1956
9.13M
    CurPtr = LexUDSuffix(Result, CurPtr, true);
1957
1958
  // If a nul character existed in the string, warn about it.
1959
14.0M
  if (NulCharacter && 
!isLexingRawMode()8
)
1960
2
    Diag(NulCharacter, diag::null_in_char_or_string) << 1;
1961
1962
  // Update the location of the token as well as the BufferPtr instance var.
1963
14.0M
  const char *TokStart = BufferPtr;
1964
14.0M
  FormTokenWithChars(Result, CurPtr, Kind);
1965
14.0M
  Result.setLiteralData(TokStart);
1966
14.0M
  return true;
1967
14.0M
}
1968
1969
/// LexRawStringLiteral - Lex the remainder of a raw string literal, after
1970
/// having lexed R", LR", u8R", uR", or UR".
1971
bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
1972
481
                                tok::TokenKind Kind) {
1973
  // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3:
1974
  //  Between the initial and final double quote characters of the raw string,
1975
  //  any transformations performed in phases 1 and 2 (trigraphs,
1976
  //  universal-character-names, and line splicing) are reverted.
1977
1978
481
  if (!isLexingRawMode())
1979
112
    Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal);
1980
1981
481
  unsigned PrefixLen = 0;
1982
1983
1.46k
  while (PrefixLen != 16 && 
isRawStringDelimBody(CurPtr[PrefixLen])1.45k
)
1984
979
    ++PrefixLen;
1985
1986
  // If the last character was not a '(', then we didn't lex a valid delimiter.
1987
481
  if (CurPtr[PrefixLen] != '(') {
1988
1
    if (!isLexingRawMode()) {
1989
1
      const char *PrefixEnd = &CurPtr[PrefixLen];
1990
1
      if (PrefixLen == 16) {
1991
1
        Diag(PrefixEnd, diag::err_raw_delim_too_long);
1992
0
      } else {
1993
0
        Diag(PrefixEnd, diag::err_invalid_char_raw_delim)
1994
0
          << StringRef(PrefixEnd, 1);
1995
0
      }
1996
1
    }
1997
1998
    // Search for the next '"' in hopes of salvaging the lexer. Unfortunately,
1999
    // it's possible the '"' was intended to be part of the raw string, but
2000
    // there's not much we can do about that.
2001
59
    while (true) {
2002
59
      char C = *CurPtr++;
2003
2004
59
      if (C == '"')
2005
1
        break;
2006
58
      if (C == 0 && 
CurPtr-1 == BufferEnd0
) {
2007
0
        --CurPtr;
2008
0
        break;
2009
0
      }
2010
58
    }
2011
2012
1
    FormTokenWithChars(Result, CurPtr, tok::unknown);
2013
1
    return true;
2014
1
  }
2015
2016
  // Save prefix and move CurPtr past it
2017
480
  const char *Prefix = CurPtr;
2018
480
  CurPtr += PrefixLen + 1; // skip over prefix and '('
2019
2020
8.55k
  while (true) {
2021
8.55k
    char C = *CurPtr++;
2022
2023
8.55k
    if (C == ')') {
2024
      // Check for prefix match and closing quote.
2025
484
      if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && 
CurPtr[PrefixLen] == '"'476
) {
2026
476
        CurPtr += PrefixLen + 1; // skip over prefix and '"'
2027
476
        break;
2028
476
      }
2029
8.07k
    } else if (C == 0 && 
CurPtr-1 == BufferEnd4
) { // End of file.
2030
4
      if (!isLexingRawMode())
2031
1
        Diag(BufferPtr, diag::err_unterminated_raw_string)
2032
1
          << StringRef(Prefix, PrefixLen);
2033
4
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2034
4
      return true;
2035
4
    }
2036
8.55k
  }
2037
2038
  // If we are in C++11, lex the optional ud-suffix.
2039
476
  if (getLangOpts().CPlusPlus)
2040
476
    CurPtr = LexUDSuffix(Result, CurPtr, true);
2041
2042
  // Update the location of token as well as BufferPtr.
2043
476
  const char *TokStart = BufferPtr;
2044
476
  FormTokenWithChars(Result, CurPtr, Kind);
2045
476
  Result.setLiteralData(TokStart);
2046
476
  return true;
2047
480
}
2048
2049
/// LexAngledStringLiteral - Lex the remainder of an angled string literal,
2050
/// after having lexed the '<' character.  This is used for #include filenames.
2051
2.27M
bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
2052
  // Does this string contain the \0 character?
2053
2.27M
  const char *NulCharacter = nullptr;
2054
2.27M
  const char *AfterLessPos = CurPtr;
2055
2.27M
  char C = getAndAdvanceChar(CurPtr, Result);
2056
44.2M
  while (C != '>') {
2057
    // Skip escaped characters.  Escaped newlines will already be processed by
2058
    // getAndAdvanceChar.
2059
41.9M
    if (C == '\\')
2060
6
      C = getAndAdvanceChar(CurPtr, Result);
2061
2062
41.9M
    if (C == '\n' || 
C == '\r'41.9M
|| // Newline.
2063
41.9M
        (C == 0 && 
(CurPtr - 1 == BufferEnd)10
)) { // End of file.
2064
      // If the filename is unterminated, then it must just be a lone <
2065
      // character.  Return this as such.
2066
10
      FormTokenWithChars(Result, AfterLessPos, tok::less);
2067
10
      return true;
2068
10
    }
2069
2070
41.9M
    if (C == 0) {
2071
7
      if (isCodeCompletionPoint(CurPtr - 1)) {
2072
6
        codeCompleteIncludedFile(AfterLessPos, CurPtr - 1, /*IsAngled=*/true);
2073
6
        cutOffLexing();
2074
6
        FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
2075
6
        return true;
2076
6
      }
2077
1
      NulCharacter = CurPtr-1;
2078
1
    }
2079
41.9M
    C = getAndAdvanceChar(CurPtr, Result);
2080
41.9M
  }
2081
2082
  // If a nul character existed in the string, warn about it.
2083
2.27M
  if (NulCharacter && 
!isLexingRawMode()1
)
2084
1
    Diag(NulCharacter, diag::null_in_char_or_string) << 1;
2085
2086
  // Update the location of token as well as BufferPtr.
2087
2.27M
  const char *TokStart = BufferPtr;
2088
2.27M
  FormTokenWithChars(Result, CurPtr, tok::header_name);
2089
2.27M
  Result.setLiteralData(TokStart);
2090
2.27M
  return true;
2091
2.27M
}
2092
2093
void Lexer::codeCompleteIncludedFile(const char *PathStart,
2094
                                     const char *CompletionPoint,
2095
11
                                     bool IsAngled) {
2096
  // Completion only applies to the filename, after the last slash.
2097
11
  StringRef PartialPath(PathStart, CompletionPoint - PathStart);
2098
10
  llvm::StringRef SlashChars = LangOpts.MSVCCompat ? 
"/\\"1
: "/";
2099
11
  auto Slash = PartialPath.find_last_of(SlashChars);
2100
11
  StringRef Dir =
2101
7
      (Slash == StringRef::npos) ? "" : 
PartialPath.take_front(Slash)4
;
2102
11
  const char *StartOfFilename =
2103
7
      (Slash == StringRef::npos) ? PathStart : 
PathStart + Slash + 14
;
2104
  // Code completion filter range is the filename only, up to completion point.
2105
11
  PP->setCodeCompletionIdentifierInfo(&PP->getIdentifierTable().get(
2106
11
      StringRef(StartOfFilename, CompletionPoint - StartOfFilename)));
2107
  // We should replace the characters up to the closing quote or closest slash,
2108
  // if any.
2109
70
  while (CompletionPoint < BufferEnd) {
2110
70
    char Next = *(CompletionPoint + 1);
2111
70
    if (Next == 0 || Next == '\r' || Next == '\n')
2112
0
      break;
2113
70
    ++CompletionPoint;
2114
70
    if (Next == (IsAngled ? 
'>'45
:
'"'25
))
2115
10
      break;
2116
60
    if (llvm::is_contained(SlashChars, Next))
2117
1
      break;
2118
60
  }
2119
2120
11
  PP->setCodeCompletionTokenRange(
2121
11
      FileLoc.getLocWithOffset(StartOfFilename - BufferStart),
2122
11
      FileLoc.getLocWithOffset(CompletionPoint - BufferStart));
2123
11
  PP->CodeCompleteIncludedFile(Dir, IsAngled);
2124
11
}
2125
2126
/// LexCharConstant - Lex the remainder of a character constant, after having
2127
/// lexed either ' or L' or u8' or u' or U'.
2128
bool Lexer::LexCharConstant(Token &Result, const char *CurPtr,
2129
1.47M
                            tok::TokenKind Kind) {
2130
  // Does this character contain the \0 character?
2131
1.47M
  const char *NulCharacter = nullptr;
2132
2133
1.47M
  if (!isLexingRawMode()) {
2134
1.43M
    if (Kind == tok::utf16_char_constant || 
Kind == tok::utf32_char_constant1.43M
)
2135
185
      Diag(BufferPtr, getLangOpts().CPlusPlus
2136
171
                          ? diag::warn_cxx98_compat_unicode_literal
2137
14
                          : diag::warn_c99_compat_unicode_literal);
2138
1.43M
    else if (Kind == tok::utf8_char_constant)
2139
150
      Diag(BufferPtr, diag::warn_cxx14_compat_u8_character_literal);
2140
1.43M
  }
2141
2142
1.47M
  char C = getAndAdvanceChar(CurPtr, Result);
2143
1.47M
  if (C == '\'') {
2144
28
    if (!isLexingRawMode() && 
!LangOpts.AsmPreprocessor2
)
2145
0
      Diag(BufferPtr, diag::ext_empty_character);
2146
28
    FormTokenWithChars(Result, CurPtr, tok::unknown);
2147
28
    return true;
2148
28
  }
2149
2150
6.97M
  
while (1.47M
C != '\'') {
2151
    // Skip escaped characters.
2152
5.49M
    if (C == '\\')
2153
17.5k
      C = getAndAdvanceChar(CurPtr, Result);
2154
2155
5.49M
    if (C == '\n' || 
C == '\r'5.49M
|| // Newline.
2156
5.49M
        (C == 0 && 
CurPtr-1 == BufferEnd19
)) { // End of file.
2157
50
      if (!isLexingRawMode() && 
!LangOpts.AsmPreprocessor6
)
2158
3
        Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 0;
2159
50
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2160
50
      return true;
2161
50
    }
2162
2163
5.49M
    if (C == 0) {
2164
14
      if (isCodeCompletionPoint(CurPtr-1)) {
2165
6
        PP->CodeCompleteNaturalLanguage();
2166
6
        FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2167
6
        cutOffLexing();
2168
6
        return true;
2169
6
      }
2170
2171
8
      NulCharacter = CurPtr-1;
2172
8
    }
2173
5.49M
    C = getAndAdvanceChar(CurPtr, Result);
2174
5.49M
  }
2175
2176
  // If we are in C++11, lex the optional ud-suffix.
2177
1.47M
  if (getLangOpts().CPlusPlus)
2178
255k
    CurPtr = LexUDSuffix(Result, CurPtr, false);
2179
2180
  // If a nul character existed in the character, warn about it.
2181
1.47M
  if (NulCharacter && 
!isLexingRawMode()8
)
2182
2
    Diag(NulCharacter, diag::null_in_char_or_string) << 0;
2183
2184
  // Update the location of token as well as BufferPtr.
2185
1.47M
  const char *TokStart = BufferPtr;
2186
1.47M
  FormTokenWithChars(Result, CurPtr, Kind);
2187
1.47M
  Result.setLiteralData(TokStart);
2188
1.47M
  return true;
2189
1.47M
}
2190
2191
/// SkipWhitespace - Efficiently skip over a series of whitespace characters.
2192
/// Update BufferPtr to point to the next non-whitespace character and return.
2193
///
2194
/// This method forms a token and returns true if KeepWhitespaceMode is enabled.
2195
bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr,
2196
224M
                           bool &TokAtPhysicalStartOfLine) {
2197
  // Whitespace - Skip it, then return the token after the whitespace.
2198
224M
  bool SawNewline = isVerticalWhitespace(CurPtr[-1]);
2199
2200
224M
  unsigned char Char = *CurPtr;
2201
2202
224M
  const char *lastNewLine = nullptr;
2203
239M
  auto setLastNewLine = [&](const char *Ptr) {
2204
239M
    lastNewLine = Ptr;
2205
239M
    if (!NewLinePtr)
2206
192M
      NewLinePtr = Ptr;
2207
239M
  };
2208
224M
  if (SawNewline)
2209
213M
    setLastNewLine(CurPtr - 1);
2210
2211
  // Skip consecutive spaces efficiently.
2212
250M
  while (true) {
2213
    // Skip horizontal whitespace very aggressively.
2214
689M
    while (isHorizontalWhitespace(Char))
2215
438M
      Char = *++CurPtr;
2216
2217
    // Otherwise if we have something other than whitespace, we're done.
2218
250M
    if (!isVerticalWhitespace(Char))
2219
224M
      break;
2220
2221
26.5M
    if (ParsingPreprocessorDirective) {
2222
      // End of preprocessor directive line, let LexTokenInternal handle this.
2223
2.92k
      BufferPtr = CurPtr;
2224
2.92k
      return false;
2225
2.92k
    }
2226
2227
    // OK, but handle newline.
2228
26.5M
    if (*CurPtr == '\n')
2229
26.5M
      setLastNewLine(CurPtr);
2230
26.5M
    SawNewline = true;
2231
26.5M
    Char = *++CurPtr;
2232
26.5M
  }
2233
2234
  // If the client wants us to return whitespace, return it now.
2235
224M
  if (isKeepWhitespaceMode()) {
2236
58.5k
    FormTokenWithChars(Result, CurPtr, tok::unknown);
2237
58.5k
    if (SawNewline) {
2238
57.7k
      IsAtStartOfLine = true;
2239
57.7k
      IsAtPhysicalStartOfLine = true;
2240
57.7k
    }
2241
    // FIXME: The next token will not have LeadingSpace set.
2242
58.5k
    return true;
2243
58.5k
  }
2244
2245
  // If this isn't immediately after a newline, there is leading space.
2246
223M
  char PrevChar = CurPtr[-1];
2247
223M
  bool HasLeadingSpace = !isVerticalWhitespace(PrevChar);
2248
2249
223M
  Result.setFlagValue(Token::LeadingSpace, HasLeadingSpace);
2250
223M
  if (SawNewline) {
2251
213M
    Result.setFlag(Token::StartOfLine);
2252
213M
    TokAtPhysicalStartOfLine = true;
2253
2254
213M
    if (NewLinePtr && lastNewLine && NewLinePtr != lastNewLine && 
PP40.4M
) {
2255
39.5M
      if (auto *Handler = PP->getEmptylineHandler())
2256
87
        Handler->HandleEmptyline(SourceRange(getSourceLocation(NewLinePtr + 1),
2257
87
                                             getSourceLocation(lastNewLine)));
2258
39.5M
    }
2259
213M
  }
2260
2261
223M
  BufferPtr = CurPtr;
2262
223M
  return false;
2263
223M
}
2264
2265
/// We have just read the // characters from input.  Skip until we find the
2266
/// newline character that terminates the comment.  Then update BufferPtr and
2267
/// return.
2268
///
2269
/// If we're in KeepCommentMode or any CommentHandler has inserted
2270
/// some tokens, this will store the first token and return true.
2271
bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
2272
28.2M
                            bool &TokAtPhysicalStartOfLine) {
2273
  // If Line comments aren't explicitly enabled for this language, emit an
2274
  // extension warning.
2275
28.2M
  if (!LangOpts.LineComment && 
!isLexingRawMode()5.71k
) {
2276
2.19k
    Diag(BufferPtr, diag::ext_line_comment);
2277
2278
    // Mark them enabled so we only emit one warning for this translation
2279
    // unit.
2280
2.19k
    LangOpts.LineComment = true;
2281
2.19k
  }
2282
2283
  // Scan over the body of the comment.  The common case, when scanning, is that
2284
  // the comment contains normal ascii characters with nothing interesting in
2285
  // them.  As such, optimize for this case with the inner loop.
2286
  //
2287
  // This loop terminates with CurPtr pointing at the newline (or end of buffer)
2288
  // character that ends the line comment.
2289
28.2M
  char C;
2290
28.3M
  while (true) {
2291
28.3M
    C = *CurPtr;
2292
    // Skip over characters in the fast loop.
2293
1.42G
    while (C != 0 &&                // Potentially EOF.
2294
1.42G
           C != '\n' && 
C != '\r'1.40G
) // Newline or DOS-style newline.
2295
1.40G
      C = *++CurPtr;
2296
2297
28.3M
    const char *NextLine = CurPtr;
2298
28.3M
    if (C != 0) {
2299
      // We found a newline, see if it's escaped.
2300
28.3M
      const char *EscapePtr = CurPtr-1;
2301
28.3M
      bool HasSpace = false;
2302
28.4M
      while (isHorizontalWhitespace(*EscapePtr)) { // Skip whitespace.
2303
40.5k
        --EscapePtr;
2304
40.5k
        HasSpace = true;
2305
40.5k
      }
2306
2307
28.3M
      if (*EscapePtr == '\\')
2308
        // Escaped newline.
2309
101k
        CurPtr = EscapePtr;
2310
28.2M
      else if (EscapePtr[0] == '/' && 
EscapePtr[-1] == '?'3.16M
&&
2311
8
               EscapePtr[-2] == '?' && LangOpts.Trigraphs)
2312
        // Trigraph-escaped newline.
2313
3
        CurPtr = EscapePtr-2;
2314
28.2M
      else
2315
28.2M
        break; // This is a newline, we're done.
2316
2317
      // If there was space between the backslash and newline, warn about it.
2318
101k
      if (HasSpace && 
!isLexingRawMode()8
)
2319
6
        Diag(EscapePtr, diag::backslash_newline_space);
2320
101k
    }
2321
2322
    // Otherwise, this is a hard case.  Fall back on getAndAdvanceChar to
2323
    // properly decode the character.  Read it in raw mode to avoid emitting
2324
    // diagnostics about things like trigraphs.  If we see an escaped newline,
2325
    // we'll handle it below.
2326
105k
    const char *OldPtr = CurPtr;
2327
105k
    bool OldRawMode = isLexingRawMode();
2328
105k
    LexingRawMode = true;
2329
105k
    C = getAndAdvanceChar(CurPtr, Result);
2330
105k
    LexingRawMode = OldRawMode;
2331
2332
    // If we only read only one character, then no special handling is needed.
2333
    // We're done and can skip forward to the newline.
2334
105k
    if (C != 0 && 
CurPtr == OldPtr+1101k
) {
2335
0
      CurPtr = NextLine;
2336
0
      break;
2337
0
    }
2338
2339
    // If we read multiple characters, and one of those characters was a \r or
2340
    // \n, then we had an escaped newline within the comment.  Emit diagnostic
2341
    // unless the next line is also a // comment.
2342
105k
    if (CurPtr != OldPtr + 1 && 
C != '/'101k
&&
2343
4.42k
        (CurPtr == BufferEnd + 1 || 
CurPtr[0] != '/'4.42k
)) {
2344
8.82k
      for (; OldPtr != CurPtr; 
++OldPtr4.41k
)
2345
8.82k
        if (OldPtr[0] == '\n' || 
OldPtr[0] == '\r'4.41k
) {
2346
          // Okay, we found a // comment that ends in a newline, if the next
2347
          // line is also a // comment, but has spaces, don't emit a diagnostic.
2348
4.40k
          if (isWhitespace(C)) {
2349
4.23k
            const char *ForwardPtr = CurPtr;
2350
52.7k
            while (isWhitespace(*ForwardPtr))  // Skip whitespace.
2351
48.5k
              ++ForwardPtr;
2352
4.23k
            if (ForwardPtr[0] == '/' && 
ForwardPtr[1] == '/'3.78k
)
2353
3.78k
              break;
2354
622
          }
2355
2356
622
          if (!isLexingRawMode())
2357
570
            Diag(OldPtr-1, diag::ext_multi_line_line_comment);
2358
622
          break;
2359
622
        }
2360
4.40k
    }
2361
2362
105k
    if (C == '\r' || C == '\n' || 
CurPtr == BufferEnd + 1105k
) {
2363
3.69k
      --CurPtr;
2364
3.69k
      break;
2365
3.69k
    }
2366
2367
101k
    if (C == '\0' && 
isCodeCompletionPoint(CurPtr-1)9
) {
2368
8
      PP->CodeCompleteNaturalLanguage();
2369
8
      cutOffLexing();
2370
8
      return false;
2371
8
    }
2372
101k
  }
2373
2374
  // Found but did not consume the newline.  Notify comment handlers about the
2375
  // comment unless we're in a #if 0 block.
2376
28.2M
  if (PP && 
!isLexingRawMode()28.1M
&&
2377
16.0M
      PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
2378
0
                                            getSourceLocation(CurPtr)))) {
2379
0
    BufferPtr = CurPtr;
2380
0
    return true; // A token has to be returned.
2381
0
  }
2382
2383
  // If we are returning comments as tokens, return this comment as a token.
2384
28.2M
  if (inKeepCommentMode())
2385
40.4k
    return SaveLineComment(Result, CurPtr);
2386
2387
  // If we are inside a preprocessor directive and we see the end of line,
2388
  // return immediately, so that the lexer can return this as an EOD token.
2389
28.2M
  if (ParsingPreprocessorDirective || 
CurPtr == BufferEnd25.6M
) {
2390
2.54M
    BufferPtr = CurPtr;
2391
2.54M
    return false;
2392
2.54M
  }
2393
2394
  // Otherwise, eat the \n character.  We don't care if this is a \n\r or
2395
  // \r\n sequence.  This is an efficiency hack (because we know the \n can't
2396
  // contribute to another token), it isn't needed for correctness.  Note that
2397
  // this is ok even in KeepWhitespaceMode, because we would have returned the
2398
  /// comment above in that mode.
2399
25.6M
  NewLinePtr = CurPtr++;
2400
2401
  // The next returned token is at the start of the line.
2402
25.6M
  Result.setFlag(Token::StartOfLine);
2403
25.6M
  TokAtPhysicalStartOfLine = true;
2404
  // No leading whitespace seen so far.
2405
25.6M
  Result.clearFlag(Token::LeadingSpace);
2406
25.6M
  BufferPtr = CurPtr;
2407
25.6M
  return false;
2408
25.6M
}
2409
2410
/// If in save-comment mode, package up this Line comment in an appropriate
2411
/// way and return it.
2412
40.4k
bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) {
2413
  // If we're not in a preprocessor directive, just return the // comment
2414
  // directly.
2415
40.4k
  FormTokenWithChars(Result, CurPtr, tok::comment);
2416
2417
40.4k
  if (!ParsingPreprocessorDirective || 
LexingRawMode2
)
2418
40.4k
    return true;
2419
2420
  // If this Line-style comment is in a macro definition, transmogrify it into
2421
  // a C-style block comment.
2422
2
  bool Invalid = false;
2423
2
  std::string Spelling = PP->getSpelling(Result, &Invalid);
2424
2
  if (Invalid)
2425
0
    return true;
2426
2427
2
  assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?");
2428
2
  Spelling[1] = '*';   // Change prefix to "/*".
2429
2
  Spelling += "*/";    // add suffix.
2430
2431
2
  Result.setKind(tok::comment);
2432
2
  PP->CreateString(Spelling, Result,
2433
2
                   Result.getLocation(), Result.getLocation());
2434
2
  return true;
2435
2
}
2436
2437
/// isBlockCommentEndOfEscapedNewLine - Return true if the specified newline
2438
/// character (either \\n or \\r) is part of an escaped newline sequence.  Issue
2439
/// a diagnostic if so.  We know that the newline is inside of a block comment.
2440
static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr,
2441
80.7k
                                                  Lexer *L) {
2442
80.7k
  assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');
2443
2444
  // Back up off the newline.
2445
80.7k
  --CurPtr;
2446
2447
  // If this is a two-character newline sequence, skip the other character.
2448
80.7k
  if (CurPtr[0] == '\n' || 
CurPtr[0] == '\r'9.65k
) {
2449
    // \n\n or \r\r -> not escaped newline.
2450
71.1k
    if (CurPtr[0] == CurPtr[1])
2451
71.1k
      return false;
2452
    // \n\r or \r\n -> skip the newline.
2453
0
    --CurPtr;
2454
0
  }
2455
2456
  // If we have horizontal whitespace, skip over it.  We allow whitespace
2457
  // between the slash and newline.
2458
9.65k
  bool HasSpace = false;
2459
9.68k
  while (isHorizontalWhitespace(*CurPtr) || 
*CurPtr == 09.65k
) {
2460
33
    --CurPtr;
2461
33
    HasSpace = true;
2462
33
  }
2463
2464
  // If we have a slash, we know this is an escaped newline.
2465
9.65k
  if (*CurPtr == '\\') {
2466
8
    if (CurPtr[-1] != '*') 
return false0
;
2467
9.64k
  } else {
2468
    // It isn't a slash, is it the ?? / trigraph?
2469
9.64k
    if (CurPtr[0] != '/' || 
CurPtr[-1] != '?'1.72k
||
CurPtr[-2] != '?'5
||
2470
5
        CurPtr[-3] != '*')
2471
9.64k
      return false;
2472
2473
    // This is the trigraph ending the comment.  Emit a stern warning!
2474
5
    CurPtr -= 2;
2475
2476
    // If no trigraphs are enabled, warn that we ignored this trigraph and
2477
    // ignore this * character.
2478
5
    if (!L->getLangOpts().Trigraphs) {
2479
0
      if (!L->isLexingRawMode())
2480
0
        L->Diag(CurPtr, diag::trigraph_ignored_block_comment);
2481
0
      return false;
2482
0
    }
2483
5
    if (!L->isLexingRawMode())
2484
5
      L->Diag(CurPtr, diag::trigraph_ends_block_comment);
2485
5
  }
2486
2487
  // Warn about having an escaped newline between the */ characters.
2488
13
  if (!L->isLexingRawMode())
2489
10
    L->Diag(CurPtr, diag::escaped_newline_block_comment_end);
2490
2491
  // If there was space between the backslash and newline, warn about it.
2492
13
  if (HasSpace && !L->isLexingRawMode())
2493
10
    L->Diag(CurPtr, diag::backslash_newline_space);
2494
2495
13
  return true;
2496
9.65k
}
2497
2498
#ifdef __SSE2__
2499
#include <emmintrin.h>
2500
#elif __ALTIVEC__
2501
#include <altivec.h>
2502
#undef bool
2503
#endif
2504
2505
/// We have just read from input the / and * characters that started a comment.
2506
/// Read until we find the * and / characters that terminate the comment.
2507
/// Note that we don't bother decoding trigraphs or escaped newlines in block
2508
/// comments, because they cannot cause the comment to end.  The only thing
2509
/// that can happen is the comment could end with an escaped newline between
2510
/// the terminating * and /.
2511
///
2512
/// If we're in KeepCommentMode or any CommentHandler has inserted
2513
/// some tokens, this will store the first token and return true.
2514
bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
2515
46.3M
                             bool &TokAtPhysicalStartOfLine) {
2516
  // Scan one character past where we should, looking for a '/' character.  Once
2517
  // we find it, check to see if it was preceded by a *.  This common
2518
  // optimization helps people who like to put a lot of * characters in their
2519
  // comments.
2520
2521
  // The first character we get with newlines and trigraphs skipped to handle
2522
  // the degenerate /*/ case below correctly if the * has an escaped newline
2523
  // after it.
2524
46.3M
  unsigned CharSize;
2525
46.3M
  unsigned char C = getCharAndSize(CurPtr, CharSize);
2526
46.3M
  CurPtr += CharSize;
2527
46.3M
  if (C == 0 && 
CurPtr == BufferEnd+14
) {
2528
3
    if (!isLexingRawMode())
2529
0
      Diag(BufferPtr, diag::err_unterminated_block_comment);
2530
3
    --CurPtr;
2531
2532
    // KeepWhitespaceMode should return this broken comment as a token.  Since
2533
    // it isn't a well formed comment, just return it as an 'unknown' token.
2534
3
    if (isKeepWhitespaceMode()) {
2535
2
      FormTokenWithChars(Result, CurPtr, tok::unknown);
2536
2
      return true;
2537
2
    }
2538
2539
1
    BufferPtr = CurPtr;
2540
1
    return false;
2541
1
  }
2542
2543
  // Check to see if the first character after the '/*' is another /.  If so,
2544
  // then this slash does not end the block comment, it is part of it.
2545
46.3M
  if (C == '/')
2546
583
    C = *CurPtr++;
2547
2548
54.1M
  while (true) {
2549
    // Skip over all non-interesting characters until we find end of buffer or a
2550
    // (probably ending) '/' character.
2551
54.1M
    if (CurPtr + 24 < BufferEnd &&
2552
        // If there is a code-completion point avoid the fast scan because it
2553
        // doesn't check for '\0'.
2554
53.6M
        !(PP && 
PP->getCodeCompletionFileLoc() == FileLoc53.6M
)) {
2555
      // While not aligned to a 16-byte boundary.
2556
427M
      while (C != '/' && 
((intptr_t)CurPtr & 0x0F) != 0423M
)
2557
373M
        C = *CurPtr++;
2558
2559
53.6M
      if (C == '/') 
goto FoundSlash3.88M
;
2560
2561
49.7M
#ifdef __SSE2__
2562
49.7M
      __m128i Slashes = _mm_set1_epi8('/');
2563
416M
      while (CurPtr+16 <= BufferEnd) {
2564
416M
        int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr,
2565
416M
                                    Slashes));
2566
416M
        if (cmp != 0) {
2567
          // Adjust the pointer to point directly after the first slash. It's
2568
          // not necessary to set C here, it will be overwritten at the end of
2569
          // the outer loop.
2570
49.6M
          CurPtr += llvm::countTrailingZeros<unsigned>(cmp) + 1;
2571
49.6M
          goto FoundSlash;
2572
49.6M
        }
2573
367M
        CurPtr += 16;
2574
367M
      }
2575
#elif __ALTIVEC__
2576
      __vector unsigned char Slashes = {
2577
        '/', '/', '/', '/',  '/', '/', '/', '/',
2578
        '/', '/', '/', '/',  '/', '/', '/', '/'
2579
      };
2580
      while (CurPtr + 16 <= BufferEnd &&
2581
             !vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes))
2582
        CurPtr += 16;
2583
#else
2584
      // Scan for '/' quickly.  Many block comments are very large.
2585
      while (CurPtr[0] != '/' &&
2586
             CurPtr[1] != '/' &&
2587
             CurPtr[2] != '/' &&
2588
             CurPtr[3] != '/' &&
2589
             CurPtr+4 < BufferEnd) {
2590
        CurPtr += 4;
2591
      }
2592
#endif
2593
2594
      // It has to be one of the bytes scanned, increment to it and read one.
2595
99.7k
      C = *CurPtr++;
2596
99.7k
    }
2597
2598
    // Loop to scan the remainder.
2599
8.25M
    
while (557k
C != '/' &&
C != '\0'7.70M
)
2600
7.70M
      C = *CurPtr++;
2601
2602
557k
    if (C == '/') {
2603
54.1M
  FoundSlash:
2604
54.1M
      if (CurPtr[-2] == '*')  // We found the final */.  We're done!
2605
46.3M
        break;
2606
2607
7.77M
      if ((CurPtr[-2] == '\n' || 
CurPtr[-2] == '\r'7.69M
)) {
2608
80.7k
        if (isEndOfBlockCommentWithEscapedNewLine(CurPtr-2, this)) {
2609
          // We found the final */, though it had an escaped newline between the
2610
          // * and /.  We're done!
2611
13
          break;
2612
13
        }
2613
7.77M
      }
2614
7.77M
      if (CurPtr[0] == '*' && 
CurPtr[1] != '/'582
) {
2615
        // If this is a /* inside of the comment, emit a warning.  Don't do this
2616
        // if this is a /*/, which will end the comment.  This misses cases with
2617
        // embedded escaped newlines, but oh well.
2618
10
        if (!isLexingRawMode())
2619
9
          Diag(CurPtr-1, diag::warn_nested_block_comment);
2620
10
      }
2621
14
    } else if (C == 0 && CurPtr == BufferEnd+1) {
2622
2
      if (!isLexingRawMode())
2623
2
        Diag(BufferPtr, diag::err_unterminated_block_comment);
2624
      // Note: the user probably forgot a */.  We could continue immediately
2625
      // after the /*, but this would involve lexing a lot of what really is the
2626
      // comment, which surely would confuse the parser.
2627
2
      --CurPtr;
2628
2629
      // KeepWhitespaceMode should return this broken comment as a token.  Since
2630
      // it isn't a well formed comment, just return it as an 'unknown' token.
2631
2
      if (isKeepWhitespaceMode()) {
2632
0
        FormTokenWithChars(Result, CurPtr, tok::unknown);
2633
0
        return true;
2634
0
      }
2635
2636
2
      BufferPtr = CurPtr;
2637
2
      return false;
2638
12
    } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
2639
9
      PP->CodeCompleteNaturalLanguage();
2640
9
      cutOffLexing();
2641
9
      return false;
2642
9
    }
2643
2644
7.77M
    C = *CurPtr++;
2645
7.77M
  }
2646
2647
  // Notify comment handlers about the comment unless we're in a #if 0 block.
2648
46.3M
  if (PP && 
!isLexingRawMode()46.3M
&&
2649
41.3M
      PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
2650
0
                                            getSourceLocation(CurPtr)))) {
2651
0
    BufferPtr = CurPtr;
2652
0
    return true; // A token has to be returned.
2653
0
  }
2654
2655
  // If we are returning comments as tokens, return this comment as a token.
2656
46.3M
  if (inKeepCommentMode()) {
2657
2.36k
    FormTokenWithChars(Result, CurPtr, tok::comment);
2658
2.36k
    return true;
2659
2.36k
  }
2660
2661
  // It is common for the tokens immediately after a /**/ comment to be
2662
  // whitespace.  Instead of going through the big switch, handle it
2663
  // efficiently now.  This is safe even in KeepWhitespaceMode because we would
2664
  // have already returned above with the comment as a token.
2665
46.3M
  if (isHorizontalWhitespace(*CurPtr)) {
2666
167k
    SkipWhitespace(Result, CurPtr+1, TokAtPhysicalStartOfLine);
2667
167k
    return false;
2668
167k
  }
2669
2670
  // Otherwise, just return so that the next character will be lexed as a token.
2671
46.1M
  BufferPtr = CurPtr;
2672
46.1M
  Result.setFlag(Token::LeadingSpace);
2673
46.1M
  return false;
2674
46.1M
}
2675
2676
//===----------------------------------------------------------------------===//
2677
// Primary Lexing Entry Points
2678
//===----------------------------------------------------------------------===//
2679
2680
/// ReadToEndOfLine - Read the rest of the current preprocessor line as an
2681
/// uninterpreted string.  This switches the lexer out of directive mode.
2682
96.1k
void Lexer::ReadToEndOfLine(SmallVectorImpl<char> *Result) {
2683
96.1k
  assert(ParsingPreprocessorDirective && ParsingFilename == false &&
2684
96.1k
         "Must be in a preprocessing directive!");
2685
96.1k
  Token Tmp;
2686
96.1k
  Tmp.startToken();
2687
2688
  // CurPtr - Cache BufferPtr in an automatic variable.
2689
96.1k
  const char *CurPtr = BufferPtr;
2690
1.87M
  while (true) {
2691
1.87M
    char Char = getAndAdvanceChar(CurPtr, Tmp);
2692
1.87M
    switch (Char) {
2693
1.77M
    default:
2694
1.77M
      if (Result)
2695
3.59k
        Result->push_back(Char);
2696
1.77M
      break;
2697
18
    case 0:  // Null.
2698
      // Found end of file?
2699
18
      if (CurPtr-1 != BufferEnd) {
2700
18
        if (isCodeCompletionPoint(CurPtr-1)) {
2701
18
          PP->CodeCompleteNaturalLanguage();
2702
18
          cutOffLexing();
2703
18
          return;
2704
18
        }
2705
2706
        // Nope, normal character, continue.
2707
0
        if (Result)
2708
0
          Result->push_back(Char);
2709
0
        break;
2710
0
      }
2711
      // FALL THROUGH.
2712
0
      LLVM_FALLTHROUGH;
2713
96.1k
    case '\r':
2714
96.1k
    case '\n':
2715
      // Okay, we found the end of the line. First, back up past the \0, \r, \n.
2716
96.1k
      assert(CurPtr[-1] == Char && "Trigraphs for newline?");
2717
96.1k
      BufferPtr = CurPtr-1;
2718
2719
      // Next, lex the character, which should handle the EOD transition.
2720
96.1k
      Lex(Tmp);
2721
96.1k
      if (Tmp.is(tok::code_completion)) {
2722
0
        if (PP)
2723
0
          PP->CodeCompleteNaturalLanguage();
2724
0
        Lex(Tmp);
2725
0
      }
2726
96.1k
      assert(Tmp.is(tok::eod) && "Unexpected token!");
2727
2728
      // Finally, we're done;
2729
96.1k
      return;
2730
1.87M
    }
2731
1.87M
  }
2732
96.1k
}
2733
2734
/// LexEndOfFile - CurPtr points to the end of this file.  Handle this
2735
/// condition, reporting diagnostics and handling other edge cases as required.
2736
/// This returns true if Result contains a token, false if PP.Lex should be
2737
/// called again.
2738
1.80M
bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
2739
  // If we hit the end of the file while parsing a preprocessor directive,
2740
  // end the preprocessor directive first.  The next token returned will
2741
  // then be the end of file.
2742
1.80M
  if (ParsingPreprocessorDirective) {
2743
    // Done parsing the "line".
2744
1.81k
    ParsingPreprocessorDirective = false;
2745
    // Update the location of token as well as BufferPtr.
2746
1.81k
    FormTokenWithChars(Result, CurPtr, tok::eod);
2747
2748
    // Restore comment saving mode, in case it was disabled for directive.
2749
1.81k
    if (PP)
2750
1.81k
      resetExtendedTokenMode();
2751
1.81k
    return true;  // Have a token.
2752
1.81k
  }
2753
2754
  // If we are in raw mode, return this event as an EOF token.  Let the caller
2755
  // that put us in raw mode handle the event.
2756
1.80M
  if (isLexingRawMode()) {
2757
102k
    Result.startToken();
2758
102k
    BufferPtr = BufferEnd;
2759
102k
    FormTokenWithChars(Result, BufferEnd, tok::eof);
2760
102k
    return true;
2761
102k
  }
2762
2763
1.69M
  if (PP->isRecordingPreamble() && 
PP->isInPrimaryFile()254
) {
2764
90
    PP->setRecordedPreambleConditionalStack(ConditionalStack);
2765
90
    ConditionalStack.clear();
2766
90
  }
2767
2768
  // Issue diagnostics for unterminated #if and missing newline.
2769
2770
  // If we are in a #if directive, emit an error.
2771
1.70M
  while (!ConditionalStack.empty()) {
2772
8
    if (PP->getCodeCompletionFileLoc() != FileLoc)
2773
8
      PP->Diag(ConditionalStack.back().IfLoc,
2774
8
               diag::err_pp_unterminated_conditional);
2775
8
    ConditionalStack.pop_back();
2776
8
  }
2777
2778
  // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue
2779
  // a pedwarn.
2780
1.69M
  if (CurPtr != BufferStart && 
(1.69M
CurPtr[-1] != '\n'1.69M
&&
CurPtr[-1] != '\r'26.0k
)) {
2781
26.0k
    DiagnosticsEngine &Diags = PP->getDiagnostics();
2782
26.0k
    SourceLocation EndLoc = getSourceLocation(BufferEnd);
2783
26.0k
    unsigned DiagID;
2784
2785
26.0k
    if (LangOpts.CPlusPlus11) {
2786
      // C++11 [lex.phases] 2.2 p2
2787
      // Prefer the C++98 pedantic compatibility warning over the generic,
2788
      // non-extension, user-requested "missing newline at EOF" warning.
2789
18.6k
      if (!Diags.isIgnored(diag::warn_cxx98_compat_no_newline_eof, EndLoc)) {
2790
2
        DiagID = diag::warn_cxx98_compat_no_newline_eof;
2791
18.6k
      } else {
2792
18.6k
        DiagID = diag::warn_no_newline_eof;
2793
18.6k
      }
2794
7.45k
    } else {
2795
7.45k
      DiagID = diag::ext_no_newline_eof;
2796
7.45k
    }
2797
2798
26.0k
    Diag(BufferEnd, DiagID)
2799
26.0k
      << FixItHint::CreateInsertion(EndLoc, "\n");
2800
26.0k
  }
2801
2802
1.69M
  BufferPtr = CurPtr;
2803
2804
  // Finally, let the preprocessor handle this.
2805
1.69M
  return PP->HandleEndOfFile(Result, isPragmaLexer());
2806
1.69M
}
2807
2808
/// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from
2809
/// the specified lexer will return a tok::l_paren token, 0 if it is something
2810
/// else and 2 if there are no more tokens in the buffer controlled by the
2811
/// lexer.
2812
7.08M
unsigned Lexer::isNextPPTokenLParen() {
2813
7.08M
  assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?");
2814
2815
  // Switch to 'skipping' mode.  This will ensure that we can lex a token
2816
  // without emitting diagnostics, disables macro expansion, and will cause EOF
2817
  // to return an EOF token instead of popping the include stack.
2818
7.08M
  LexingRawMode = true;
2819
2820
  // Save state that can be changed while lexing so that we can restore it.
2821
7.08M
  const char *TmpBufferPtr = BufferPtr;
2822
7.08M
  bool inPPDirectiveMode = ParsingPreprocessorDirective;
2823
7.08M
  bool atStartOfLine = IsAtStartOfLine;
2824
7.08M
  bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
2825
7.08M
  bool leadingSpace = HasLeadingSpace;
2826
2827
7.08M
  Token Tok;
2828
7.08M
  Lex(Tok);
2829
2830
  // Restore state that may have changed.
2831
7.08M
  BufferPtr = TmpBufferPtr;
2832
7.08M
  ParsingPreprocessorDirective = inPPDirectiveMode;
2833
7.08M
  HasLeadingSpace = leadingSpace;
2834
7.08M
  IsAtStartOfLine = atStartOfLine;
2835
7.08M
  IsAtPhysicalStartOfLine = atPhysicalStartOfLine;
2836
2837
  // Restore the lexer back to non-skipping mode.
2838
7.08M
  LexingRawMode = false;
2839
2840
7.08M
  if (Tok.is(tok::eof))
2841
3
    return 2;
2842
7.08M
  return Tok.is(tok::l_paren);
2843
7.08M
}
2844
2845
/// Find the end of a version control conflict marker.
2846
static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd,
2847
10
                                   ConflictMarkerKind CMK) {
2848
5
  const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>";
2849
5
  size_t TermLen = CMK == CMK_Perforce ? 5 : 7;
2850
10
  auto RestOfBuffer = StringRef(CurPtr, BufferEnd - CurPtr).substr(TermLen);
2851
10
  size_t Pos = RestOfBuffer.find(Terminator);
2852
11
  while (Pos != StringRef::npos) {
2853
    // Must occur at start of line.
2854
8
    if (Pos == 0 ||
2855
7
        (RestOfBuffer[Pos - 1] != '\r' && RestOfBuffer[Pos - 1] != '\n')) {
2856
1
      RestOfBuffer = RestOfBuffer.substr(Pos+TermLen);
2857
1
      Pos = RestOfBuffer.find(Terminator);
2858
1
      continue;
2859
1
    }
2860
7
    return RestOfBuffer.data()+Pos;
2861
7
  }
2862
3
  return nullptr;
2863
10
}
2864
2865
/// IsStartOfConflictMarker - If the specified pointer is the start of a version
2866
/// control conflict marker like '<<<<<<<', recognize it as such, emit an error
2867
/// and recover nicely.  This returns true if it is a conflict marker and false
2868
/// if not.
2869
8.83k
bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
2870
  // Only a conflict marker if it starts at the beginning of a line.
2871
8.83k
  if (CurPtr != BufferStart &&
2872
8.83k
      CurPtr[-1] != '\n' && 
CurPtr[-1] != '\r'8.80k
)
2873
8.80k
    return false;
2874
2875
  // Check to see if we have <<<<<<< or >>>>.
2876
28
  if (!StringRef(CurPtr, BufferEnd - CurPtr).startswith("<<<<<<<") &&
2877
19
      !StringRef(CurPtr, BufferEnd - CurPtr).startswith(">>>> "))
2878
14
    return false;
2879
2880
  // If we have a situation where we don't care about conflict markers, ignore
2881
  // it.
2882
14
  if (CurrentConflictMarkerState || isLexingRawMode())
2883
9
    return false;
2884
2885
5
  ConflictMarkerKind Kind = *CurPtr == '<' ? 
CMK_Normal3
:
CMK_Perforce2
;
2886
2887
  // Check to see if there is an ending marker somewhere in the buffer at the
2888
  // start of a line to terminate this conflict marker.
2889
5
  if (FindConflictEnd(CurPtr, BufferEnd, Kind)) {
2890
    // We found a match.  We are really in a conflict marker.
2891
    // Diagnose this, and ignore to the end of line.
2892
4
    Diag(CurPtr, diag::err_conflict_marker);
2893
4
    CurrentConflictMarkerState = Kind;
2894
2895
    // Skip ahead to the end of line.  We know this exists because the
2896
    // end-of-conflict marker starts with \r or \n.
2897
76
    while (*CurPtr != '\r' && *CurPtr != '\n') {
2898
72
      assert(CurPtr != BufferEnd && "Didn't find end of line");
2899
72
      ++CurPtr;
2900
72
    }
2901
4
    BufferPtr = CurPtr;
2902
4
    return true;
2903
4
  }
2904
2905
  // No end of conflict marker found.
2906
1
  return false;
2907
1
}
2908
2909
/// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if
2910
/// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it
2911
/// is the end of a conflict marker.  Handle it by ignoring up until the end of
2912
/// the line.  This returns true if it is a conflict marker and false if not.
2913
8.88k
bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) {
2914
  // Only a conflict marker if it starts at the beginning of a line.
2915
8.88k
  if (CurPtr != BufferStart &&
2916
8.88k
      CurPtr[-1] != '\n' && 
CurPtr[-1] != '\r'8.84k
)
2917
8.84k
    return false;
2918
2919
  // If we have a situation where we don't care about conflict markers, ignore
2920
  // it.
2921
45
  if (!CurrentConflictMarkerState || 
isLexingRawMode()5
)
2922
40
    return false;
2923
2924
  // Check to see if we have the marker (4 characters in a row).
2925
20
  
for (unsigned i = 1; 5
i != 4;
++i15
)
2926
15
    if (CurPtr[i] != CurPtr[0])
2927
0
      return false;
2928
2929
  // If we do have it, search for the end of the conflict marker.  This could
2930
  // fail if it got skipped with a '#if 0' or something.  Note that CurPtr might
2931
  // be the end of conflict marker.
2932
5
  if (const char *End = FindConflictEnd(CurPtr, BufferEnd,
2933
3
                                        CurrentConflictMarkerState)) {
2934
3
    CurPtr = End;
2935
2936
    // Skip ahead to the end of line.
2937
37
    while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n')
2938
34
      ++CurPtr;
2939
2940
3
    BufferPtr = CurPtr;
2941
2942
    // No longer in the conflict marker.
2943
3
    CurrentConflictMarkerState = CMK_None;
2944
3
    return true;
2945
3
  }
2946
2947
2
  return false;
2948
2
}
2949
2950
static const char *findPlaceholderEnd(const char *CurPtr,
2951
43
                                      const char *BufferEnd) {
2952
43
  if (CurPtr == BufferEnd)
2953
0
    return nullptr;
2954
43
  BufferEnd -= 1; // Scan until the second last character.
2955
404
  for (; CurPtr != BufferEnd; 
++CurPtr361
) {
2956
404
    if (CurPtr[0] == '#' && 
CurPtr[1] == '>'43
)
2957
43
      return CurPtr + 2;
2958
404
  }
2959
0
  return nullptr;
2960
43
}
2961
2962
45
bool Lexer::lexEditorPlaceholder(Token &Result, const char *CurPtr) {
2963
45
  assert(CurPtr[-1] == '<' && CurPtr[0] == '#' && "Not a placeholder!");
2964
45
  if (!PP || !PP->getPreprocessorOpts().LexEditorPlaceholders || 
LexingRawMode44
)
2965
2
    return false;
2966
43
  const char *End = findPlaceholderEnd(CurPtr + 1, BufferEnd);
2967
43
  if (!End)
2968
0
    return false;
2969
43
  const char *Start = CurPtr - 1;
2970
43
  if (!LangOpts.AllowEditorPlaceholders)
2971
22
    Diag(Start, diag::err_placeholder_in_source);
2972
43
  Result.startToken();
2973
43
  FormTokenWithChars(Result, End, tok::raw_identifier);
2974
43
  Result.setRawIdentifierData(Start);
2975
43
  PP->LookUpIdentifierInfo(Result);
2976
43
  Result.setFlag(Token::IsEditorPlaceholder);
2977
43
  BufferPtr = End;
2978
43
  return true;
2979
43
}
2980
2981
496M
bool Lexer::isCodeCompletionPoint(const char *CurPtr) const {
2982
496M
  if (
PP496M
&& PP->isCodeCompletionEnabled()) {
2983
1.02M
    SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart);
2984
1.02M
    return Loc == PP->getCodeCompletionLoc();
2985
1.02M
  }
2986
2987
495M
  return false;
2988
495M
}
2989
2990
uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
2991
1.33k
                           Token *Result) {
2992
1.33k
  unsigned CharSize;
2993
1.33k
  char Kind = getCharAndSize(StartPtr, CharSize);
2994
2995
1.33k
  unsigned NumHexDigits;
2996
1.33k
  if (Kind == 'u')
2997
259
    NumHexDigits = 4;
2998
1.07k
  else if (Kind == 'U')
2999
36
    NumHexDigits = 8;
3000
1.03k
  else
3001
1.03k
    return 0;
3002
3003
295
  if (!LangOpts.CPlusPlus && 
!LangOpts.C99142
) {
3004
5
    if (Result && 
!isLexingRawMode()3
)
3005
3
      Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89);
3006
5
    return 0;
3007
5
  }
3008
3009
290
  const char *CurPtr = StartPtr + CharSize;
3010
290
  const char *KindLoc = &CurPtr[-1];
3011
3012
290
  uint32_t CodePoint = 0;
3013
1.52k
  for (unsigned i = 0; i < NumHexDigits; 
++i1.23k
) {
3014
1.25k
    char C = getCharAndSize(CurPtr, CharSize);
3015
3016
1.25k
    unsigned Value = llvm::hexDigitValue(C);
3017
1.25k
    if (Value == -1U) {
3018
21
      if (Result && 
!isLexingRawMode()18
) {
3019
18
        if (i == 0) {
3020
6
          Diag(BufferPtr, diag::warn_ucn_escape_no_digits)
3021
6
            << StringRef(KindLoc, 1);
3022
12
        } else {
3023
12
          Diag(BufferPtr, diag::warn_ucn_escape_incomplete);
3024
3025
          // If the user wrote \U1234, suggest a fixit to \u.
3026
12
          if (i == 4 && 
NumHexDigits == 83
) {
3027
3
            CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1);
3028
3
            Diag(KindLoc, diag::note_ucn_four_not_eight)
3029
3
              << FixItHint::CreateReplacement(URange, "u");
3030
3
          }
3031
12
        }
3032
18
      }
3033
3034
21
      return 0;
3035
21
    }
3036
3037
1.23k
    CodePoint <<= 4;
3038
1.23k
    CodePoint += Value;
3039
3040
1.23k
    CurPtr += CharSize;
3041
1.23k
  }
3042
3043
269
  if (Result) {
3044
120
    Result->setFlag(Token::HasUCN);
3045
120
    if (CurPtr - StartPtr == (ptrdiff_t)NumHexDigits + 2)
3046
0
      StartPtr = CurPtr;
3047
120
    else
3048
784
      
while (120
StartPtr != CurPtr)
3049
664
        (void)getAndAdvanceChar(StartPtr, *Result);
3050
149
  } else {
3051
149
    StartPtr = CurPtr;
3052
149
  }
3053
3054
  // Don't apply C family restrictions to UCNs in assembly mode
3055
269
  if (LangOpts.AsmPreprocessor)
3056
6
    return CodePoint;
3057
3058
  // C99 6.4.3p2: A universal character name shall not specify a character whose
3059
  //   short identifier is less than 00A0 other than 0024 ($), 0040 (@), or
3060
  //   0060 (`), nor one in the range D800 through DFFF inclusive.)
3061
  // C++11 [lex.charset]p2: If the hexadecimal value for a
3062
  //   universal-character-name corresponds to a surrogate code point (in the
3063
  //   range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally,
3064
  //   if the hexadecimal value for a universal-character-name outside the
3065
  //   c-char-sequence, s-char-sequence, or r-char-sequence of a character or
3066
  //   string literal corresponds to a control character (in either of the
3067
  //   ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the
3068
  //   basic source character set, the program is ill-formed.
3069
263
  if (CodePoint < 0xA0) {
3070
27
    if (CodePoint == 0x24 || 
CodePoint == 0x4024
||
CodePoint == 0x6024
)
3071
3
      return CodePoint;
3072
3073
    // We don't use isLexingRawMode() here because we need to warn about bad
3074
    // UCNs even when skipping preprocessing tokens in a #if block.
3075
24
    if (Result && 
PP23
) {
3076
23
      if (CodePoint < 0x20 || 
CodePoint >= 0x7F15
)
3077
16
        Diag(BufferPtr, diag::err_ucn_control_character);
3078
7
      else {
3079
7
        char C = static_cast<char>(CodePoint);
3080
7
        Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1);
3081
7
      }
3082
23
    }
3083
3084
24
    return 0;
3085
236
  } else if (CodePoint >= 0xD800 && 
CodePoint <= 0xDFFF30
) {
3086
    // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't.
3087
    // We don't use isLexingRawMode() here because we need to diagnose bad
3088
    // UCNs even when skipping preprocessing tokens in a #if block.
3089
4
    if (Result && PP) {
3090
4
      if (LangOpts.CPlusPlus && 
!LangOpts.CPlusPlus112
)
3091
1
        Diag(BufferPtr, diag::warn_ucn_escape_surrogate);
3092
3
      else
3093
3
        Diag(BufferPtr, diag::err_ucn_escape_invalid);
3094
4
    }
3095
4
    return 0;
3096
4
  }
3097
3098
232
  return CodePoint;
3099
232
}
3100
3101
bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C,
3102
194
                                   const char *CurPtr) {
3103
194
  static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
3104
194
      UnicodeWhitespaceCharRanges);
3105
194
  if (!isLexingRawMode() && 
!PP->isPreprocessedOutput()169
&&
3106
155
      UnicodeWhitespaceChars.contains(C)) {
3107
6
    Diag(BufferPtr, diag::ext_unicode_whitespace)
3108
6
      << makeCharRange(*this, BufferPtr, CurPtr);
3109
3110
6
    Result.setFlag(Token::LeadingSpace);
3111
6
    return true;
3112
6
  }
3113
188
  return false;
3114
188
}
3115
3116
188
bool Lexer::LexUnicode(Token &Result, uint32_t C, const char *CurPtr) {
3117
188
  if (isAllowedIDChar(C, LangOpts) && 
isAllowedInitiallyIDChar(C, LangOpts)125
) {
3118
119
    if (!isLexingRawMode() && 
!ParsingPreprocessorDirective106
&&
3119
91
        !PP->isPreprocessedOutput()) {
3120
82
      maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C,
3121
82
                                makeCharRange(*this, BufferPtr, CurPtr),
3122
82
                                /*IsFirst=*/true);
3123
82
      maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), C,
3124
82
                                 makeCharRange(*this, BufferPtr, CurPtr));
3125
82
    }
3126
3127
119
    MIOpt.ReadToken();
3128
119
    return LexIdentifier(Result, CurPtr);
3129
119
  }
3130
3131
69
  if (!isLexingRawMode() && 
!ParsingPreprocessorDirective57
&&
3132
45
      !PP->isPreprocessedOutput() &&
3133
41
      !isASCII(*BufferPtr) && 
!isAllowedIDChar(C, LangOpts)25
) {
3134
    // Non-ASCII characters tend to creep into source code unintentionally.
3135
    // Instead of letting the parser complain about the unknown token,
3136
    // just drop the character.
3137
    // Note that we can /only/ do this when the non-ASCII character is actually
3138
    // spelled as Unicode, not written as a UCN. The standard requires that
3139
    // we not throw away any possible preprocessor tokens, but there's a
3140
    // loophole in the mapping of Unicode characters to basic character set
3141
    // characters that allows us to map these particular characters to, say,
3142
    // whitespace.
3143
22
    Diag(BufferPtr, diag::err_non_ascii)
3144
22
      << FixItHint::CreateRemoval(makeCharRange(*this, BufferPtr, CurPtr));
3145
3146
22
    BufferPtr = CurPtr;
3147
22
    return false;
3148
22
  }
3149
3150
  // Otherwise, we have an explicit UCN or a character that's unlikely to show
3151
  // up by accident.
3152
47
  MIOpt.ReadToken();
3153
47
  FormTokenWithChars(Result, CurPtr, tok::unknown);
3154
47
  return true;
3155
47
}
3156
3157
27.4M
void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) {
3158
27.4M
  IsAtStartOfLine = Result.isAtStartOfLine();
3159
27.4M
  HasLeadingSpace = Result.hasLeadingSpace();
3160
27.4M
  HasLeadingEmptyMacro = Result.hasLeadingEmptyMacro();
3161
  // Note that this doesn't affect IsAtPhysicalStartOfLine.
3162
27.4M
}
3163
3164
1.96G
bool Lexer::Lex(Token &Result) {
3165
  // Start a new token.
3166
1.96G
  Result.startToken();
3167
3168
  // Set up misc whitespace flags for LexTokenInternal.
3169
1.96G
  if (IsAtStartOfLine) {
3170
148M
    Result.setFlag(Token::StartOfLine);
3171
148M
    IsAtStartOfLine = false;
3172
148M
  }
3173
3174
1.96G
  if (HasLeadingSpace) {
3175
927k
    Result.setFlag(Token::LeadingSpace);
3176
927k
    HasLeadingSpace = false;
3177
927k
  }
3178
3179
1.96G
  if (HasLeadingEmptyMacro) {
3180
1.32M
    Result.setFlag(Token::LeadingEmptyMacro);
3181
1.32M
    HasLeadingEmptyMacro = false;
3182
1.32M
  }
3183
3184
1.96G
  bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
3185
1.96G
  IsAtPhysicalStartOfLine = false;
3186
1.96G
  bool isRawLex = isLexingRawMode();
3187
1.96G
  (void) isRawLex;
3188
1.96G
  bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine);
3189
  // (After the LexTokenInternal call, the lexer might be destroyed.)
3190
1.96G
  assert((returnedToken || !isRawLex) && "Raw lex must succeed");
3191
1.96G
  return returnedToken;
3192
1.96G
}
3193
3194
/// LexTokenInternal - This implements a simple C family lexer.  It is an
3195
/// extremely performance critical piece of code.  This assumes that the buffer
3196
/// has a null character at the end of the file.  This returns a preprocessing
3197
/// token, not a normal token, as such, it is an internal interface.  It assumes
3198
/// that the Flags of result have been cleared before calling this.
3199
1.96G
bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) {
3200
2.24G
LexNextToken:
3201
  // New token, can't need cleaning yet.
3202
2.24G
  Result.clearFlag(Token::NeedsCleaning);
3203
2.24G
  Result.setIdentifierInfo(nullptr);
3204
3205
  // CurPtr - Cache BufferPtr in an automatic variable.
3206
2.24G
  const char *CurPtr = BufferPtr;
3207
3208
  // Small amounts of horizontal whitespace is very common between tokens.
3209
2.24G
  if ((*CurPtr == ' ') || 
(*CurPtr == '\t')1.65G
) {
3210
598M
    ++CurPtr;
3211
1.13G
    while ((*CurPtr == ' ') || 
(*CurPtr == '\t')607M
)
3212
535M
      ++CurPtr;
3213
3214
    // If we are keeping whitespace and other tokens, just return what we just
3215
    // skipped.  The next lexer invocation will return the token after the
3216
    // whitespace.
3217
598M
    if (isKeepWhitespaceMode()) {
3218
243k
      FormTokenWithChars(Result, CurPtr, tok::unknown);
3219
      // FIXME: The next token will not have LeadingSpace set.
3220
243k
      return true;
3221
243k
    }
3222
3223
597M
    BufferPtr = CurPtr;
3224
597M
    Result.setFlag(Token::LeadingSpace);
3225
597M
  }
3226
3227
2.24G
  unsigned SizeTmp, SizeTmp2;   // Temporaries for use in cases below.
3228
3229
  // Read a character, advancing over it.
3230
2.24G
  char Char = getAndAdvanceChar(CurPtr, Result);
3231
2.24G
  tok::TokenKind Kind;
3232
3233
2.24G
  if (!isVerticalWhitespace(Char))
3234
1.93G
    NewLinePtr = nullptr;
3235
3236
2.24G
  switch (Char) {
3237
1.80M
  case 0:  // Null.
3238
    // Found end of file?
3239
1.80M
    if (CurPtr-1 == BufferEnd)
3240
1.80M
      return LexEndOfFile(Result, CurPtr-1);
3241
3242
    // Check if we are performing code completion.
3243
1.09k
    if (isCodeCompletionPoint(CurPtr-1)) {
3244
      // Return the code-completion token.
3245
1.08k
      Result.startToken();
3246
1.08k
      FormTokenWithChars(Result, CurPtr, tok::code_completion);
3247
1.08k
      return true;
3248
1.08k
    }
3249
3250
3
    if (!isLexingRawMode())
3251
2
      Diag(CurPtr-1, diag::null_in_file);
3252
3
    Result.setFlag(Token::LeadingSpace);
3253
3
    if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3254
0
      return true; // KeepWhitespaceMode
3255
3256
    // We know the lexer hasn't changed, so just try again with this lexer.
3257
    // (We manually eliminate the tail call to avoid recursion.)
3258
3
    goto LexNextToken;
3259
3260
1
  case 26:  // DOS & CP/M EOF: "^Z".
3261
    // If we're in Microsoft extensions mode, treat this as end of file.
3262
1
    if (LangOpts.MicrosoftExt) {
3263
1
      if (!isLexingRawMode())
3264
1
        Diag(CurPtr-1, diag::ext_ctrl_z_eof_microsoft);
3265
1
      return LexEndOfFile(Result, CurPtr-1);
3266
1
    }
3267
3268
    // If Microsoft extensions are disabled, this is just random garbage.
3269
0
    Kind = tok::unknown;
3270
0
    break;
3271
3272
2.20k
  case '\r':
3273
2.20k
    if (CurPtr[0] == '\n')
3274
2.19k
      (void)getAndAdvanceChar(CurPtr, Result);
3275
2.20k
    LLVM_FALLTHROUGH;
3276
315M
  case '\n':
3277
    // If we are inside a preprocessor directive and we see the end of line,
3278
    // we know we are done with the directive, so return an EOD token.
3279
315M
    if (ParsingPreprocessorDirective) {
3280
      // Done parsing the "line".
3281
105M
      ParsingPreprocessorDirective = false;
3282
3283
      // Restore comment saving mode, in case it was disabled for directive.
3284
105M
      if (PP)
3285
105M
        resetExtendedTokenMode();
3286
3287
      // Since we consumed a newline, we are back at the start of a line.
3288
105M
      IsAtStartOfLine = true;
3289
105M
      IsAtPhysicalStartOfLine = true;
3290
105M
      NewLinePtr = CurPtr - 1;
3291
3292
105M
      Kind = tok::eod;
3293
105M
      break;
3294
105M
    }
3295
3296
    // No leading whitespace seen so far.
3297
210M
    Result.clearFlag(Token::LeadingSpace);
3298
3299
210M
    if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3300
57.7k
      return true; // KeepWhitespaceMode
3301
3302
    // We only saw whitespace, so just try again with this lexer.
3303
    // (We manually eliminate the tail call to avoid recursion.)
3304
210M
    goto LexNextToken;
3305
10.3M
  case ' ':
3306
10.3M
  case '\t':
3307
10.3M
  case '\f':
3308
10.3M
  case '\v':
3309
13.1M
  SkipHorizontalWhitespace:
3310
13.1M
    Result.setFlag(Token::LeadingSpace);
3311
13.1M
    if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3312
715
      return true; // KeepWhitespaceMode
3313
3314
41.3M
  SkipIgnoredUnits:
3315
41.3M
    CurPtr = BufferPtr;
3316
3317
    // If the next token is obviously a // or /* */ comment, skip it efficiently
3318
    // too (without going through the big switch stmt).
3319
41.3M
    if (CurPtr[0] == '/' && 
CurPtr[1] == '/'18.4M
&&
!inKeepCommentMode()18.4M
&&
3320
18.4M
        LangOpts.LineComment &&
3321
18.4M
        (LangOpts.CPlusPlus || 
!LangOpts.TraditionalCPP10.2M
)) {
3322
18.4M
      if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
3323
0
        return true; // There is a token to return.
3324
18.4M
      goto SkipIgnoredUnits;
3325
22.9M
    } else if (CurPtr[0] == '/' && 
CurPtr[1] == '*'13.0k
&&
!inKeepCommentMode()10.3k
) {
3326
10.3k
      if (SkipBlockComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
3327
0
        return true; // There is a token to return.
3328
10.3k
      goto SkipIgnoredUnits;
3329
22.9M
    } else if (isHorizontalWhitespace(*CurPtr)) {
3330
2.77M
      goto SkipHorizontalWhitespace;
3331
2.77M
    }
3332
    // We only saw whitespace, so just try again with this lexer.
3333
    // (We manually eliminate the tail call to avoid recursion.)
3334
20.1M
    goto LexNextToken;
3335
3336
  // C99 6.4.4.1: Integer Constants.
3337
  // C99 6.4.4.2: Floating Constants.
3338
83.3M
  case '0': case '1': case '2': case '3': case '4':
3339
83.3M
  case '5': case '6': case '7': case '8': case '9':
3340
    // Notify MIOpt that we read a non-whitespace/non-comment token.
3341
83.3M
    MIOpt.ReadToken();
3342
83.3M
    return LexNumericConstant(Result, CurPtr);
3343
3344
14.5M
  case 'u':   // Identifier (uber) or C11/C++11 UTF-8 or UTF-16 string literal
3345
    // Notify MIOpt that we read a non-whitespace/non-comment token.
3346
14.5M
    MIOpt.ReadToken();
3347
3348
14.5M
    if (LangOpts.CPlusPlus11 || 
LangOpts.C1110.2M
) {
3349
13.8M
      Char = getCharAndSize(CurPtr, SizeTmp);
3350
3351
      // UTF-16 string literal
3352
13.8M
      if (Char == '"')
3353
194
        return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3354
194
                                tok::utf16_string_literal);
3355
3356
      // UTF-16 character constant
3357
13.8M
      if (Char == '\'')
3358
132
        return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3359
132
                               tok::utf16_char_constant);
3360
3361
      // UTF-16 raw string literal
3362
13.8M
      if (Char == 'R' && 
LangOpts.CPlusPlus1130
&&
3363
22
          getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
3364
20
        return LexRawStringLiteral(Result,
3365
20
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3366
20
                                           SizeTmp2, Result),
3367
20
                               tok::utf16_string_literal);
3368
3369
13.8M
      if (Char == '8') {
3370
5.34k
        char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2);
3371
3372
        // UTF-8 string literal
3373
5.34k
        if (Char2 == '"')
3374
323
          return LexStringLiteral(Result,
3375
323
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3376
323
                                           SizeTmp2, Result),
3377
323
                               tok::utf8_string_literal);
3378
5.02k
        if (Char2 == '\'' && 
LangOpts.CPlusPlus17177
)
3379
165
          return LexCharConstant(
3380
165
              Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3381
165
                                  SizeTmp2, Result),
3382
165
              tok::utf8_char_constant);
3383
3384
4.86k
        if (Char2 == 'R' && 
LangOpts.CPlusPlus1134
) {
3385
26
          unsigned SizeTmp3;
3386
26
          char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
3387
          // UTF-8 raw string literal
3388
26
          if (Char3 == '"') {
3389
24
            return LexRawStringLiteral(Result,
3390
24
                   ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3391
24
                                           SizeTmp2, Result),
3392
24
                               SizeTmp3, Result),
3393
24
                   tok::utf8_string_literal);
3394
24
          }
3395
14.5M
        }
3396
4.86k
      }
3397
13.8M
    }
3398
3399
    // treat u like the start of an identifier.
3400
14.5M
    return LexIdentifier(Result, CurPtr);
3401
3402
3.10M
  case 'U':   // Identifier (Uber) or C11/C++11 UTF-32 string literal
3403
    // Notify MIOpt that we read a non-whitespace/non-comment token.
3404
3.10M
    MIOpt.ReadToken();
3405
3406
3.10M
    if (LangOpts.CPlusPlus11 || 
LangOpts.C112.33M
) {
3407
3.04M
      Char = getCharAndSize(CurPtr, SizeTmp);
3408
3409
      // UTF-32 string literal
3410
3.04M
      if (Char == '"')
3411
193
        return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3412
193
                                tok::utf32_string_literal);
3413
3414
      // UTF-32 character constant
3415
3.04M
      if (Char == '\'')
3416
123
        return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3417
123
                               tok::utf32_char_constant);
3418
3419
      // UTF-32 raw string literal
3420
3.04M
      if (Char == 'R' && 
LangOpts.CPlusPlus1179.6k
&&
3421
7.87k
          getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
3422
22
        return LexRawStringLiteral(Result,
3423
22
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3424
22
                                           SizeTmp2, Result),
3425
22
                               tok::utf32_string_literal);
3426
3.10M
    }
3427
3428
    // treat U like the start of an identifier.
3429
3.10M
    return LexIdentifier(Result, CurPtr);
3430
3431
1.09M
  case 'R': // Identifier or C++0x raw string literal
3432
    // Notify MIOpt that we read a non-whitespace/non-comment token.
3433
1.09M
    MIOpt.ReadToken();
3434
3435
1.09M
    if (LangOpts.CPlusPlus11) {
3436
227k
      Char = getCharAndSize(CurPtr, SizeTmp);
3437
3438
227k
      if (Char == '"')
3439
397
        return LexRawStringLiteral(Result,
3440
397
                                   ConsumeChar(CurPtr, SizeTmp, Result),
3441
397
                                   tok::string_literal);
3442
1.09M
    }
3443
3444
    // treat R like the start of an identifier.
3445
1.09M
    return LexIdentifier(Result, CurPtr);
3446
3447
972k
  case 'L':   // Identifier (Loony) or wide literal (L'x' or L"xyz").
3448
    // Notify MIOpt that we read a non-whitespace/non-comment token.
3449
972k
    MIOpt.ReadToken();
3450
972k
    Char = getCharAndSize(CurPtr, SizeTmp);
3451
3452
    // Wide string literal.
3453
972k
    if (Char == '"')
3454
1.70k
      return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3455
1.70k
                              tok::wide_string_literal);
3456
3457
    // Wide raw string literal.
3458
970k
    if (LangOpts.CPlusPlus11 && 
Char == 'R'370k
&&
3459
121
        getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
3460
18
      return LexRawStringLiteral(Result,
3461
18
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3462
18
                                           SizeTmp2, Result),
3463
18
                               tok::wide_string_literal);
3464
3465
    // Wide character constant.
3466
970k
    if (Char == '\'')
3467
1.14k
      return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3468
1.14k
                             tok::wide_char_constant);
3469
    // FALL THROUGH, treating L like the start of an identifier.
3470
969k
    LLVM_FALLTHROUGH;
3471
3472
  // C99 6.4.2: Identifiers.
3473
813M
  case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
3474
813M
  case 'H': case 'I': case 'J': case 'K':    /*'L'*/case 'M': case 'N':
3475
813M
  case 'O': case 'P': case 'Q':    /*'R'*/case 'S': case 'T':    /*'U'*/
3476
813M
  case 'V': case 'W': case 'X': case 'Y': case 'Z':
3477
813M
  case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
3478
813M
  case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
3479
813M
  case 'o': case 'p': case 'q': case 'r': case 's': case 't':    /*'u'*/
3480
813M
  case 'v': case 'w': case 'x': case 'y': case 'z':
3481
813M
  case '_':
3482
    // Notify MIOpt that we read a non-whitespace/non-comment token.
3483
813M
    MIOpt.ReadToken();
3484
813M
    return LexIdentifier(Result, CurPtr);
3485
3486
33.3k
  case '$':   // $ in identifiers.
3487
33.3k
    if (LangOpts.DollarIdents) {
3488
33.3k
      if (!isLexingRawMode())
3489
31.7k
        Diag(CurPtr-1, diag::ext_dollar_in_identifier);
3490
      // Notify MIOpt that we read a non-whitespace/non-comment token.
3491
33.3k
      MIOpt.ReadToken();
3492
33.3k
      return LexIdentifier(Result, CurPtr);
3493
33.3k
    }
3494
3495
13
    Kind = tok::unknown;
3496
13
    break;
3497
3498
  // C99 6.4.4: Character Constants.
3499
1.47M
  case '\'':
3500
    // Notify MIOpt that we read a non-whitespace/non-comment token.
3501
1.47M
    MIOpt.ReadToken();
3502
1.47M
    return LexCharConstant(Result, CurPtr, tok::char_constant);
3503
3504
  // C99 6.4.5: String Literals.
3505
14.0M
  case '"':
3506
    // Notify MIOpt that we read a non-whitespace/non-comment token.
3507
14.0M
    MIOpt.ReadToken();
3508
14.0M
    return LexStringLiteral(Result, CurPtr,
3509
85.6k
                            ParsingFilename ? tok::header_name
3510
13.9M
                                            : tok::string_literal);
3511
3512
  // C99 6.4.6: Punctuators.
3513
299k
  case '?':
3514
299k
    Kind = tok::question;
3515
299k
    break;
3516
3.36M
  case '[':
3517
3.36M
    Kind = tok::l_square;
3518
3.36M
    break;
3519
3.36M
  case ']':
3520
3.36M
    Kind = tok::r_square;
3521
3.36M
    break;
3522
205M
  case '(':
3523
205M
    Kind = tok::l_paren;
3524
205M
    break;
3525
231M
  case ')':
3526
231M
    Kind = tok::r_paren;
3527
231M
    break;
3528
14.7M
  case '{':
3529
14.7M
    Kind = tok::l_brace;
3530
14.7M
    break;
3531
14.7M
  case '}':
3532
14.7M
    Kind = tok::r_brace;
3533
14.7M
    break;
3534
16.4M
  case '.':
3535
16.4M
    Char = getCharAndSize(CurPtr, SizeTmp);
3536
16.4M
    if (Char >= '0' && 
Char <= '9'4.50M
) {
3537
      // Notify MIOpt that we read a non-whitespace/non-comment token.
3538
1.48k
      MIOpt.ReadToken();
3539
3540
1.48k
      return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
3541
16.4M
    } else if (LangOpts.CPlusPlus && 
Char == '*'4.98M
) {
3542
19.5k
      Kind = tok::periodstar;
3543
19.5k
      CurPtr += SizeTmp;
3544
16.4M
    } else if (Char == '.' &&
3545
11.6M
               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') {
3546
11.6M
      Kind = tok::ellipsis;
3547
11.6M
      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3548
11.6M
                           SizeTmp2, Result);
3549
4.79M
    } else {
3550
4.79M
      Kind = tok::period;
3551
4.79M
    }
3552
16.4M
    break;
3553
8.95M
  case '&':
3554
8.95M
    Char = getCharAndSize(CurPtr, SizeTmp);
3555
8.95M
    if (Char == '&') {
3556
5.38M
      Kind = tok::ampamp;
3557
5.38M
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3558
3.57M
    } else if (Char == '=') {
3559
31.6k
      Kind = tok::ampequal;
3560
31.6k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3561
3.54M
    } else {
3562
3.54M
      Kind = tok::amp;
3563
3.54M
    }
3564
8.95M
    break;
3565
16.6M
  case '*':
3566
16.6M
    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
3567
13.8k
      Kind = tok::starequal;
3568
13.8k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3569
16.6M
    } else {
3570
16.6M
      Kind = tok::star;
3571
16.6M
    }
3572
16.6M
    break;
3573
2.67M
  case '+':
3574
2.67M
    Char = getCharAndSize(CurPtr, SizeTmp);
3575
2.67M
    if (Char == '+') {
3576
687k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3577
687k
      Kind = tok::plusplus;
3578
1.99M
    } else if (Char == '=') {
3579
158k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3580
158k
      Kind = tok::plusequal;
3581
1.83M
    } else {
3582
1.83M
      Kind = tok::plus;
3583
1.83M
    }
3584
2.67M
    break;
3585
6.73M
  case '-':
3586
6.73M
    Char = getCharAndSize(CurPtr, SizeTmp);
3587
6.73M
    if (Char == '-') {      // --
3588
196k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3589
196k
      Kind = tok::minusminus;
3590
6.53M
    } else if (Char == '>' && 
LangOpts.CPlusPlus1.08M
&&
3591
923k
               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') {  // C++ ->*
3592
3.82k
      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3593
3.82k
                           SizeTmp2, Result);
3594
3.82k
      Kind = tok::arrowstar;
3595
6.53M
    } else if (Char == '>') {   // ->
3596
1.08M
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3597
1.08M
      Kind = tok::arrow;
3598
5.45M
    } else if (Char == '=') {   // -=
3599
90.9k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3600
90.9k
      Kind = tok::minusequal;
3601
5.36M
    } else {
3602
5.36M
      Kind = tok::minus;
3603
5.36M
    }
3604
6.73M
    break;
3605
303k
  case '~':
3606
303k
    Kind = tok::tilde;
3607
303k
    break;
3608
6.12M
  case '!':
3609
6.12M
    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
3610
480k
      Kind = tok::exclaimequal;
3611
480k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3612
5.64M
    } else {
3613
5.64M
      Kind = tok::exclaim;
3614
5.64M
    }
3615
6.12M
    break;
3616
57.1M
  case '/':
3617
    // 6.4.9: Comments
3618
57.1M
    Char = getCharAndSize(CurPtr, SizeTmp);
3619
57.1M
    if (Char == '/') {         // Line comment.
3620
      // Even if Line comments are disabled (e.g. in C89 mode), we generally
3621
      // want to lex this as a comment.  There is one problem with this though,
3622
      // that in one particular corner case, this can change the behavior of the
3623
      // resultant program.  For example, In  "foo //**/ bar", C89 would lex
3624
      // this as "foo / bar" and languages with Line comments would lex it as
3625
      // "foo".  Check to see if the character after the second slash is a '*'.
3626
      // If so, we will lex that as a "/" instead of the start of a comment.
3627
      // However, we never do this if we are just preprocessing.
3628
9.83M
      bool TreatAsComment = LangOpts.LineComment &&
3629
9.83M
                            (LangOpts.CPlusPlus || 
!LangOpts.TraditionalCPP2.61M
);
3630
9.83M
      if (!TreatAsComment)
3631
5.80k
        if (!(PP && 
PP->isPreprocessedOutput()5.52k
))
3632
5.71k
          TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*';
3633
3634
9.83M
      if (TreatAsComment) {
3635
9.83M
        if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3636
9.83M
                            TokAtPhysicalStartOfLine))
3637
40.4k
          return true; // There is a token to return.
3638
3639
        // It is common for the tokens immediately after a // comment to be
3640
        // whitespace (indentation for the next line).  Instead of going through
3641
        // the big switch, handle it efficiently now.
3642
9.79M
        goto SkipIgnoredUnits;
3643
9.79M
      }
3644
9.83M
    }
3645
3646
47.2M
    if (Char == '*') {  // /**/ comment.
3647
46.3M
      if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3648
46.3M
                           TokAtPhysicalStartOfLine))
3649
2.36k
        return true; // There is a token to return.
3650
3651
      // We only saw whitespace, so just try again with this lexer.
3652
      // (We manually eliminate the tail call to avoid recursion.)
3653
46.3M
      goto LexNextToken;
3654
46.3M
    }
3655
3656
952k
    if (Char == '=') {
3657
8.34k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3658
8.34k
      Kind = tok::slashequal;
3659
943k
    } else {
3660
943k
      Kind = tok::slash;
3661
943k
    }
3662
952k
    break;
3663
60.9k
  case '%':
3664
60.9k
    Char = getCharAndSize(CurPtr, SizeTmp);
3665
60.9k
    if (Char == '=') {
3666
4.28k
      Kind = tok::percentequal;
3667
4.28k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3668
56.6k
    } else if (LangOpts.Digraphs && 
Char == '>'52.6k
) {
3669
10
      Kind = tok::r_brace;                             // '%>' -> '}'
3670
10
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3671
56.6k
    } else if (LangOpts.Digraphs && 
Char == ':'52.6k
) {
3672
15
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3673
15
      Char = getCharAndSize(CurPtr, SizeTmp);
3674
15
      if (Char == '%' && 
getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':'0
) {
3675
0
        Kind = tok::hashhash;                          // '%:%:' -> '##'
3676
0
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3677
0
                             SizeTmp2, Result);
3678
15
      } else if (Char == '@' && 
LangOpts.MicrosoftExt0
) {// %:@ -> #@ -> Charize
3679
0
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3680
0
        if (!isLexingRawMode())
3681
0
          Diag(BufferPtr, diag::ext_charize_microsoft);
3682
0
        Kind = tok::hashat;
3683
15
      } else {                                         // '%:' -> '#'
3684
        // We parsed a # character.  If this occurs at the start of the line,
3685
        // it's actually the start of a preprocessing directive.  Callback to
3686
        // the preprocessor to handle it.
3687
        // TODO: -fpreprocessed mode??
3688
15
        if (TokAtPhysicalStartOfLine && !LexingRawMode && 
!Is_PragmaLexer12
)
3689
12
          goto HandleDirective;
3690
3691
3
        Kind = tok::hash;
3692
3
      }
3693
56.6k
    } else {
3694
56.6k
      Kind = tok::percent;
3695
56.6k
    }
3696
60.9k
    break;
3697
17.7M
  case '<':
3698
17.7M
    Char = getCharAndSize(CurPtr, SizeTmp);
3699
17.7M
    if (ParsingFilename) {
3700
2.27M
      return LexAngledStringLiteral(Result, CurPtr);
3701
15.4M
    } else if (Char == '<') {
3702
962k
      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
3703
962k
      if (After == '=') {
3704
4.56k
        Kind = tok::lesslessequal;
3705
4.56k
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3706
4.56k
                             SizeTmp2, Result);
3707
957k
      } else if (After == '<' && 
IsStartOfConflictMarker(CurPtr-1)216
) {
3708
        // If this is actually a '<<<<<<<' version control conflict marker,
3709
        // recognize it as such and recover nicely.
3710
2
        goto LexNextToken;
3711
957k
      } else if (After == '<' && 
HandleEndOfConflictMarker(CurPtr-1)214
) {
3712
        // If this is '<<<<' and we're in a Perforce-style conflict marker,
3713
        // ignore it.
3714
0
        goto LexNextToken;
3715
957k
      } else if (LangOpts.CUDA && 
After == '<'114
) {
3716
114
        Kind = tok::lesslessless;
3717
114
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3718
114
                             SizeTmp2, Result);
3719
957k
      } else {
3720
957k
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3721
957k
        Kind = tok::lessless;
3722
957k
      }
3723
14.5M
    } else if (Char == '=') {
3724
396k
      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
3725
396k
      if (After == '>') {
3726
1.14k
        if (getLangOpts().CPlusPlus20) {
3727
888
          if (!isLexingRawMode())
3728
826
            Diag(BufferPtr, diag::warn_cxx17_compat_spaceship);
3729
888
          CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3730
888
                               SizeTmp2, Result);
3731
888
          Kind = tok::spaceship;
3732
888
          break;
3733
888
        }
3734
        // Suggest adding a space between the '<=' and the '>' to avoid a
3735
        // change in semantics if this turns up in C++ <=17 mode.
3736
260
        if (getLangOpts().CPlusPlus && 
!isLexingRawMode()257
) {
3737
12
          Diag(BufferPtr, diag::warn_cxx20_compat_spaceship)
3738
12
            << FixItHint::CreateInsertion(
3739
12
                   getSourceLocation(CurPtr + SizeTmp, SizeTmp2), " ");
3740
12
        }
3741
260
      }
3742
395k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3743
395k
      Kind = tok::lessequal;
3744
14.1M
    } else if (LangOpts.Digraphs && 
Char == ':'13.9M
) { // '<:' -> '['
3745
82
      if (LangOpts.CPlusPlus11 &&
3746
51
          getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') {
3747
        // C++0x [lex.pptoken]p3:
3748
        //  Otherwise, if the next three characters are <:: and the subsequent
3749
        //  character is neither : nor >, the < is treated as a preprocessor
3750
        //  token by itself and not as the first character of the alternative
3751
        //  token <:.
3752
36
        unsigned SizeTmp3;
3753
36
        char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
3754
36
        if (After != ':' && 
After != '>'35
) {
3755
34
          Kind = tok::less;
3756
34
          if (!isLexingRawMode())
3757
31
            Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon);
3758
34
          break;
3759
34
        }
3760
48
      }
3761
3762
48
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3763
48
      Kind = tok::l_square;
3764
14.1M
    } else if (LangOpts.Digraphs && 
Char == '%'13.9M
) { // '<%' -> '{'
3765
9
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3766
9
      Kind = tok::l_brace;
3767
14.1M
    } else if (Char == '#' && /*Not a trigraph*/ 
SizeTmp == 147
&&
3768
45
               lexEditorPlaceholder(Result, CurPtr)) {
3769
43
      return true;
3770
14.1M
    } else {
3771
14.1M
      Kind = tok::less;
3772
14.1M
    }
3773
15.4M
    break;
3774
15.5M
  case '>':
3775
15.5M
    Char = getCharAndSize(CurPtr, SizeTmp);
3776
15.5M
    if (Char == '=') {
3777
1.87M
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3778
1.87M
      Kind = tok::greaterequal;
3779
13.7M
    } else if (Char == '>') {
3780
256k
      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
3781
256k
      if (After == '=') {
3782
4.01k
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3783
4.01k
                             SizeTmp2, Result);
3784
4.01k
        Kind = tok::greatergreaterequal;
3785
252k
      } else if (After == '>' && 
IsStartOfConflictMarker(CurPtr-1)8.61k
) {
3786
        // If this is actually a '>>>>' conflict marker, recognize it as such
3787
        // and recover nicely.
3788
2
        goto LexNextToken;
3789
252k
      } else if (After == '>' && 
HandleEndOfConflictMarker(CurPtr-1)8.61k
) {
3790
        // If this is '>>>>>>>' and we're in a conflict marker, ignore it.
3791
0
        goto LexNextToken;
3792
252k
      } else if (LangOpts.CUDA && 
After == '>'128
) {
3793
126
        Kind = tok::greatergreatergreater;
3794
126
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3795
126
                             SizeTmp2, Result);
3796
252k
      } else {
3797
252k
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3798
252k
        Kind = tok::greatergreater;
3799
252k
      }
3800
13.4M
    } else {
3801
13.4M
      Kind = tok::greater;
3802
13.4M
    }
3803
15.5M
    break;
3804
230k
  case '^':
3805
230k
    Char = getCharAndSize(CurPtr, SizeTmp);
3806
230k
    if (Char == '=') {
3807
17.7k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3808
17.7k
      Kind = tok::caretequal;
3809
213k
    } else if (LangOpts.OpenCL && 
Char == '^'335
) {
3810
1
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3811
1
      Kind = tok::caretcaret;
3812
213k
    } else {
3813
213k
      Kind = tok::caret;
3814
213k
    }
3815
230k
    break;
3816
4.19M
  case '|':
3817
4.19M
    Char = getCharAndSize(CurPtr, SizeTmp);
3818
4.19M
    if (Char == '=') {
3819
53.4k
      Kind = tok::pipeequal;
3820
53.4k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3821
4.13M
    } else if (Char == '|') {
3822
      // If this is '|||||||' and we're in a conflict marker, ignore it.
3823
3.65M
      if (CurPtr[1] == '|' && 
HandleEndOfConflictMarker(CurPtr-1)19
)
3824
1
        goto LexNextToken;
3825
3.65M
      Kind = tok::pipepipe;
3826
3.65M
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3827
485k
    } else {
3828
485k
      Kind = tok::pipe;
3829
485k
    }
3830
4.19M
    break;
3831
11.1M
  case ':':
3832
11.1M
    Char = getCharAndSize(CurPtr, SizeTmp);
3833
11.1M
    if (LangOpts.Digraphs && 
Char == '>'10.9M
) {
3834
21
      Kind = tok::r_square; // ':>' -> ']'
3835
21
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3836
11.1M
    } else if ((LangOpts.CPlusPlus ||
3837
2.70M
                LangOpts.DoubleSquareBracketAttributes) &&
3838
8.46M
               Char == ':') {
3839
6.21M
      Kind = tok::coloncolon;
3840
6.21M
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3841
4.95M
    } else {
3842
4.95M
      Kind = tok::colon;
3843
4.95M
    }
3844
11.1M
    break;
3845
58.2M
  case ';':
3846
58.2M
    Kind = tok::semi;
3847
58.2M
    break;
3848
35.1M
  case '=':
3849
35.1M
    Char = getCharAndSize(CurPtr, SizeTmp);
3850
35.1M
    if (Char == '=') {
3851
      // If this is '====' and we're in a conflict marker, ignore it.
3852
1.25M
      if (CurPtr[1] == '=' && 
HandleEndOfConflictMarker(CurPtr-1)38
)
3853
2
        goto LexNextToken;
3854
3855
1.25M
      Kind = tok::equalequal;
3856
1.25M
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3857
33.8M
    } else {
3858
33.8M
      Kind = tok::equal;
3859
33.8M
    }
3860
35.1M
    break;
3861
115M
  case ',':
3862
115M
    Kind = tok::comma;
3863
115M
    break;
3864
139M
  case '#':
3865
139M
    Char = getCharAndSize(CurPtr, SizeTmp);
3866
139M
    if (Char == '#') {
3867
388k
      Kind = tok::hashhash;
3868
388k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3869
138M
    } else if (Char == '@' && 
LangOpts.MicrosoftExt3
) { // #@ -> Charize
3870
3
      Kind = tok::hashat;
3871
3
      if (!isLexingRawMode())
3872
3
        Diag(BufferPtr, diag::ext_charize_microsoft);
3873
3
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3874
138M
    } else {
3875
      // We parsed a # character.  If this occurs at the start of the line,
3876
      // it's actually the start of a preprocessing directive.  Callback to
3877
      // the preprocessor to handle it.
3878
      // TODO: -fpreprocessed mode??
3879
138M
      if (TokAtPhysicalStartOfLine && 
!LexingRawMode138M
&&
!Is_PragmaLexer66.1M
)
3880
66.1M
        goto HandleDirective;
3881
3882
72.6M
      Kind = tok::hash;
3883
72.6M
    }
3884
73.0M
    break;
3885
3886
1.81M
  case '@':
3887
    // Objective C support.
3888
1.81M
    if (CurPtr[-1] == '@' && LangOpts.ObjC)
3889
1.81M
      Kind = tok::at;
3890
1.47k
    else
3891
1.47k
      Kind = tok::unknown;
3892
1.81M
    break;
3893
3894
  // UCNs (C99 6.4.3, C++11 [lex.charset]p2)
3895
1.16k
  case '\\':
3896
1.16k
    if (!LangOpts.AsmPreprocessor) {
3897
1.15k
      if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) {
3898
93
        if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
3899
0
          if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3900
0
            return true; // KeepWhitespaceMode
3901
3902
          // We only saw whitespace, so just try again with this lexer.
3903
          // (We manually eliminate the tail call to avoid recursion.)
3904
0
          goto LexNextToken;
3905
0
        }
3906
3907
93
        return LexUnicode(Result, CodePoint, CurPtr);
3908
93
      }
3909
1.15k
    }
3910
3911
1.07k
    Kind = tok::unknown;
3912
1.07k
    break;
3913
3914
332
  default: {
3915
332
    if (isASCII(Char)) {
3916
165
      Kind = tok::unknown;
3917
165
      break;
3918
165
    }
3919
3920
167
    llvm::UTF32 CodePoint;
3921
3922
    // We can't just reset CurPtr to BufferPtr because BufferPtr may point to
3923
    // an escaped newline.
3924
167
    --CurPtr;
3925
167
    llvm::ConversionResult Status =
3926
167
        llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr,
3927
167
                                  (const llvm::UTF8 *)BufferEnd,
3928
167
                                  &CodePoint,
3929
167
                                  llvm::strictConversion);
3930
167
    if (Status == llvm::conversionOK) {
3931
101
      if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
3932
6
        if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3933
0
          return true; // KeepWhitespaceMode
3934
3935
        // We only saw whitespace, so just try again with this lexer.
3936
        // (We manually eliminate the tail call to avoid recursion.)
3937
6
        goto LexNextToken;
3938
6
      }
3939
95
      return LexUnicode(Result, CodePoint, CurPtr);
3940
95
    }
3941
3942
66
    if (isLexingRawMode() || 
ParsingPreprocessorDirective4
||
3943
65
        
PP->isPreprocessedOutput()2
) {
3944
65
      ++CurPtr;
3945
65
      Kind = tok::unknown;
3946
65
      break;
3947
65
    }
3948
3949
    // Non-ASCII characters tend to creep into source code unintentionally.
3950
    // Instead of letting the parser complain about the unknown token,
3951
    // just diagnose the invalid UTF-8, then drop the character.
3952
1
    Diag(CurPtr, diag::err_invalid_utf8);
3953
3954
1
    BufferPtr = CurPtr+1;
3955
    // We're pretending the character didn't exist, so just try again with
3956
    // this lexer.
3957
    // (We manually eliminate the tail call to avoid recursion.)
3958
1
    goto LexNextToken;
3959
1
  }
3960
967M
  }
3961
3962
  // Notify MIOpt that we read a non-whitespace/non-comment token.
3963
967M
  MIOpt.ReadToken();
3964
3965
  // Update the location of token as well as BufferPtr.
3966
967M
  FormTokenWithChars(Result, CurPtr, Kind);
3967
967M
  return true;
3968
3969
66.1M
HandleDirective:
3970
  // We parsed a # character and it's the start of a preprocessing directive.
3971
3972
66.1M
  FormTokenWithChars(Result, CurPtr, tok::hash);
3973
66.1M
  PP->HandleDirective(Result);
3974
3975
66.1M
  if (PP->hadModuleLoaderFatalFailure()) {
3976
    // With a fatal failure in the module loader, we abort parsing.
3977
2
    assert(Result.is(tok::eof) && "Preprocessor did not set tok:eof");
3978
2
    return true;
3979
2
  }
3980
3981
  // We parsed the directive; lex a token with the new state.
3982
66.1M
  return false;
3983
66.1M
}