Coverage Report

Created: 2020-02-18 08:44

/Users/buildslave/jenkins/workspace/coverage/llvm-project/clang/lib/Lex/Lexer.cpp
Line
Count
Source (jump to first uncovered line)
1
//===- Lexer.cpp - C Language Family Lexer --------------------------------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
//  This file implements the Lexer and Token interfaces.
10
//
11
//===----------------------------------------------------------------------===//
12
13
#include "clang/Lex/Lexer.h"
14
#include "UnicodeCharSets.h"
15
#include "clang/Basic/CharInfo.h"
16
#include "clang/Basic/IdentifierTable.h"
17
#include "clang/Basic/LangOptions.h"
18
#include "clang/Basic/SourceLocation.h"
19
#include "clang/Basic/SourceManager.h"
20
#include "clang/Basic/TokenKinds.h"
21
#include "clang/Lex/LexDiagnostic.h"
22
#include "clang/Lex/LiteralSupport.h"
23
#include "clang/Lex/MultipleIncludeOpt.h"
24
#include "clang/Lex/Preprocessor.h"
25
#include "clang/Lex/PreprocessorOptions.h"
26
#include "clang/Lex/Token.h"
27
#include "clang/Basic/Diagnostic.h"
28
#include "clang/Basic/LLVM.h"
29
#include "clang/Basic/TokenKinds.h"
30
#include "llvm/ADT/None.h"
31
#include "llvm/ADT/Optional.h"
32
#include "llvm/ADT/StringExtras.h"
33
#include "llvm/ADT/StringSwitch.h"
34
#include "llvm/ADT/StringRef.h"
35
#include "llvm/Support/Compiler.h"
36
#include "llvm/Support/ConvertUTF.h"
37
#include "llvm/Support/MathExtras.h"
38
#include "llvm/Support/MemoryBuffer.h"
39
#include "llvm/Support/NativeFormatting.h"
40
#include "llvm/Support/UnicodeCharRanges.h"
41
#include <algorithm>
42
#include <cassert>
43
#include <cstddef>
44
#include <cstdint>
45
#include <cstring>
46
#include <string>
47
#include <tuple>
48
#include <utility>
49
50
using namespace clang;
51
52
//===----------------------------------------------------------------------===//
53
// Token Class Implementation
54
//===----------------------------------------------------------------------===//
55
56
/// isObjCAtKeyword - Return true if we have an ObjC keyword identifier.
57
502k
bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const {
58
502k
  if (isAnnotation())
59
2
    return false;
60
502k
  if (IdentifierInfo *II = getIdentifierInfo())
61
493k
    return II->getObjCKeywordID() == objcKey;
62
9.10k
  return false;
63
9.10k
}
64
65
/// getObjCKeywordID - Return the ObjC keyword kind.
66
1.08M
tok::ObjCKeywordKind Token::getObjCKeywordID() const {
67
1.08M
  if (isAnnotation())
68
1
    return tok::objc_not_keyword;
69
1.08M
  IdentifierInfo *specId = getIdentifierInfo();
70
1.08M
  return specId ? 
specId->getObjCKeywordID()987k
:
tok::objc_not_keyword96.4k
;
71
1.08M
}
72
73
//===----------------------------------------------------------------------===//
74
// Lexer Class Implementation
75
//===----------------------------------------------------------------------===//
76
77
0
void Lexer::anchor() {}
78
79
void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
80
31.6M
                      const char *BufEnd) {
81
31.6M
  BufferStart = BufStart;
82
31.6M
  BufferPtr = BufPtr;
83
31.6M
  BufferEnd = BufEnd;
84
31.6M
85
31.6M
  assert(BufEnd[0] == 0 &&
86
31.6M
         "We assume that the input buffer has a null character at the end"
87
31.6M
         " to simplify lexing!");
88
31.6M
89
31.6M
  // Check whether we have a BOM in the beginning of the buffer. If yes - act
90
31.6M
  // accordingly. Right now we support only UTF-8 with and without BOM, so, just
91
31.6M
  // skip the UTF-8 BOM if it's present.
92
31.6M
  if (BufferStart == BufferPtr) {
93
1.63M
    // Determine the size of the BOM.
94
1.63M
    StringRef Buf(BufferStart, BufferEnd - BufferStart);
95
1.63M
    size_t BOMLength = llvm::StringSwitch<size_t>(Buf)
96
1.63M
      .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM
97
1.63M
      .Default(0);
98
1.63M
99
1.63M
    // Skip the BOM.
100
1.63M
    BufferPtr += BOMLength;
101
1.63M
  }
102
31.6M
103
31.6M
  Is_PragmaLexer = false;
104
31.6M
  CurrentConflictMarkerState = CMK_None;
105
31.6M
106
31.6M
  // Start of the file is a start of line.
107
31.6M
  IsAtStartOfLine = true;
108
31.6M
  IsAtPhysicalStartOfLine = true;
109
31.6M
110
31.6M
  HasLeadingSpace = false;
111
31.6M
  HasLeadingEmptyMacro = false;
112
31.6M
113
31.6M
  // We are not after parsing a #.
114
31.6M
  ParsingPreprocessorDirective = false;
115
31.6M
116
31.6M
  // We are not after parsing #include.
117
31.6M
  ParsingFilename = false;
118
31.6M
119
31.6M
  // We are not in raw mode.  Raw mode disables diagnostics and interpretation
120
31.6M
  // of tokens (e.g. identifiers, thus disabling macro expansion).  It is used
121
31.6M
  // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block
122
31.6M
  // or otherwise skipping over tokens.
123
31.6M
  LexingRawMode = false;
124
31.6M
125
31.6M
  // Default to not keeping comments.
126
31.6M
  ExtendedTokenMode = 0;
127
31.6M
}
128
129
/// Lexer constructor - Create a new lexer object for the specified buffer
130
/// with the specified preprocessor managing the lexing process.  This lexer
131
/// assumes that the associated file buffer and Preprocessor objects will
132
/// outlive it, so it doesn't take ownership of either of them.
133
Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *InputFile, Preprocessor &PP)
134
    : PreprocessorLexer(&PP, FID),
135
      FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
136
1.52M
      LangOpts(PP.getLangOpts()) {
137
1.52M
  InitLexer(InputFile->getBufferStart(), InputFile->getBufferStart(),
138
1.52M
            InputFile->getBufferEnd());
139
1.52M
140
1.52M
  resetExtendedTokenMode();
141
1.52M
}
142
143
/// Lexer constructor - Create a new raw lexer object.  This object is only
144
/// suitable for calls to 'LexFromRawLexer'.  This lexer assumes that the text
145
/// range will outlive it, so it doesn't take ownership of it.
146
Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts,
147
             const char *BufStart, const char *BufPtr, const char *BufEnd)
148
30.1M
    : FileLoc(fileloc), LangOpts(langOpts) {
149
30.1M
  InitLexer(BufStart, BufPtr, BufEnd);
150
30.1M
151
30.1M
  // We *are* in raw mode.
152
30.1M
  LexingRawMode = true;
153
30.1M
}
154
155
/// Lexer constructor - Create a new raw lexer object.  This object is only
156
/// suitable for calls to 'LexFromRawLexer'.  This lexer assumes that the text
157
/// range will outlive it, so it doesn't take ownership of it.
158
Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *FromFile,
159
             const SourceManager &SM, const LangOptions &langOpts)
160
    : Lexer(SM.getLocForStartOfFile(FID), langOpts, FromFile->getBufferStart(),
161
54.7k
            FromFile->getBufferStart(), FromFile->getBufferEnd()) {}
Unexecuted instantiation: clang::Lexer::Lexer(clang::FileID, llvm::MemoryBuffer const*, clang::SourceManager const&, clang::LangOptions const&)
clang::Lexer::Lexer(clang::FileID, llvm::MemoryBuffer const*, clang::SourceManager const&, clang::LangOptions const&)
Line
Count
Source
161
54.7k
            FromFile->getBufferStart(), FromFile->getBufferEnd()) {}
162
163
201M
void Lexer::resetExtendedTokenMode() {
164
201M
  assert(PP && "Cannot reset token mode without a preprocessor");
165
201M
  if (LangOpts.TraditionalCPP)
166
1.05k
    SetKeepWhitespaceMode(true);
167
201M
  else
168
201M
    SetCommentRetentionState(PP->getCommentRetentionState());
169
201M
}
170
171
/// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
172
/// _Pragma expansion.  This has a variety of magic semantics that this method
173
/// sets up.  It returns a new'd Lexer that must be delete'd when done.
174
///
175
/// On entrance to this routine, TokStartLoc is a macro location which has a
176
/// spelling loc that indicates the bytes to be lexed for the token and an
177
/// expansion location that indicates where all lexed tokens should be
178
/// "expanded from".
179
///
180
/// TODO: It would really be nice to make _Pragma just be a wrapper around a
181
/// normal lexer that remaps tokens as they fly by.  This would require making
182
/// Preprocessor::Lex virtual.  Given that, we could just dump in a magic lexer
183
/// interface that could handle this stuff.  This would pull GetMappedTokenLoc
184
/// out of the critical path of the lexer!
185
///
186
Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc,
187
                                 SourceLocation ExpansionLocStart,
188
                                 SourceLocation ExpansionLocEnd,
189
565k
                                 unsigned TokLen, Preprocessor &PP) {
190
565k
  SourceManager &SM = PP.getSourceManager();
191
565k
192
565k
  // Create the lexer as if we were going to lex the file normally.
193
565k
  FileID SpellingFID = SM.getFileID(SpellingLoc);
194
565k
  const llvm::MemoryBuffer *InputFile = SM.getBuffer(SpellingFID);
195
565k
  Lexer *L = new Lexer(SpellingFID, InputFile, PP);
196
565k
197
565k
  // Now that the lexer is created, change the start/end locations so that we
198
565k
  // just lex the subsection of the file that we want.  This is lexing from a
199
565k
  // scratch buffer.
200
565k
  const char *StrData = SM.getCharacterData(SpellingLoc);
201
565k
202
565k
  L->BufferPtr = StrData;
203
565k
  L->BufferEnd = StrData+TokLen;
204
565k
  assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!");
205
565k
206
565k
  // Set the SourceLocation with the remapping information.  This ensures that
207
565k
  // GetMappedTokenLoc will remap the tokens as they are lexed.
208
565k
  L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID),
209
565k
                                     ExpansionLocStart,
210
565k
                                     ExpansionLocEnd, TokLen);
211
565k
212
565k
  // Ensure that the lexer thinks it is inside a directive, so that end \n will
213
565k
  // return an EOD token.
214
565k
  L->ParsingPreprocessorDirective = true;
215
565k
216
565k
  // This lexer really is for _Pragma.
217
565k
  L->Is_PragmaLexer = true;
218
565k
  return L;
219
565k
}
220
221
20
bool Lexer::skipOver(unsigned NumBytes) {
222
20
  IsAtPhysicalStartOfLine = true;
223
20
  IsAtStartOfLine = true;
224
20
  if ((BufferPtr + NumBytes) > BufferEnd)
225
0
    return true;
226
20
  BufferPtr += NumBytes;
227
20
  return false;
228
20
}
229
230
469
template <typename T> static void StringifyImpl(T &Str, char Quote) {
231
469
  typename T::size_type i = 0, e = Str.size();
232
41.0k
  while (i < e) {
233
40.5k
    if (Str[i] == '\\' || 
Str[i] == Quote40.5k
) {
234
218
      Str.insert(Str.begin() + i, '\\');
235
218
      i += 2;
236
218
      ++e;
237
40.3k
    } else if (Str[i] == '\n' || 
Str[i] == '\r'40.3k
) {
238
17
      // Replace '\r\n' and '\n\r' to '\\' followed by 'n'.
239
17
      if ((i < e - 1) && (Str[i + 1] == '\n' || 
Str[i + 1] == '\r'13
) &&
240
17
          
Str[i] != Str[i + 1]4
) {
241
0
        Str[i] = '\\';
242
0
        Str[i + 1] = 'n';
243
17
      } else {
244
17
        // Replace '\n' and '\r' to '\\' followed by 'n'.
245
17
        Str[i] = '\\';
246
17
        Str.insert(Str.begin() + i + 1, 'n');
247
17
        ++e;
248
17
      }
249
17
      i += 2;
250
17
    } else
251
40.3k
      ++i;
252
40.5k
  }
253
469
}
Lexer.cpp:void StringifyImpl<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >&, char)
Line
Count
Source
230
106
template <typename T> static void StringifyImpl(T &Str, char Quote) {
231
106
  typename T::size_type i = 0, e = Str.size();
232
2.68k
  while (i < e) {
233
2.57k
    if (Str[i] == '\\' || 
Str[i] == Quote2.56k
) {
234
207
      Str.insert(Str.begin() + i, '\\');
235
207
      i += 2;
236
207
      ++e;
237
2.37k
    } else if (Str[i] == '\n' || 
Str[i] == '\r'2.36k
) {
238
9
      // Replace '\r\n' and '\n\r' to '\\' followed by 'n'.
239
9
      if ((i < e - 1) && (Str[i + 1] == '\n' || 
Str[i + 1] == '\r'7
) &&
240
9
          
Str[i] != Str[i + 1]2
) {
241
0
        Str[i] = '\\';
242
0
        Str[i + 1] = 'n';
243
9
      } else {
244
9
        // Replace '\n' and '\r' to '\\' followed by 'n'.
245
9
        Str[i] = '\\';
246
9
        Str.insert(Str.begin() + i + 1, 'n');
247
9
        ++e;
248
9
      }
249
9
      i += 2;
250
9
    } else
251
2.36k
      ++i;
252
2.57k
  }
253
106
}
Lexer.cpp:void StringifyImpl<llvm::SmallVectorImpl<char> >(llvm::SmallVectorImpl<char>&, char)
Line
Count
Source
230
363
template <typename T> static void StringifyImpl(T &Str, char Quote) {
231
363
  typename T::size_type i = 0, e = Str.size();
232
38.3k
  while (i < e) {
233
37.9k
    if (Str[i] == '\\' || 
Str[i] == Quote37.9k
) {
234
11
      Str.insert(Str.begin() + i, '\\');
235
11
      i += 2;
236
11
      ++e;
237
37.9k
    } else if (Str[i] == '\n' || 
Str[i] == '\r'37.9k
) {
238
8
      // Replace '\r\n' and '\n\r' to '\\' followed by 'n'.
239
8
      if ((i < e - 1) && (Str[i + 1] == '\n' || 
Str[i + 1] == '\r'6
) &&
240
8
          
Str[i] != Str[i + 1]2
) {
241
0
        Str[i] = '\\';
242
0
        Str[i + 1] = 'n';
243
8
      } else {
244
8
        // Replace '\n' and '\r' to '\\' followed by 'n'.
245
8
        Str[i] = '\\';
246
8
        Str.insert(Str.begin() + i + 1, 'n');
247
8
        ++e;
248
8
      }
249
8
      i += 2;
250
8
    } else
251
37.9k
      ++i;
252
37.9k
  }
253
363
}
254
255
106
std::string Lexer::Stringify(StringRef Str, bool Charify) {
256
106
  std::string Result = std::string(Str);
257
106
  char Quote = Charify ? 
'\''0
: '"';
258
106
  StringifyImpl(Result, Quote);
259
106
  return Result;
260
106
}
261
262
363
void Lexer::Stringify(SmallVectorImpl<char> &Str) { StringifyImpl(Str, '"'); }
263
264
//===----------------------------------------------------------------------===//
265
// Token Spelling
266
//===----------------------------------------------------------------------===//
267
268
/// Slow case of getSpelling. Extract the characters comprising the
269
/// spelling of this token from the provided input buffer.
270
static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
271
9.22k
                              const LangOptions &LangOpts, char *Spelling) {
272
9.22k
  assert(Tok.needsCleaning() && "getSpellingSlow called on simple token");
273
9.22k
274
9.22k
  size_t Length = 0;
275
9.22k
  const char *BufEnd = BufPtr + Tok.getLength();
276
9.22k
277
9.22k
  if (tok::isStringLiteral(Tok.getKind())) {
278
303
    // Munch the encoding-prefix and opening double-quote.
279
326
    while (BufPtr < BufEnd) {
280
326
      unsigned Size;
281
326
      Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
282
326
      BufPtr += Size;
283
326
284
326
      if (Spelling[Length - 1] == '"')
285
303
        break;
286
326
    }
287
303
288
303
    // Raw string literals need special handling; trigraph expansion and line
289
303
    // splicing do not occur within their d-char-sequence nor within their
290
303
    // r-char-sequence.
291
303
    if (Length >= 2 &&
292
303
        
Spelling[Length - 2] == 'R'11
&&
Spelling[Length - 1] == '"'11
) {
293
11
      // Search backwards from the end of the token to find the matching closing
294
11
      // quote.
295
11
      const char *RawEnd = BufEnd;
296
20
      do --RawEnd; while (*RawEnd != '"');
297
11
      size_t RawLength = RawEnd - BufPtr + 1;
298
11
299
11
      // Everything between the quotes is included verbatim in the spelling.
300
11
      memcpy(Spelling + Length, BufPtr, RawLength);
301
11
      Length += RawLength;
302
11
      BufPtr += RawLength;
303
11
304
11
      // The rest of the token is lexed normally.
305
11
    }
306
303
  }
307
9.22k
308
98.9k
  while (BufPtr < BufEnd) {
309
89.7k
    unsigned Size;
310
89.7k
    Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
311
89.7k
    BufPtr += Size;
312
89.7k
  }
313
9.22k
314
9.22k
  assert(Length < Tok.getLength() &&
315
9.22k
         "NeedsCleaning flag set on token that didn't need cleaning!");
316
9.22k
  return Length;
317
9.22k
}
318
319
/// getSpelling() - Return the 'spelling' of this token.  The spelling of a
320
/// token are the characters used to represent the token in the source file
321
/// after trigraph expansion and escaped-newline folding.  In particular, this
322
/// wants to get the true, uncanonicalized, spelling of things like digraphs
323
/// UCNs, etc.
324
StringRef Lexer::getSpelling(SourceLocation loc,
325
                             SmallVectorImpl<char> &buffer,
326
                             const SourceManager &SM,
327
                             const LangOptions &options,
328
300
                             bool *invalid) {
329
300
  // Break down the source location.
330
300
  std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc);
331
300
332
300
  // Try to the load the file buffer.
333
300
  bool invalidTemp = false;
334
300
  StringRef file = SM.getBufferData(locInfo.first, &invalidTemp);
335
300
  if (invalidTemp) {
336
0
    if (invalid) *invalid = true;
337
0
    return {};
338
0
  }
339
300
340
300
  const char *tokenBegin = file.data() + locInfo.second;
341
300
342
300
  // Lex from the start of the given location.
343
300
  Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options,
344
300
              file.begin(), tokenBegin, file.end());
345
300
  Token token;
346
300
  lexer.LexFromRawLexer(token);
347
300
348
300
  unsigned length = token.getLength();
349
300
350
300
  // Common case:  no need for cleaning.
351
300
  if (!token.needsCleaning())
352
299
    return StringRef(tokenBegin, length);
353
1
354
1
  // Hard case, we need to relex the characters into the string.
355
1
  buffer.resize(length);
356
1
  buffer.resize(getSpellingSlow(token, tokenBegin, options, buffer.data()));
357
1
  return StringRef(buffer.data(), buffer.size());
358
1
}
359
360
/// getSpelling() - Return the 'spelling' of this token.  The spelling of a
361
/// token are the characters used to represent the token in the source file
362
/// after trigraph expansion and escaped-newline folding.  In particular, this
363
/// wants to get the true, uncanonicalized, spelling of things like digraphs
364
/// UCNs, etc.
365
std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr,
366
2.07M
                               const LangOptions &LangOpts, bool *Invalid) {
367
2.07M
  assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
368
2.07M
369
2.07M
  bool CharDataInvalid = false;
370
2.07M
  const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(),
371
2.07M
                                                    &CharDataInvalid);
372
2.07M
  if (Invalid)
373
102
    *Invalid = CharDataInvalid;
374
2.07M
  if (CharDataInvalid)
375
0
    return {};
376
2.07M
377
2.07M
  // If this token contains nothing interesting, return it directly.
378
2.07M
  if (!Tok.needsCleaning())
379
2.07M
    return std::string(TokStart, TokStart + Tok.getLength());
380
5
381
5
  std::string Result;
382
5
  Result.resize(Tok.getLength());
383
5
  Result.resize(getSpellingSlow(Tok, TokStart, LangOpts, &*Result.begin()));
384
5
  return Result;
385
5
}
386
387
/// getSpelling - This method is used to get the spelling of a token into a
388
/// preallocated buffer, instead of as an std::string.  The caller is required
389
/// to allocate enough space for the token, which is guaranteed to be at least
390
/// Tok.getLength() bytes long.  The actual length of the token is returned.
391
///
392
/// Note that this method may do two possible things: it may either fill in
393
/// the buffer specified with characters, or it may *change the input pointer*
394
/// to point to a constant buffer with the data already in it (avoiding a
395
/// copy).  The caller is not allowed to modify the returned buffer pointer
396
/// if an internal buffer is returned.
397
unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
398
                            const SourceManager &SourceMgr,
399
52.8M
                            const LangOptions &LangOpts, bool *Invalid) {
400
52.8M
  assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
401
52.8M
402
52.8M
  const char *TokStart = nullptr;
403
52.8M
  // NOTE: this has to be checked *before* testing for an IdentifierInfo.
404
52.8M
  if (Tok.is(tok::raw_identifier))
405
2.80M
    TokStart = Tok.getRawIdentifier().data();
406
50.0M
  else if (!Tok.hasUCN()) {
407
50.0M
    if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
408
25.2M
      // Just return the string from the identifier table, which is very quick.
409
25.2M
      Buffer = II->getNameStart();
410
25.2M
      return II->getLength();
411
25.2M
    }
412
27.5M
  }
413
27.5M
414
27.5M
  // NOTE: this can be checked even after testing for an IdentifierInfo.
415
27.5M
  if (Tok.isLiteral())
416
24.4M
    TokStart = Tok.getLiteralData();
417
27.5M
418
27.5M
  if (!TokStart) {
419
591k
    // Compute the start of the token in the input lexer buffer.
420
591k
    bool CharDataInvalid = false;
421
591k
    TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid);
422
591k
    if (Invalid)
423
246k
      *Invalid = CharDataInvalid;
424
591k
    if (CharDataInvalid) {
425
0
      Buffer = "";
426
0
      return 0;
427
0
    }
428
27.5M
  }
429
27.5M
430
27.5M
  // If this token contains nothing interesting, return it directly.
431
27.5M
  if (!Tok.needsCleaning()) {
432
27.5M
    Buffer = TokStart;
433
27.5M
    return Tok.getLength();
434
27.5M
  }
435
9.22k
436
9.22k
  // Otherwise, hard case, relex the characters into the string.
437
9.22k
  return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer));
438
9.22k
}
439
440
/// MeasureTokenLength - Relex the token at the specified location and return
441
/// its length in bytes in the input file.  If the token needs cleaning (e.g.
442
/// includes a trigraph or an escaped newline) then this count includes bytes
443
/// that are part of that.
444
unsigned Lexer::MeasureTokenLength(SourceLocation Loc,
445
                                   const SourceManager &SM,
446
29.2M
                                   const LangOptions &LangOpts) {
447
29.2M
  Token TheTok;
448
29.2M
  if (getRawToken(Loc, TheTok, SM, LangOpts))
449
1.44k
    return 0;
450
29.2M
  return TheTok.getLength();
451
29.2M
}
452
453
/// Relex the token at the specified location.
454
/// \returns true if there was a failure, false on success.
455
bool Lexer::getRawToken(SourceLocation Loc, Token &Result,
456
                        const SourceManager &SM,
457
                        const LangOptions &LangOpts,
458
29.2M
                        bool IgnoreWhiteSpace) {
459
29.2M
  // TODO: this could be special cased for common tokens like identifiers, ')',
460
29.2M
  // etc to make this faster, if it mattered.  Just look at StrData[0] to handle
461
29.2M
  // all obviously single-char tokens.  This could use
462
29.2M
  // Lexer::isObviouslySimpleCharacter for example to handle identifiers or
463
29.2M
  // something.
464
29.2M
465
29.2M
  // If this comes from a macro expansion, we really do want the macro name, not
466
29.2M
  // the token this macro expanded to.
467
29.2M
  Loc = SM.getExpansionLoc(Loc);
468
29.2M
  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
469
29.2M
  bool Invalid = false;
470
29.2M
  StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
471
29.2M
  if (Invalid)
472
4
    return true;
473
29.2M
474
29.2M
  const char *StrData = Buffer.data()+LocInfo.second;
475
29.2M
476
29.2M
  if (!IgnoreWhiteSpace && 
isWhitespace(StrData[0])29.2M
)
477
1.43k
    return true;
478
29.2M
479
29.2M
  // Create a lexer starting at the beginning of this token.
480
29.2M
  Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts,
481
29.2M
                 Buffer.begin(), StrData, Buffer.end());
482
29.2M
  TheLexer.SetCommentRetentionState(true);
483
29.2M
  TheLexer.LexFromRawLexer(Result);
484
29.2M
  return false;
485
29.2M
}
486
487
/// Returns the pointer that points to the beginning of line that contains
488
/// the given offset, or null if the offset if invalid.
489
10.6k
static const char *findBeginningOfLine(StringRef Buffer, unsigned Offset) {
490
10.6k
  const char *BufStart = Buffer.data();
491
10.6k
  if (Offset >= Buffer.size())
492
6
    return nullptr;
493
10.6k
494
10.6k
  const char *LexStart = BufStart + Offset;
495
283k
  for (; LexStart != BufStart; 
--LexStart272k
) {
496
283k
    if (isVerticalWhitespace(LexStart[0]) &&
497
283k
        
!Lexer::isNewLineEscaped(BufStart, LexStart)10.3k
) {
498
10.3k
      // LexStart should point at first character of logical line.
499
10.3k
      ++LexStart;
500
10.3k
      break;
501
10.3k
    }
502
283k
  }
503
10.6k
  return LexStart;
504
10.6k
}
505
506
static SourceLocation getBeginningOfFileToken(SourceLocation Loc,
507
                                              const SourceManager &SM,
508
10.4k
                                              const LangOptions &LangOpts) {
509
10.4k
  assert(Loc.isFileID());
510
10.4k
  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
511
10.4k
  if (LocInfo.first.isInvalid())
512
0
    return Loc;
513
10.4k
514
10.4k
  bool Invalid = false;
515
10.4k
  StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
516
10.4k
  if (Invalid)
517
0
    return Loc;
518
10.4k
519
10.4k
  // Back up from the current location until we hit the beginning of a line
520
10.4k
  // (or the buffer). We'll relex from that point.
521
10.4k
  const char *StrData = Buffer.data() + LocInfo.second;
522
10.4k
  const char *LexStart = findBeginningOfLine(Buffer, LocInfo.second);
523
10.4k
  if (!LexStart || 
LexStart == StrData10.4k
)
524
254
    return Loc;
525
10.2k
526
10.2k
  // Create a lexer starting at the beginning of this token.
527
10.2k
  SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second);
528
10.2k
  Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart,
529
10.2k
                 Buffer.end());
530
10.2k
  TheLexer.SetCommentRetentionState(true);
531
10.2k
532
10.2k
  // Lex tokens until we find the token that contains the source location.
533
10.2k
  Token TheTok;
534
16.4k
  do {
535
16.4k
    TheLexer.LexFromRawLexer(TheTok);
536
16.4k
537
16.4k
    if (TheLexer.getBufferLocation() > StrData) {
538
10.2k
      // Lexing this token has taken the lexer past the source location we're
539
10.2k
      // looking for. If the current token encompasses our source location,
540
10.2k
      // return the beginning of that token.
541
10.2k
      if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData)
542
9.74k
        return TheTok.getLocation();
543
492
544
492
      // We ended up skipping over the source location entirely, which means
545
492
      // that it points into whitespace. We're done here.
546
492
      break;
547
492
    }
548
16.4k
  } while (
TheTok.getKind() != tok::eof6.25k
);
549
10.2k
550
10.2k
  // We've passed our source location; just return the original source location.
551
10.2k
  
return Loc492
;
552
10.2k
}
553
554
SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc,
555
                                          const SourceManager &SM,
556
10.4k
                                          const LangOptions &LangOpts) {
557
10.4k
  if (Loc.isFileID())
558
10.4k
    return getBeginningOfFileToken(Loc, SM, LangOpts);
559
20
560
20
  if (!SM.isMacroArgExpansion(Loc))
561
0
    return Loc;
562
20
563
20
  SourceLocation FileLoc = SM.getSpellingLoc(Loc);
564
20
  SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts);
565
20
  std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc);
566
20
  std::pair<FileID, unsigned> BeginFileLocInfo =
567
20
      SM.getDecomposedLoc(BeginFileLoc);
568
20
  assert(FileLocInfo.first == BeginFileLocInfo.first &&
569
20
         FileLocInfo.second >= BeginFileLocInfo.second);
570
20
  return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);
571
20
}
572
573
namespace {
574
575
enum PreambleDirectiveKind {
576
  PDK_Skipped,
577
  PDK_Unknown
578
};
579
580
} // namespace
581
582
PreambleBounds Lexer::ComputePreamble(StringRef Buffer,
583
                                      const LangOptions &LangOpts,
584
501
                                      unsigned MaxLines) {
585
501
  // Create a lexer starting at the beginning of the file. Note that we use a
586
501
  // "fake" file source location at offset 1 so that the lexer will track our
587
501
  // position within the file.
588
501
  const unsigned StartOffset = 1;
589
501
  SourceLocation FileLoc = SourceLocation::getFromRawEncoding(StartOffset);
590
501
  Lexer TheLexer(FileLoc, LangOpts, Buffer.begin(), Buffer.begin(),
591
501
                 Buffer.end());
592
501
  TheLexer.SetCommentRetentionState(true);
593
501
594
501
  bool InPreprocessorDirective = false;
595
501
  Token TheTok;
596
501
  SourceLocation ActiveCommentLoc;
597
501
598
501
  unsigned MaxLineOffset = 0;
599
501
  if (MaxLines) {
600
84
    const char *CurPtr = Buffer.begin();
601
84
    unsigned CurLine = 0;
602
12.3k
    while (CurPtr != Buffer.end()) {
603
12.3k
      char ch = *CurPtr++;
604
12.3k
      if (ch == '\n') {
605
642
        ++CurLine;
606
642
        if (CurLine == MaxLines)
607
83
          break;
608
642
      }
609
12.3k
    }
610
84
    if (CurPtr != Buffer.end())
611
78
      MaxLineOffset = CurPtr - Buffer.begin();
612
84
  }
613
501
614
3.89k
  do {
615
3.89k
    TheLexer.LexFromRawLexer(TheTok);
616
3.89k
617
3.89k
    if (InPreprocessorDirective) {
618
2.82k
      // If we've hit the end of the file, we're done.
619
2.82k
      if (TheTok.getKind() == tok::eof) {
620
14
        break;
621
14
      }
622
2.81k
623
2.81k
      // If we haven't hit the end of the preprocessor directive, skip this
624
2.81k
      // token.
625
2.81k
      if (!TheTok.isAtStartOfLine())
626
1.89k
        continue;
627
915
628
915
      // We've passed the end of the preprocessor directive, and will look
629
915
      // at this token again below.
630
915
      InPreprocessorDirective = false;
631
915
    }
632
3.89k
633
3.89k
    // Keep track of the # of lines in the preamble.
634
3.89k
    
if (1.98k
TheTok.isAtStartOfLine()1.98k
) {
635
1.96k
      unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset;
636
1.96k
637
1.96k
      // If we were asked to limit the number of lines in the preamble,
638
1.96k
      // and we're about to exceed that limit, we're done.
639
1.96k
      if (MaxLineOffset && 
TokOffset >= MaxLineOffset360
)
640
18
        break;
641
1.96k
    }
642
1.96k
643
1.96k
    // Comments are okay; skip over them.
644
1.96k
    if (TheTok.getKind() == tok::comment) {
645
569
      if (ActiveCommentLoc.isInvalid())
646
223
        ActiveCommentLoc = TheTok.getLocation();
647
569
      continue;
648
569
    }
649
1.39k
650
1.39k
    if (TheTok.isAtStartOfLine() && 
TheTok.getKind() == tok::hash1.37k
) {
651
929
      // This is the start of a preprocessor directive.
652
929
      Token HashTok = TheTok;
653
929
      InPreprocessorDirective = true;
654
929
      ActiveCommentLoc = SourceLocation();
655
929
656
929
      // Figure out which directive this is. Since we're lexing raw tokens,
657
929
      // we don't have an identifier table available. Instead, just look at
658
929
      // the raw identifier to recognize and categorize preprocessor directives.
659
929
      TheLexer.LexFromRawLexer(TheTok);
660
929
      if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) {
661
929
        StringRef Keyword = TheTok.getRawIdentifier();
662
929
        PreambleDirectiveKind PDK
663
929
          = llvm::StringSwitch<PreambleDirectiveKind>(Keyword)
664
929
              .Case("include", PDK_Skipped)
665
929
              .Case("__include_macros", PDK_Skipped)
666
929
              .Case("define", PDK_Skipped)
667
929
              .Case("undef", PDK_Skipped)
668
929
              .Case("line", PDK_Skipped)
669
929
              .Case("error", PDK_Skipped)
670
929
              .Case("pragma", PDK_Skipped)
671
929
              .Case("import", PDK_Skipped)
672
929
              .Case("include_next", PDK_Skipped)
673
929
              .Case("warning", PDK_Skipped)
674
929
              .Case("ident", PDK_Skipped)
675
929
              .Case("sccs", PDK_Skipped)
676
929
              .Case("assert", PDK_Skipped)
677
929
              .Case("unassert", PDK_Skipped)
678
929
              .Case("if", PDK_Skipped)
679
929
              .Case("ifdef", PDK_Skipped)
680
929
              .Case("ifndef", PDK_Skipped)
681
929
              .Case("elif", PDK_Skipped)
682
929
              .Case("else", PDK_Skipped)
683
929
              .Case("endif", PDK_Skipped)
684
929
              .Default(PDK_Unknown);
685
929
686
929
        switch (PDK) {
687
929
        case PDK_Skipped:
688
929
          continue;
689
0
690
0
        case PDK_Unknown:
691
0
          // We don't know what this directive is; stop at the '#'.
692
0
          break;
693
0
        }
694
0
      }
695
0
696
0
      // We only end up here if we didn't recognize the preprocessor
697
0
      // directive or it was one that can't occur in the preamble at this
698
0
      // point. Roll back the current token to the location of the '#'.
699
0
      TheTok = HashTok;
700
0
    }
701
1.39k
702
1.39k
    // We hit a token that we don't recognize as being in the
703
1.39k
    // "preprocessing only" part of the file, so we're no longer in
704
1.39k
    // the preamble.
705
1.39k
    
break469
;
706
3.39k
  } while (true);
707
501
708
501
  SourceLocation End;
709
501
  if (ActiveCommentLoc.isValid())
710
80
    End = ActiveCommentLoc; // don't truncate a decl comment.
711
421
  else
712
421
    End = TheTok.getLocation();
713
501
714
501
  return PreambleBounds(End.getRawEncoding() - FileLoc.getRawEncoding(),
715
501
                        TheTok.isAtStartOfLine());
716
501
}
717
718
unsigned Lexer::getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo,
719
                                     const SourceManager &SM,
720
63.4k
                                     const LangOptions &LangOpts) {
721
63.4k
  // Figure out how many physical characters away the specified expansion
722
63.4k
  // character is.  This needs to take into consideration newlines and
723
63.4k
  // trigraphs.
724
63.4k
  bool Invalid = false;
725
63.4k
  const char *TokPtr = SM.getCharacterData(TokStart, &Invalid);
726
63.4k
727
63.4k
  // If they request the first char of the token, we're trivially done.
728
63.4k
  if (Invalid || (CharNo == 0 && 
Lexer::isObviouslySimpleCharacter(*TokPtr)2.86k
))
729
2.85k
    return 0;
730
60.6k
731
60.6k
  unsigned PhysOffset = 0;
732
60.6k
733
60.6k
  // The usual case is that tokens don't contain anything interesting.  Skip
734
60.6k
  // over the uninteresting characters.  If a token only consists of simple
735
60.6k
  // chars, this method is extremely fast.
736
497k
  while (Lexer::isObviouslySimpleCharacter(*TokPtr)) {
737
496k
    if (CharNo == 0)
738
59.3k
      return PhysOffset;
739
437k
    ++TokPtr;
740
437k
    --CharNo;
741
437k
    ++PhysOffset;
742
437k
  }
743
60.6k
744
60.6k
  // If we have a character that may be a trigraph or escaped newline, use a
745
60.6k
  // lexer to parse it correctly.
746
60.6k
  
for (; 1.27k
CharNo9.18k
;
--CharNo7.91k
) {
747
7.91k
    unsigned Size;
748
7.91k
    Lexer::getCharAndSizeNoWarn(TokPtr, Size, LangOpts);
749
7.91k
    TokPtr += Size;
750
7.91k
    PhysOffset += Size;
751
7.91k
  }
752
1.27k
753
1.27k
  // Final detail: if we end up on an escaped newline, we want to return the
754
1.27k
  // location of the actual byte of the token.  For example foo\<newline>bar
755
1.27k
  // advanced by 3 should return the location of b, not of \\.  One compounding
756
1.27k
  // detail of this is that the escape may be made by a trigraph.
757
1.27k
  if (!Lexer::isObviouslySimpleCharacter(*TokPtr))
758
857
    PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr;
759
1.27k
760
1.27k
  return PhysOffset;
761
60.6k
}
762
763
/// Computes the source location just past the end of the
764
/// token at this source location.
765
///
766
/// This routine can be used to produce a source location that
767
/// points just past the end of the token referenced by \p Loc, and
768
/// is generally used when a diagnostic needs to point just after a
769
/// token where it expected something different that it received. If
770
/// the returned source location would not be meaningful (e.g., if
771
/// it points into a macro), this routine returns an invalid
772
/// source location.
773
///
774
/// \param Offset an offset from the end of the token, where the source
775
/// location should refer to. The default offset (0) produces a source
776
/// location pointing just past the end of the token; an offset of 1 produces
777
/// a source location pointing to the last character in the token, etc.
778
SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
779
                                          const SourceManager &SM,
780
4.17M
                                          const LangOptions &LangOpts) {
781
4.17M
  if (Loc.isInvalid())
782
86
    return {};
783
4.17M
784
4.17M
  if (Loc.isMacroID()) {
785
1.38k
    if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
786
1.21k
      return {}; // Points inside the macro expansion.
787
4.17M
  }
788
4.17M
789
4.17M
  unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
790
4.17M
  if (Len > Offset)
791
4.17M
    Len = Len - Offset;
792
350
  else
793
350
    return Loc;
794
4.17M
795
4.17M
  return Loc.getLocWithOffset(Len);
796
4.17M
}
797
798
/// Returns true if the given MacroID location points at the first
799
/// token of the macro expansion.
800
bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc,
801
                                      const SourceManager &SM,
802
                                      const LangOptions &LangOpts,
803
28.8M
                                      SourceLocation *MacroBegin) {
804
28.8M
  assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");
805
28.8M
806
28.8M
  SourceLocation expansionLoc;
807
28.8M
  if (!SM.isAtStartOfImmediateMacroExpansion(loc, &expansionLoc))
808
4.31M
    return false;
809
24.5M
810
24.5M
  if (expansionLoc.isFileID()) {
811
6.43M
    // No other macro expansions, this is the first.
812
6.43M
    if (MacroBegin)
813
160
      *MacroBegin = expansionLoc;
814
6.43M
    return true;
815
6.43M
  }
816
18.0M
817
18.0M
  return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts, MacroBegin);
818
18.0M
}
819
820
/// Returns true if the given MacroID location points at the last
821
/// token of the macro expansion.
822
bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc,
823
                                    const SourceManager &SM,
824
                                    const LangOptions &LangOpts,
825
24.6M
                                    SourceLocation *MacroEnd) {
826
24.6M
  assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");
827
24.6M
828
24.6M
  SourceLocation spellLoc = SM.getSpellingLoc(loc);
829
24.6M
  unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts);
830
24.6M
  if (tokLen == 0)
831
0
    return false;
832
24.6M
833
24.6M
  SourceLocation afterLoc = loc.getLocWithOffset(tokLen);
834
24.6M
  SourceLocation expansionLoc;
835
24.6M
  if (!SM.isAtEndOfImmediateMacroExpansion(afterLoc, &expansionLoc))
836
4.03M
    return false;
837
20.6M
838
20.6M
  if (expansionLoc.isFileID()) {
839
6.72M
    // No other macro expansions.
840
6.72M
    if (MacroEnd)
841
264
      *MacroEnd = expansionLoc;
842
6.72M
    return true;
843
6.72M
  }
844
13.9M
845
13.9M
  return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts, MacroEnd);
846
13.9M
}
847
848
static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range,
849
                                             const SourceManager &SM,
850
4.08M
                                             const LangOptions &LangOpts) {
851
4.08M
  SourceLocation Begin = Range.getBegin();
852
4.08M
  SourceLocation End = Range.getEnd();
853
4.08M
  assert(Begin.isFileID() && End.isFileID());
854
4.08M
  if (Range.isTokenRange()) {
855
4.08M
    End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts);
856
4.08M
    if (End.isInvalid())
857
0
      return {};
858
4.08M
  }
859
4.08M
860
4.08M
  // Break down the source locations.
861
4.08M
  FileID FID;
862
4.08M
  unsigned BeginOffs;
863
4.08M
  std::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin);
864
4.08M
  if (FID.isInvalid())
865
0
    return {};
866
4.08M
867
4.08M
  unsigned EndOffs;
868
4.08M
  if (!SM.isInFileID(End, FID, &EndOffs) ||
869
4.08M
      BeginOffs > EndOffs)
870
0
    return {};
871
4.08M
872
4.08M
  return CharSourceRange::getCharRange(Begin, End);
873
4.08M
}
874
875
CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range,
876
                                         const SourceManager &SM,
877
4.08M
                                         const LangOptions &LangOpts) {
878
4.08M
  SourceLocation Begin = Range.getBegin();
879
4.08M
  SourceLocation End = Range.getEnd();
880
4.08M
  if (Begin.isInvalid() || 
End.isInvalid()4.08M
)
881
5
    return {};
882
4.08M
883
4.08M
  if (Begin.isFileID() && 
End.isFileID()4.08M
)
884
4.08M
    return makeRangeFromFileLocs(Range, SM, LangOpts);
885
254
886
254
  if (Begin.isMacroID() && 
End.isFileID()247
) {
887
36
    if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin))
888
0
      return {};
889
36
    Range.setBegin(Begin);
890
36
    return makeRangeFromFileLocs(Range, SM, LangOpts);
891
36
  }
892
218
893
218
  if (Begin.isFileID() && 
End.isMacroID()7
) {
894
7
    if ((Range.isTokenRange() && !isAtEndOfMacroExpansion(End, SM, LangOpts,
895
7
                                                          &End)) ||
896
7
        (Range.isCharRange() && !isAtStartOfMacroExpansion(End, SM, LangOpts,
897
0
                                                           &End)))
898
0
      return {};
899
7
    Range.setEnd(End);
900
7
    return makeRangeFromFileLocs(Range, SM, LangOpts);
901
7
  }
902
211
903
211
  assert(Begin.isMacroID() && End.isMacroID());
904
211
  SourceLocation MacroBegin, MacroEnd;
905
211
  if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) &&
906
211
      
(98
(98
Range.isTokenRange()98
&& isAtEndOfMacroExpansion(End, SM, LangOpts,
907
96
                                                        &MacroEnd)) ||
908
98
       
(20
Range.isCharRange()20
&& isAtStartOfMacroExpansion(End, SM, LangOpts,
909
78
                                                         &MacroEnd)))) {
910
78
    Range.setBegin(MacroBegin);
911
78
    Range.setEnd(MacroEnd);
912
78
    return makeRangeFromFileLocs(Range, SM, LangOpts);
913
78
  }
914
133
915
133
  bool Invalid = false;
916
133
  const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(SM.getFileID(Begin),
917
133
                                                        &Invalid);
918
133
  if (Invalid)
919
0
    return {};
920
133
921
133
  if (BeginEntry.getExpansion().isMacroArgExpansion()) {
922
112
    const SrcMgr::SLocEntry &EndEntry = SM.getSLocEntry(SM.getFileID(End),
923
112
                                                        &Invalid);
924
112
    if (Invalid)
925
0
      return {};
926
112
927
112
    if (EndEntry.getExpansion().isMacroArgExpansion() &&
928
112
        BeginEntry.getExpansion().getExpansionLocStart() ==
929
111
            EndEntry.getExpansion().getExpansionLocStart()) {
930
110
      Range.setBegin(SM.getImmediateSpellingLoc(Begin));
931
110
      Range.setEnd(SM.getImmediateSpellingLoc(End));
932
110
      return makeFileCharRange(Range, SM, LangOpts);
933
110
    }
934
23
  }
935
23
936
23
  return {};
937
23
}
938
939
StringRef Lexer::getSourceText(CharSourceRange Range,
940
                               const SourceManager &SM,
941
                               const LangOptions &LangOpts,
942
4.07M
                               bool *Invalid) {
943
4.07M
  Range = makeFileCharRange(Range, SM, LangOpts);
944
4.07M
  if (Range.isInvalid()) {
945
9
    if (Invalid) 
*Invalid = true2
;
946
9
    return {};
947
9
  }
948
4.07M
949
4.07M
  // Break down the source location.
950
4.07M
  std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin());
951
4.07M
  if (beginInfo.first.isInvalid()) {
952
0
    if (Invalid) *Invalid = true;
953
0
    return {};
954
0
  }
955
4.07M
956
4.07M
  unsigned EndOffs;
957
4.07M
  if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) ||
958
4.07M
      beginInfo.second > EndOffs) {
959
0
    if (Invalid) *Invalid = true;
960
0
    return {};
961
0
  }
962
4.07M
963
4.07M
  // Try to the load the file buffer.
964
4.07M
  bool invalidTemp = false;
965
4.07M
  StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp);
966
4.07M
  if (invalidTemp) {
967
0
    if (Invalid) *Invalid = true;
968
0
    return {};
969
0
  }
970
4.07M
971
4.07M
  if (Invalid) 
*Invalid = false249
;
972
4.07M
  return file.substr(beginInfo.second, EndOffs - beginInfo.second);
973
4.07M
}
974
975
StringRef Lexer::getImmediateMacroName(SourceLocation Loc,
976
                                       const SourceManager &SM,
977
6.25k
                                       const LangOptions &LangOpts) {
978
6.25k
  assert(Loc.isMacroID() && "Only reasonable to call this on macros");
979
6.25k
980
6.25k
  // Find the location of the immediate macro expansion.
981
20.5k
  while (true) {
982
20.5k
    FileID FID = SM.getFileID(Loc);
983
20.5k
    const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID);
984
20.5k
    const SrcMgr::ExpansionInfo &Expansion = E->getExpansion();
985
20.5k
    Loc = Expansion.getExpansionLocStart();
986
20.5k
    if (!Expansion.isMacroArgExpansion())
987
4.08k
      break;
988
16.4k
989
16.4k
    // For macro arguments we need to check that the argument did not come
990
16.4k
    // from an inner macro, e.g: "MAC1( MAC2(foo) )"
991
16.4k
992
16.4k
    // Loc points to the argument id of the macro definition, move to the
993
16.4k
    // macro expansion.
994
16.4k
    Loc = SM.getImmediateExpansionRange(Loc).getBegin();
995
16.4k
    SourceLocation SpellLoc = Expansion.getSpellingLoc();
996
16.4k
    if (SpellLoc.isFileID())
997
2.16k
      break; // No inner macro.
998
14.3k
999
14.3k
    // If spelling location resides in the same FileID as macro expansion
1000
14.3k
    // location, it means there is no inner macro.
1001
14.3k
    FileID MacroFID = SM.getFileID(Loc);
1002
14.3k
    if (SM.isInFileID(SpellLoc, MacroFID))
1003
1
      break;
1004
14.3k
1005
14.3k
    // Argument came from inner macro.
1006
14.3k
    Loc = SpellLoc;
1007
14.3k
  }
1008
6.25k
1009
6.25k
  // Find the spelling location of the start of the non-argument expansion
1010
6.25k
  // range. This is where the macro name was spelled in order to begin
1011
6.25k
  // expanding this macro.
1012
6.25k
  Loc = SM.getSpellingLoc(Loc);
1013
6.25k
1014
6.25k
  // Dig out the buffer where the macro name was spelled and the extents of the
1015
6.25k
  // name so that we can render it into the expansion note.
1016
6.25k
  std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
1017
6.25k
  unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
1018
6.25k
  StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
1019
6.25k
  return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
1020
6.25k
}
1021
1022
StringRef Lexer::getImmediateMacroNameForDiagnostics(
1023
570
    SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) {
1024
570
  assert(Loc.isMacroID() && "Only reasonable to call this on macros");
1025
570
  // Walk past macro argument expansions.
1026
570
  while (SM.isMacroArgExpansion(Loc))
1027
0
    Loc = SM.getImmediateExpansionRange(Loc).getBegin();
1028
570
1029
570
  // If the macro's spelling has no FileID, then it's actually a token paste
1030
570
  // or stringization (or similar) and not a macro at all.
1031
570
  if (!SM.getFileEntryForID(SM.getFileID(SM.getSpellingLoc(Loc))))
1032
42
    return {};
1033
528
1034
528
  // Find the spelling location of the start of the non-argument expansion
1035
528
  // range. This is where the macro name was spelled in order to begin
1036
528
  // expanding this macro.
1037
528
  Loc = SM.getSpellingLoc(SM.getImmediateExpansionRange(Loc).getBegin());
1038
528
1039
528
  // Dig out the buffer where the macro name was spelled and the extents of the
1040
528
  // name so that we can render it into the expansion note.
1041
528
  std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
1042
528
  unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
1043
528
  StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
1044
528
  return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
1045
528
}
1046
1047
1.32k
bool Lexer::isIdentifierBodyChar(char c, const LangOptions &LangOpts) {
1048
1.32k
  return isIdentifierBody(c, LangOpts.DollarIdents);
1049
1.32k
}
1050
1051
10.3k
bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) {
1052
10.3k
  assert(isVerticalWhitespace(Str[0]));
1053
10.3k
  if (Str - 1 < BufferStart)
1054
2
    return false;
1055
10.3k
1056
10.3k
  if ((Str[0] == '\n' && 
Str[-1] == '\r'10.3k
) ||
1057
10.3k
      
(10.3k
Str[0] == '\r'10.3k
&&
Str[-1] == '\n'18
)) {
1058
18
    if (Str - 2 < BufferStart)
1059
2
      return false;
1060
16
    --Str;
1061
16
  }
1062
10.3k
  --Str;
1063
10.3k
1064
10.3k
  // Rewind to first non-space character:
1065
10.4k
  while (Str > BufferStart && 
isHorizontalWhitespace(*Str)10.4k
)
1066
106
    --Str;
1067
10.3k
1068
10.3k
  return *Str == '\\';
1069
10.3k
}
1070
1071
StringRef Lexer::getIndentationForLine(SourceLocation Loc,
1072
163
                                       const SourceManager &SM) {
1073
163
  if (Loc.isInvalid() || Loc.isMacroID())
1074
0
    return {};
1075
163
  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
1076
163
  if (LocInfo.first.isInvalid())
1077
0
    return {};
1078
163
  bool Invalid = false;
1079
163
  StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
1080
163
  if (Invalid)
1081
0
    return {};
1082
163
  const char *Line = findBeginningOfLine(Buffer, LocInfo.second);
1083
163
  if (!Line)
1084
0
    return {};
1085
163
  StringRef Rest = Buffer.substr(Line - Buffer.data());
1086
163
  size_t NumWhitespaceChars = Rest.find_first_not_of(" \t");
1087
163
  return NumWhitespaceChars == StringRef::npos
1088
163
             ? 
""0
1089
163
             : Rest.take_front(NumWhitespaceChars);
1090
163
}
1091
1092
//===----------------------------------------------------------------------===//
1093
// Diagnostics forwarding code.
1094
//===----------------------------------------------------------------------===//
1095
1096
/// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the
1097
/// lexer buffer was all expanded at a single point, perform the mapping.
1098
/// This is currently only used for _Pragma implementation, so it is the slow
1099
/// path of the hot getSourceLocation method.  Do not allow it to be inlined.
1100
static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc(
1101
    Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen);
1102
static SourceLocation GetMappedTokenLoc(Preprocessor &PP,
1103
                                        SourceLocation FileLoc,
1104
2.31M
                                        unsigned CharNo, unsigned TokLen) {
1105
2.31M
  assert(FileLoc.isMacroID() && "Must be a macro expansion");
1106
2.31M
1107
2.31M
  // Otherwise, we're lexing "mapped tokens".  This is used for things like
1108
2.31M
  // _Pragma handling.  Combine the expansion location of FileLoc with the
1109
2.31M
  // spelling location.
1110
2.31M
  SourceManager &SM = PP.getSourceManager();
1111
2.31M
1112
2.31M
  // Create a new SLoc which is expanded from Expansion(FileLoc) but whose
1113
2.31M
  // characters come from spelling(FileLoc)+Offset.
1114
2.31M
  SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc);
1115
2.31M
  SpellingLoc = SpellingLoc.getLocWithOffset(CharNo);
1116
2.31M
1117
2.31M
  // Figure out the expansion loc range, which is the range covered by the
1118
2.31M
  // original _Pragma(...) sequence.
1119
2.31M
  CharSourceRange II = SM.getImmediateExpansionRange(FileLoc);
1120
2.31M
1121
2.31M
  return SM.createExpansionLoc(SpellingLoc, II.getBegin(), II.getEnd(), TokLen);
1122
2.31M
}
1123
1124
/// getSourceLocation - Return a source location identifier for the specified
1125
/// offset in the current file.
1126
SourceLocation Lexer::getSourceLocation(const char *Loc,
1127
2.09G
                                        unsigned TokLen) const {
1128
2.09G
  assert(Loc >= BufferStart && Loc <= BufferEnd &&
1129
2.09G
         "Location out of range for this buffer!");
1130
2.09G
1131
2.09G
  // In the normal case, we're just lexing from a simple file buffer, return
1132
2.09G
  // the file id from FileLoc with the offset specified.
1133
2.09G
  unsigned CharNo = Loc-BufferStart;
1134
2.09G
  if (FileLoc.isFileID())
1135
2.09G
    return FileLoc.getLocWithOffset(CharNo);
1136
2.31M
1137
2.31M
  // Otherwise, this is the _Pragma lexer case, which pretends that all of the
1138
2.31M
  // tokens are lexed from where the _Pragma was defined.
1139
2.31M
  assert(PP && "This doesn't work on raw lexers");
1140
2.31M
  return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen);
1141
2.31M
}
1142
1143
/// Diag - Forwarding function for diagnostics.  This translate a source
1144
/// position in the current buffer into a SourceLocation object for rendering.
1145
49.7k
DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const {
1146
49.7k
  return PP->Diag(getSourceLocation(Loc), DiagID);
1147
49.7k
}
1148
1149
//===----------------------------------------------------------------------===//
1150
// Trigraph and Escaped Newline Handling Code.
1151
//===----------------------------------------------------------------------===//
1152
1153
/// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair,
1154
/// return the decoded trigraph letter it corresponds to, or '\0' if nothing.
1155
252
static char GetTrigraphCharForLetter(char Letter) {
1156
252
  switch (Letter) {
1157
121
  default:   return 0;
1158
21
  case '=':  return '#';
1159
31
  case ')':  return ']';
1160
30
  case '(':  return '[';
1161
6
  case '!':  return '|';
1162
1
  case '\'': return '^';
1163
4
  case '>':  return '}';
1164
32
  case '/':  return '\\';
1165
3
  case '<':  return '{';
1166
3
  case '-':  return '~';
1167
252
  }
1168
252
}
1169
1170
/// DecodeTrigraphChar - If the specified character is a legal trigraph when
1171
/// prefixed with ??, emit a trigraph warning.  If trigraphs are enabled,
1172
/// return the result character.  Finally, emit a warning about trigraph use
1173
/// whether trigraphs are enabled or not.
1174
224
static char DecodeTrigraphChar(const char *CP, Lexer *L) {
1175
224
  char Res = GetTrigraphCharForLetter(*CP);
1176
224
  if (!Res || 
!L103
)
return Res134
;
1177
90
1178
90
  if (!L->getLangOpts().Trigraphs) {
1179
33
    if (!L->isLexingRawMode())
1180
23
      L->Diag(CP-2, diag::trigraph_ignored);
1181
33
    return 0;
1182
33
  }
1183
57
1184
57
  if (!L->isLexingRawMode())
1185
46
    L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1);
1186
57
  return Res;
1187
57
}
1188
1189
/// getEscapedNewLineSize - Return the size of the specified escaped newline,
1190
/// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a
1191
/// trigraph equivalent on entry to this function.
1192
4.82M
unsigned Lexer::getEscapedNewLineSize(const char *Ptr) {
1193
4.82M
  unsigned Size = 0;
1194
4.82M
  while (isWhitespace(Ptr[Size])) {
1195
4.82M
    ++Size;
1196
4.82M
1197
4.82M
    if (Ptr[Size-1] != '\n' && 
Ptr[Size-1] != '\r'257
)
1198
199
      continue;
1199
4.82M
1200
4.82M
    // If this is a \r\n or \n\r, skip the other half.
1201
4.82M
    if ((Ptr[Size] == '\r' || 
Ptr[Size] == '\n'4.82M
) &&
1202
4.82M
        
Ptr[Size-1] != Ptr[Size]86
)
1203
58
      ++Size;
1204
4.82M
1205
4.82M
    return Size;
1206
4.82M
  }
1207
4.82M
1208
4.82M
  // Not an escaped newline, must be a \t or something else.
1209
4.82M
  
return 0866
;
1210
4.82M
}
1211
1212
/// SkipEscapedNewLines - If P points to an escaped newline (or a series of
1213
/// them), skip over them and return the first non-escaped-newline found,
1214
/// otherwise return P.
1215
857
const char *Lexer::SkipEscapedNewLines(const char *P) {
1216
870
  while (true) {
1217
870
    const char *AfterEscape;
1218
870
    if (*P == '\\') {
1219
856
      AfterEscape = P+1;
1220
856
    } else 
if (14
*P == '?'14
) {
1221
1
      // If not a trigraph for escape, bail out.
1222
1
      if (P[1] != '?' || 
P[2] != '/'0
)
1223
1
        return P;
1224
0
      // FIXME: Take LangOpts into account; the language might not
1225
0
      // support trigraphs.
1226
0
      AfterEscape = P+3;
1227
13
    } else {
1228
13
      return P;
1229
13
    }
1230
856
1231
856
    unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape);
1232
856
    if (NewLineSize == 0) 
return P843
;
1233
13
    P = AfterEscape+NewLineSize;
1234
13
  }
1235
857
}
1236
1237
Optional<Token> Lexer::findNextToken(SourceLocation Loc,
1238
                                     const SourceManager &SM,
1239
497
                                     const LangOptions &LangOpts) {
1240
497
  if (Loc.isMacroID()) {
1241
3
    if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
1242
3
      return None;
1243
494
  }
1244
494
  Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts);
1245
494
1246
494
  // Break down the source location.
1247
494
  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
1248
494
1249
494
  // Try to load the file buffer.
1250
494
  bool InvalidTemp = false;
1251
494
  StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp);
1252
494
  if (InvalidTemp)
1253
0
    return None;
1254
494
1255
494
  const char *TokenBegin = File.data() + LocInfo.second;
1256
494
1257
494
  // Lex from the start of the given location.
1258
494
  Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(),
1259
494
                                      TokenBegin, File.end());
1260
494
  // Find the token.
1261
494
  Token Tok;
1262
494
  lexer.LexFromRawLexer(Tok);
1263
494
  return Tok;
1264
494
}
1265
1266
/// Checks that the given token is the first token that occurs after the
1267
/// given location (this excludes comments and whitespace). Returns the location
1268
/// immediately after the specified token. If the token is not found or the
1269
/// location is inside a macro, the returned source location will be invalid.
1270
SourceLocation Lexer::findLocationAfterToken(
1271
    SourceLocation Loc, tok::TokenKind TKind, const SourceManager &SM,
1272
438
    const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine) {
1273
438
  Optional<Token> Tok = findNextToken(Loc, SM, LangOpts);
1274
438
  if (!Tok || 
Tok->isNot(TKind)436
)
1275
53
    return {};
1276
385
  SourceLocation TokenLoc = Tok->getLocation();
1277
385
1278
385
  // Calculate how much whitespace needs to be skipped if any.
1279
385
  unsigned NumWhitespaceChars = 0;
1280
385
  if (SkipTrailingWhitespaceAndNewLine) {
1281
206
    const char *TokenEnd = SM.getCharacterData(TokenLoc) + Tok->getLength();
1282
206
    unsigned char C = *TokenEnd;
1283
415
    while (isHorizontalWhitespace(C)) {
1284
209
      C = *(++TokenEnd);
1285
209
      NumWhitespaceChars++;
1286
209
    }
1287
206
1288
206
    // Skip \r, \n, \r\n, or \n\r
1289
206
    if (C == '\n' || 
C == '\r'113
) {
1290
94
      char PrevC = C;
1291
94
      C = *(++TokenEnd);
1292
94
      NumWhitespaceChars++;
1293
94
      if ((C == '\n' || 
C == '\r'93
) &&
C != PrevC1
)
1294
1
        NumWhitespaceChars++;
1295
94
    }
1296
206
  }
1297
385
1298
385
  return TokenLoc.getLocWithOffset(Tok->getLength() + NumWhitespaceChars);
1299
385
}
1300
1301
/// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
1302
/// get its size, and return it.  This is tricky in several cases:
1303
///   1. If currently at the start of a trigraph, we warn about the trigraph,
1304
///      then either return the trigraph (skipping 3 chars) or the '?',
1305
///      depending on whether trigraphs are enabled or not.
1306
///   2. If this is an escaped newline (potentially with whitespace between
1307
///      the backslash and newline), implicitly skip the newline and return
1308
///      the char after it.
1309
///
1310
/// This handles the slow/uncommon case of the getCharAndSize method.  Here we
1311
/// know that we can accumulate into Size, and that we have already incremented
1312
/// Ptr by Size bytes.
1313
///
1314
/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
1315
/// be updated to match.
1316
char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
1317
10.2M
                               Token *Tok) {
1318
10.2M
  // If we have a slash, look for an escaped newline.
1319
10.2M
  if (Ptr[0] == '\\') {
1320
5.11M
    ++Size;
1321
5.11M
    ++Ptr;
1322
5.11M
Slash:
1323
5.11M
    // Common case, backslash-char where the char is not whitespace.
1324
5.11M
    if (!isWhitespace(Ptr[0])) 
return '\\'296k
;
1325
4.81M
1326
4.81M
    // See if we have optional whitespace characters between the slash and
1327
4.81M
    // newline.
1328
4.81M
    if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
1329
4.81M
      // Remember that this token needs to be cleaned.
1330
4.81M
      if (Tok) 
Tok->setFlag(Token::NeedsCleaning)4.81M
;
1331
4.81M
1332
4.81M
      // Warn if there was whitespace between the backslash and newline.
1333
4.81M
      if (Ptr[0] != '\n' && 
Ptr[0] != '\r'79
&&
Tok23
&&
!isLexingRawMode()16
)
1334
8
        Diag(Ptr, diag::backslash_newline_space);
1335
4.81M
1336
4.81M
      // Found backslash<whitespace><newline>.  Parse the char after it.
1337
4.81M
      Size += EscapedNewLineSize;
1338
4.81M
      Ptr  += EscapedNewLineSize;
1339
4.81M
1340
4.81M
      // Use slow version to accumulate a correct size field.
1341
4.81M
      return getCharAndSizeSlow(Ptr, Size, Tok);
1342
4.81M
    }
1343
24
1344
24
    // Otherwise, this is not an escaped newline, just return the slash.
1345
24
    return '\\';
1346
24
  }
1347
5.13M
1348
5.13M
  // If this is a trigraph, process it.
1349
5.13M
  if (Ptr[0] == '?' && 
Ptr[1] == '?'314k
) {
1350
224
    // If this is actually a legal trigraph (not something like "??x"), emit
1351
224
    // a trigraph warning.  If so, and if trigraphs are enabled, return it.
1352
224
    if (char C = DecodeTrigraphChar(Ptr+2, Tok ? this : nullptr)) {
1353
70
      // Remember that this token needs to be cleaned.
1354
70
      if (Tok) 
Tok->setFlag(Token::NeedsCleaning)57
;
1355
70
1356
70
      Ptr += 3;
1357
70
      Size += 3;
1358
70
      if (C == '\\') 
goto Slash18
;
1359
52
      return C;
1360
52
    }
1361
224
  }
1362
5.13M
1363
5.13M
  // If this is neither, return a single character.
1364
5.13M
  ++Size;
1365
5.13M
  return *Ptr;
1366
5.13M
}
1367
1368
/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
1369
/// getCharAndSizeNoWarn method.  Here we know that we can accumulate into Size,
1370
/// and that we have already incremented Ptr by Size bytes.
1371
///
1372
/// NOTE: When this method is updated, getCharAndSizeSlow (above) should
1373
/// be updated to match.
1374
char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
1375
19.7k
                                     const LangOptions &LangOpts) {
1376
19.7k
  // If we have a slash, look for an escaped newline.
1377
19.7k
  if (Ptr[0] == '\\') {
1378
10.4k
    ++Size;
1379
10.4k
    ++Ptr;
1380
10.4k
Slash:
1381
10.4k
    // Common case, backslash-char where the char is not whitespace.
1382
10.4k
    if (!isWhitespace(Ptr[0])) 
return '\\'1.24k
;
1383
9.24k
1384
9.24k
    // See if we have optional whitespace characters followed by a newline.
1385
9.24k
    if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
1386
9.24k
      // Found backslash<whitespace><newline>.  Parse the char after it.
1387
9.24k
      Size += EscapedNewLineSize;
1388
9.24k
      Ptr  += EscapedNewLineSize;
1389
9.24k
1390
9.24k
      // Use slow version to accumulate a correct size field.
1391
9.24k
      return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts);
1392
9.24k
    }
1393
0
1394
0
    // Otherwise, this is not an escaped newline, just return the slash.
1395
0
    return '\\';
1396
0
  }
1397
9.31k
1398
9.31k
  // If this is a trigraph, process it.
1399
9.31k
  if (LangOpts.Trigraphs && 
Ptr[0] == '?'5.24k
&&
Ptr[1] == '?'28
) {
1400
28
    // If this is actually a legal trigraph (not something like "??x"), return
1401
28
    // it.
1402
28
    if (char C = GetTrigraphCharForLetter(Ptr[2])) {
1403
28
      Ptr += 3;
1404
28
      Size += 3;
1405
28
      if (C == '\\') 
goto Slash8
;
1406
20
      return C;
1407
20
    }
1408
28
  }
1409
9.28k
1410
9.28k
  // If this is neither, return a single character.
1411
9.28k
  ++Size;
1412
9.28k
  return *Ptr;
1413
9.28k
}
1414
1415
//===----------------------------------------------------------------------===//
1416
// Helper methods for lexing.
1417
//===----------------------------------------------------------------------===//
1418
1419
/// Routine that indiscriminately sets the offset into the source file.
1420
417
void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) {
1421
417
  BufferPtr = BufferStart + Offset;
1422
417
  if (BufferPtr > BufferEnd)
1423
0
    BufferPtr = BufferEnd;
1424
417
  // FIXME: What exactly does the StartOfLine bit mean?  There are two
1425
417
  // possible meanings for the "start" of the line: the first token on the
1426
417
  // unexpanded line, or the first token on the expanded line.
1427
417
  IsAtStartOfLine = StartOfLine;
1428
417
  IsAtPhysicalStartOfLine = StartOfLine;
1429
417
}
1430
1431
695
static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts) {
1432
695
  if (LangOpts.AsmPreprocessor) {
1433
4
    return false;
1434
691
  } else if (LangOpts.DollarIdents && '$' == C) {
1435
3
    return true;
1436
688
  } else if (LangOpts.CPlusPlus11 || 
LangOpts.C11425
) {
1437
517
    static const llvm::sys::UnicodeCharSet C11AllowedIDChars(
1438
517
        C11AllowedIDCharRanges);
1439
517
    return C11AllowedIDChars.contains(C);
1440
517
  } else 
if (171
LangOpts.CPlusPlus171
) {
1441
53
    static const llvm::sys::UnicodeCharSet CXX03AllowedIDChars(
1442
53
        CXX03AllowedIDCharRanges);
1443
53
    return CXX03AllowedIDChars.contains(C);
1444
118
  } else {
1445
118
    static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
1446
118
        C99AllowedIDCharRanges);
1447
118
    return C99AllowedIDChars.contains(C);
1448
118
  }
1449
695
}
1450
1451
125
static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts) {
1452
125
  assert(isAllowedIDChar(C, LangOpts));
1453
125
  if (LangOpts.AsmPreprocessor) {
1454
0
    return false;
1455
125
  } else if (LangOpts.CPlusPlus11 || 
LangOpts.C1175
) {
1456
99
    static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars(
1457
99
        C11DisallowedInitialIDCharRanges);
1458
99
    return !C11DisallowedInitialIDChars.contains(C);
1459
99
  } else 
if (26
LangOpts.CPlusPlus26
) {
1460
6
    return true;
1461
20
  } else {
1462
20
    static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
1463
20
        C99DisallowedInitialIDCharRanges);
1464
20
    return !C99DisallowedInitialIDChars.contains(C);
1465
20
  }
1466
125
}
1467
1468
static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin,
1469
625
                                            const char *End) {
1470
625
  return CharSourceRange::getCharRange(L.getSourceLocation(Begin),
1471
625
                                       L.getSourceLocation(End));
1472
625
}
1473
1474
static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C,
1475
357
                                      CharSourceRange Range, bool IsFirst) {
1476
357
  // Check C99 compatibility.
1477
357
  if (!Diags.isIgnored(diag::warn_c99_compat_unicode_id, Range.getBegin())) {
1478
12
    enum {
1479
12
      CannotAppearInIdentifier = 0,
1480
12
      CannotStartIdentifier
1481
12
    };
1482
12
1483
12
    static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
1484
12
        C99AllowedIDCharRanges);
1485
12
    static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
1486
12
        C99DisallowedInitialIDCharRanges);
1487
12
    if (!C99AllowedIDChars.contains(C)) {
1488
5
      Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
1489
5
        << Range
1490
5
        << CannotAppearInIdentifier;
1491
7
    } else if (IsFirst && 
C99DisallowedInitialIDChars.contains(C)3
) {
1492
2
      Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
1493
2
        << Range
1494
2
        << CannotStartIdentifier;
1495
2
    }
1496
12
  }
1497
357
1498
357
  // Check C++98 compatibility.
1499
357
  if (!Diags.isIgnored(diag::warn_cxx98_compat_unicode_id, Range.getBegin())) {
1500
12
    static const llvm::sys::UnicodeCharSet CXX03AllowedIDChars(
1501
12
        CXX03AllowedIDCharRanges);
1502
12
    if (!CXX03AllowedIDChars.contains(C)) {
1503
5
      Diags.Report(Range.getBegin(), diag::warn_cxx98_compat_unicode_id)
1504
5
        << Range;
1505
5
    }
1506
12
  }
1507
357
}
1508
1509
/// After encountering UTF-8 character C and interpreting it as an identifier
1510
/// character, check whether it's a homoglyph for a common non-identifier
1511
/// source character that is unlikely to be an intentional identifier
1512
/// character and warn if so.
1513
static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C,
1514
237
                                       CharSourceRange Range) {
1515
237
  // FIXME: Handle Unicode quotation marks (smart quotes, fullwidth quotes).
1516
237
  struct HomoglyphPair {
1517
237
    uint32_t Character;
1518
237
    char LooksLike;
1519
1.39k
    bool operator<(HomoglyphPair R) const { return Character < R.Character; }
1520
237
  };
1521
237
  static constexpr HomoglyphPair SortedHomoglyphs[] = {
1522
237
    {U'\u00ad', 0},   // SOFT HYPHEN
1523
237
    {U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK
1524
237
    {U'\u037e', ';'}, // GREEK QUESTION MARK
1525
237
    {U'\u200b', 0},   // ZERO WIDTH SPACE
1526
237
    {U'\u200c', 0},   // ZERO WIDTH NON-JOINER
1527
237
    {U'\u200d', 0},   // ZERO WIDTH JOINER
1528
237
    {U'\u2060', 0},   // WORD JOINER
1529
237
    {U'\u2061', 0},   // FUNCTION APPLICATION
1530
237
    {U'\u2062', 0},   // INVISIBLE TIMES
1531
237
    {U'\u2063', 0},   // INVISIBLE SEPARATOR
1532
237
    {U'\u2064', 0},   // INVISIBLE PLUS
1533
237
    {U'\u2212', '-'}, // MINUS SIGN
1534
237
    {U'\u2215', '/'}, // DIVISION SLASH
1535
237
    {U'\u2216', '\\'}, // SET MINUS
1536
237
    {U'\u2217', '*'}, // ASTERISK OPERATOR
1537
237
    {U'\u2223', '|'}, // DIVIDES
1538
237
    {U'\u2227', '^'}, // LOGICAL AND
1539
237
    {U'\u2236', ':'}, // RATIO
1540
237
    {U'\u223c', '~'}, // TILDE OPERATOR
1541
237
    {U'\ua789', ':'}, // MODIFIER LETTER COLON
1542
237
    {U'\ufeff', 0},   // ZERO WIDTH NO-BREAK SPACE
1543
237
    {U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK
1544
237
    {U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN
1545
237
    {U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN
1546
237
    {U'\uff05', '%'}, // FULLWIDTH PERCENT SIGN
1547
237
    {U'\uff06', '&'}, // FULLWIDTH AMPERSAND
1548
237
    {U'\uff08', '('}, // FULLWIDTH LEFT PARENTHESIS
1549
237
    {U'\uff09', ')'}, // FULLWIDTH RIGHT PARENTHESIS
1550
237
    {U'\uff0a', '*'}, // FULLWIDTH ASTERISK
1551
237
    {U'\uff0b', '+'}, // FULLWIDTH ASTERISK
1552
237
    {U'\uff0c', ','}, // FULLWIDTH COMMA
1553
237
    {U'\uff0d', '-'}, // FULLWIDTH HYPHEN-MINUS
1554
237
    {U'\uff0e', '.'}, // FULLWIDTH FULL STOP
1555
237
    {U'\uff0f', '/'}, // FULLWIDTH SOLIDUS
1556
237
    {U'\uff1a', ':'}, // FULLWIDTH COLON
1557
237
    {U'\uff1b', ';'}, // FULLWIDTH SEMICOLON
1558
237
    {U'\uff1c', '<'}, // FULLWIDTH LESS-THAN SIGN
1559
237
    {U'\uff1d', '='}, // FULLWIDTH EQUALS SIGN
1560
237
    {U'\uff1e', '>'}, // FULLWIDTH GREATER-THAN SIGN
1561
237
    {U'\uff1f', '?'}, // FULLWIDTH QUESTION MARK
1562
237
    {U'\uff20', '@'}, // FULLWIDTH COMMERCIAL AT
1563
237
    {U'\uff3b', '['}, // FULLWIDTH LEFT SQUARE BRACKET
1564
237
    {U'\uff3c', '\\'}, // FULLWIDTH REVERSE SOLIDUS
1565
237
    {U'\uff3d', ']'}, // FULLWIDTH RIGHT SQUARE BRACKET
1566
237
    {U'\uff3e', '^'}, // FULLWIDTH CIRCUMFLEX ACCENT
1567
237
    {U'\uff5b', '{'}, // FULLWIDTH LEFT CURLY BRACKET
1568
237
    {U'\uff5c', '|'}, // FULLWIDTH VERTICAL LINE
1569
237
    {U'\uff5d', '}'}, // FULLWIDTH RIGHT CURLY BRACKET
1570
237
    {U'\uff5e', '~'}, // FULLWIDTH TILDE
1571
237
    {0, 0}
1572
237
  };
1573
237
  auto Homoglyph =
1574
237
      std::lower_bound(std::begin(SortedHomoglyphs),
1575
237
                       std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'});
1576
237
  if (Homoglyph->Character == C) {
1577
59
    llvm::SmallString<5> CharBuf;
1578
59
    {
1579
59
      llvm::raw_svector_ostream CharOS(CharBuf);
1580
59
      llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4);
1581
59
    }
1582
59
    if (Homoglyph->LooksLike) {
1583
48
      const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};
1584
48
      Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)
1585
48
          << Range << CharBuf << LooksLikeStr;
1586
48
    } else {
1587
11
      Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width)
1588
11
          << Range << CharBuf;
1589
11
    }
1590
59
  }
1591
237
}
1592
1593
bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
1594
174
                                    Token &Result) {
1595
174
  const char *UCNPtr = CurPtr + Size;
1596
174
  uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/nullptr);
1597
174
  if (CodePoint == 0 || 
!isAllowedIDChar(CodePoint, LangOpts)146
)
1598
54
    return false;
1599
120
1600
120
  if (!isLexingRawMode())
1601
120
    maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
1602
120
                              makeCharRange(*this, CurPtr, UCNPtr),
1603
120
                              /*IsFirst=*/false);
1604
120
1605
120
  Result.setFlag(Token::HasUCN);
1606
120
  if ((UCNPtr - CurPtr ==  6 && 
CurPtr[1] == 'u'94
) ||
1607
120
      
(26
UCNPtr - CurPtr == 1026
&&
CurPtr[1] == 'U'17
))
1608
105
    CurPtr = UCNPtr;
1609
15
  else
1610
105
    
while (15
CurPtr != UCNPtr)
1611
90
      (void)getAndAdvanceChar(CurPtr, Result);
1612
120
  return true;
1613
120
}
1614
1615
226
bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) {
1616
226
  const char *UnicodePtr = CurPtr;
1617
226
  llvm::UTF32 CodePoint;
1618
226
  llvm::ConversionResult Result =
1619
226
      llvm::convertUTF8Sequence((const llvm::UTF8 **)&UnicodePtr,
1620
226
                                (const llvm::UTF8 *)BufferEnd,
1621
226
                                &CodePoint,
1622
226
                                llvm::strictConversion);
1623
226
  if (Result != llvm::conversionOK ||
1624
226
      
!isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts)211
)
1625
45
    return false;
1626
181
1627
181
  if (!isLexingRawMode()) {
1628
155
    maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
1629
155
                              makeCharRange(*this, CurPtr, UnicodePtr),
1630
155
                              /*IsFirst=*/false);
1631
155
    maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint,
1632
155
                               makeCharRange(*this, CurPtr, UnicodePtr));
1633
155
  }
1634
181
1635
181
  CurPtr = UnicodePtr;
1636
181
  return true;
1637
181
}
1638
1639
960M
bool Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
1640
960M
  // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
1641
960M
  unsigned Size;
1642
960M
  unsigned char C = *CurPtr++;
1643
16.1G
  while (isIdentifierBody(C))
1644
15.2G
    C = *CurPtr++;
1645
960M
1646
960M
  --CurPtr;   // Back up over the skipped character.
1647
960M
1648
960M
  // Fast path, no $,\,? in identifier found.  '\' might be an escaped newline
1649
960M
  // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN.
1650
960M
  //
1651
960M
  // TODO: Could merge these checks into an InfoTable flag to make the
1652
960M
  // comparison cheaper
1653
960M
  if (
isASCII(C)960M
&& C != '\\' &&
C != '?'960M
&&
1654
960M
      
(960M
C != '$'960M
||
!LangOpts.DollarIdents2.51k
)) {
1655
960M
FinishIdentifier:
1656
960M
    const char *IdStart = BufferPtr;
1657
960M
    FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
1658
960M
    Result.setRawIdentifierData(IdStart);
1659
960M
1660
960M
    // If we are in raw mode, return this identifier raw.  There is no need to
1661
960M
    // look up identifier information or attempt to macro expand it.
1662
960M
    if (LexingRawMode)
1663
589M
      return true;
1664
371M
1665
371M
    // Fill in Result.IdentifierInfo and update the token kind,
1666
371M
    // looking up the identifier in the identifier table.
1667
371M
    IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);
1668
371M
    // Note that we have to call PP->LookUpIdentifierInfo() even for code
1669
371M
    // completion, it writes IdentifierInfo into Result, and callers rely on it.
1670
371M
1671
371M
    // If the completion point is at the end of an identifier, we want to treat
1672
371M
    // the identifier as incomplete even if it resolves to a macro or a keyword.
1673
371M
    // This allows e.g. 'class^' to complete to 'classifier'.
1674
371M
    if (isCodeCompletionPoint(CurPtr)) {
1675
85
      // Return the code-completion token.
1676
85
      Result.setKind(tok::code_completion);
1677
85
      // Skip the code-completion char and all immediate identifier characters.
1678
85
      // This ensures we get consistent behavior when completing at any point in
1679
85
      // an identifier (i.e. at the start, in the middle, at the end). Note that
1680
85
      // only simple cases (i.e. [a-zA-Z0-9_]) are supported to keep the code
1681
85
      // simpler.
1682
85
      assert(*CurPtr == 0 && "Completion character must be 0");
1683
85
      ++CurPtr;
1684
85
      // Note that code completion token is not added as a separate character
1685
85
      // when the completion point is at the end of the buffer. Therefore, we need
1686
85
      // to check if the buffer has ended.
1687
85
      if (CurPtr < BufferEnd) {
1688
131
        while (isIdentifierBody(*CurPtr))
1689
47
          ++CurPtr;
1690
84
      }
1691
85
      BufferPtr = CurPtr;
1692
85
      return true;
1693
85
    }
1694
371M
1695
371M
    // Finally, now that we know we have an identifier, pass this off to the
1696
371M
    // preprocessor, which may macro expand it or something.
1697
371M
    if (II->isHandleIdentifierCase())
1698
34.1M
      return PP->HandleIdentifier(Result);
1699
337M
1700
337M
    return true;
1701
337M
  }
1702
2.80k
1703
2.80k
  // Otherwise, $,\,? in identifier found.  Enter slower path.
1704
2.80k
1705
2.80k
  C = getCharAndSize(CurPtr, Size);
1706
11.2k
  while (
true8.92k
) {
1707
11.2k
    if (C == '$') {
1708
2.89k
      // If we hit a $ and they are not supported in identifiers, we are done.
1709
2.89k
      if (!LangOpts.DollarIdents) 
goto FinishIdentifier0
;
1710
2.89k
1711
2.89k
      // Otherwise, emit a diagnostic and continue.
1712
2.89k
      if (!isLexingRawMode())
1713
2.89k
        Diag(CurPtr, diag::ext_dollar_in_identifier);
1714
2.89k
      CurPtr = ConsumeChar(CurPtr, Size, Result);
1715
2.89k
      C = getCharAndSize(CurPtr, Size);
1716
2.89k
      continue;
1717
8.31k
    } else if (C == '\\' && 
tryConsumeIdentifierUCN(CurPtr, Size, Result)144
) {
1718
98
      C = getCharAndSize(CurPtr, Size);
1719
98
      continue;
1720
8.21k
    } else if (!isASCII(C) && 
tryConsumeIdentifierUTF8Char(CurPtr)195
) {
1721
163
      C = getCharAndSize(CurPtr, Size);
1722
163
      continue;
1723
8.05k
    } else if (!isIdentifierBody(C)) {
1724
5.07k
      goto FinishIdentifier;
1725
5.07k
    }
1726
2.97k
1727
2.97k
    // Otherwise, this character is good, consume it.
1728
2.97k
    CurPtr = ConsumeChar(CurPtr, Size, Result);
1729
2.97k
1730
2.97k
    C = getCharAndSize(CurPtr, Size);
1731
32.5k
    while (isIdentifierBody(C)) {
1732
29.5k
      CurPtr = ConsumeChar(CurPtr, Size, Result);
1733
29.5k
      C = getCharAndSize(CurPtr, Size);
1734
29.5k
    }
1735
2.97k
  }
1736
2.80k
}
1737
1738
/// isHexaLiteral - Return true if Start points to a hex constant.
1739
/// in microsoft mode (where this is supposed to be several different tokens).
1740
54.1k
bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) {
1741
54.1k
  unsigned Size;
1742
54.1k
  char C1 = Lexer::getCharAndSizeNoWarn(Start, Size, LangOpts);
1743
54.1k
  if (C1 != '0')
1744
37.2k
    return false;
1745
16.8k
  char C2 = Lexer::getCharAndSizeNoWarn(Start + Size, Size, LangOpts);
1746
16.8k
  return (C2 == 'x' || 
C2 == 'X'15
);
1747
16.8k
}
1748
1749
/// LexNumericConstant - Lex the remainder of a integer or floating point
1750
/// constant. From[-1] is the first character lexed.  Return the end of the
1751
/// constant.
1752
73.1M
bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
1753
73.1M
  unsigned Size;
1754
73.1M
  char C = getCharAndSize(CurPtr, Size);
1755
73.1M
  char PrevCh = 0;
1756
258M
  while (isPreprocessingNumberBody(C)) {
1757
185M
    CurPtr = ConsumeChar(CurPtr, Size, Result);
1758
185M
    PrevCh = C;
1759
185M
    C = getCharAndSize(CurPtr, Size);
1760
185M
  }
1761
73.1M
1762
73.1M
  // If we fell out, check for a sign, due to 1e+12.  If we have one, continue.
1763
73.1M
  if ((C == '-' || 
C == '+'72.5M
) &&
(831k
PrevCh == 'E'831k
||
PrevCh == 'e'831k
)) {
1764
693k
    // If we are in Microsoft mode, don't continue if the constant is hex.
1765
693k
    // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1
1766
693k
    if (!LangOpts.MicrosoftExt || 
!isHexaLiteral(BufferPtr, LangOpts)37.2k
)
1767
693k
      return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
1768
72.4M
  }
1769
72.4M
1770
72.4M
  // If we have a hex FP constant, continue.
1771
72.4M
  if ((C == '-' || 
C == '+'72.4M
) &&
(138k
PrevCh == 'P'138k
||
PrevCh == 'p'138k
)) {
1772
24.7k
    // Outside C99 and C++17, we accept hexadecimal floating point numbers as a
1773
24.7k
    // not-quite-conforming extension. Only do so if this looks like it's
1774
24.7k
    // actually meant to be a hexfloat, and not if it has a ud-suffix.
1775
24.7k
    bool IsHexFloat = true;
1776
24.7k
    if (!LangOpts.C99) {
1777
16.8k
      if (!isHexaLiteral(BufferPtr, LangOpts))
1778
9
        IsHexFloat = false;
1779
16.8k
      else if (!getLangOpts().CPlusPlus17 &&
1780
16.8k
               
std::find(BufferPtr, CurPtr, '_') != CurPtr16.5k
)
1781
3
        IsHexFloat = false;
1782
16.8k
    }
1783
24.7k
    if (IsHexFloat)
1784
24.7k
      return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
1785
72.4M
  }
1786
72.4M
1787
72.4M
  // If we have a digit separator, continue.
1788
72.4M
  if (C == '\'' && 
getLangOpts().CPlusPlus14311
) {
1789
295
    unsigned NextSize;
1790
295
    char Next = getCharAndSizeNoWarn(CurPtr + Size, NextSize, getLangOpts());
1791
295
    if (isIdentifierBody(Next)) {
1792
285
      if (!isLexingRawMode())
1793
253
        Diag(CurPtr, diag::warn_cxx11_compat_digit_separator);
1794
285
      CurPtr = ConsumeChar(CurPtr, Size, Result);
1795
285
      CurPtr = ConsumeChar(CurPtr, NextSize, Result);
1796
285
      return LexNumericConstant(Result, CurPtr);
1797
285
    }
1798
72.4M
  }
1799
72.4M
1800
72.4M
  // If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue.
1801
72.4M
  if (C == '\\' && 
tryConsumeIdentifierUCN(CurPtr, Size, Result)5
)
1802
4
    return LexNumericConstant(Result, CurPtr);
1803
72.4M
  if (!isASCII(C) && 
tryConsumeIdentifierUTF8Char(CurPtr)15
)
1804
4
    return LexNumericConstant(Result, CurPtr);
1805
72.4M
1806
72.4M
  // Update the location of token as well as BufferPtr.
1807
72.4M
  const char *TokStart = BufferPtr;
1808
72.4M
  FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
1809
72.4M
  Result.setLiteralData(TokStart);
1810
72.4M
  return true;
1811
72.4M
}
1812
1813
/// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes
1814
/// in C++11, or warn on a ud-suffix in C++98.
1815
const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
1816
5.76M
                               bool IsStringLiteral) {
1817
5.76M
  assert(getLangOpts().CPlusPlus);
1818
5.76M
1819
5.76M
  // Maximally munch an identifier.
1820
5.76M
  unsigned Size;
1821
5.76M
  char C = getCharAndSize(CurPtr, Size);
1822
5.76M
  bool Consumed = false;
1823
5.76M
1824
5.76M
  if (!isIdentifierHead(C)) {
1825
5.75M
    if (C == '\\' && 
tryConsumeIdentifierUCN(CurPtr, Size, Result)7
)
1826
0
      Consumed = true;
1827
5.75M
    else if (!isASCII(C) && 
tryConsumeIdentifierUTF8Char(CurPtr)0
)
1828
0
      Consumed = true;
1829
5.75M
    else
1830
5.75M
      return CurPtr;
1831
7.48k
  }
1832
7.48k
1833
7.48k
  if (!getLangOpts().CPlusPlus11) {
1834
17
    if (!isLexingRawMode())
1835
8
      Diag(CurPtr,
1836
8
           C == '_' ? 
diag::warn_cxx11_compat_user_defined_literal3
1837
8
                    : 
diag::warn_cxx11_compat_reserved_user_defined_literal5
)
1838
8
        << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
1839
17
    return CurPtr;
1840
17
  }
1841
7.46k
1842
7.46k
  // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix
1843
7.46k
  // that does not start with an underscore is ill-formed. As a conforming
1844
7.46k
  // extension, we treat all such suffixes as if they had whitespace before
1845
7.46k
  // them. We assume a suffix beginning with a UCN or UTF-8 character is more
1846
7.46k
  // likely to be a ud-suffix than a macro, however, and accept that.
1847
7.46k
  
if (7.46k
!Consumed7.46k
) {
1848
7.46k
    bool IsUDSuffix = false;
1849
7.46k
    if (C == '_')
1850
222
      IsUDSuffix = true;
1851
7.24k
    else if (IsStringLiteral && 
getLangOpts().CPlusPlus147.23k
) {
1852
287
      // In C++1y, we need to look ahead a few characters to see if this is a
1853
287
      // valid suffix for a string literal or a numeric literal (this could be
1854
287
      // the 'operator""if' defining a numeric literal operator).
1855
287
      const unsigned MaxStandardSuffixLength = 3;
1856
287
      char Buffer[MaxStandardSuffixLength] = { C };
1857
287
      unsigned Consumed = Size;
1858
287
      unsigned Chars = 1;
1859
495
      while (true) {
1860
495
        unsigned NextSize;
1861
495
        char Next = getCharAndSizeNoWarn(CurPtr + Consumed, NextSize,
1862
495
                                         getLangOpts());
1863
495
        if (!isIdentifierBody(Next)) {
1864
283
          // End of suffix. Check whether this is on the whitelist.
1865
283
          const StringRef CompleteSuffix(Buffer, Chars);
1866
283
          IsUDSuffix = StringLiteralParser::isValidUDSuffix(getLangOpts(),
1867
283
                                                            CompleteSuffix);
1868
283
          break;
1869
283
        }
1870
212
1871
212
        if (Chars == MaxStandardSuffixLength)
1872
4
          // Too long: can't be a standard suffix.
1873
4
          break;
1874
208
1875
208
        Buffer[Chars++] = Next;
1876
208
        Consumed += NextSize;
1877
208
      }
1878
287
    }
1879
7.46k
1880
7.46k
    if (!IsUDSuffix) {
1881
6.99k
      if (!isLexingRawMode())
1882
9
        Diag(CurPtr, getLangOpts().MSVCCompat
1883
9
                         ? 
diag::ext_ms_reserved_user_defined_literal0
1884
9
                         : diag::ext_reserved_user_defined_literal)
1885
9
          << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
1886
6.99k
      return CurPtr;
1887
6.99k
    }
1888
475
1889
475
    CurPtr = ConsumeChar(CurPtr, Size, Result);
1890
475
  }
1891
7.46k
1892
7.46k
  Result.setFlag(Token::HasUDSuffix);
1893
1.27k
  while (
true1.27k
) {
1894
1.27k
    C = getCharAndSize(CurPtr, Size);
1895
1.27k
    if (isIdentifierBody(C)) 
{ CurPtr = ConsumeChar(CurPtr, Size, Result); }772
1896
507
    else if (C == '\\' && 
tryConsumeIdentifierUCN(CurPtr, Size, Result)18
)
{}18
1897
489
    else if (!isASCII(C) && 
tryConsumeIdentifierUTF8Char(CurPtr)16
)
{}14
1898
475
    else break;
1899
1.27k
  }
1900
471
1901
471
  return CurPtr;
1902
7.46k
}
1903
1904
/// LexStringLiteral - Lex the remainder of a string literal, after having lexed
1905
/// either " or L" or u8" or u" or U".
1906
bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
1907
9.65M
                             tok::TokenKind Kind) {
1908
9.65M
  const char *AfterQuote = CurPtr;
1909
9.65M
  // Does this string contain the \0 character?
1910
9.65M
  const char *NulCharacter = nullptr;
1911
9.65M
1912
9.65M
  if (!isLexingRawMode() &&
1913
9.65M
      
(7.50M
Kind == tok::utf8_string_literal7.50M
||
1914
7.50M
       
Kind == tok::utf16_string_literal7.50M
||
1915
7.50M
       
Kind == tok::utf32_string_literal7.50M
))
1916
438
    Diag(BufferPtr, getLangOpts().CPlusPlus
1917
438
           ? 
diag::warn_cxx98_compat_unicode_literal365
1918
438
           : 
diag::warn_c99_compat_unicode_literal73
);
1919
9.65M
1920
9.65M
  char C = getAndAdvanceChar(CurPtr, Result);
1921
115M
  while (C != '"') {
1922
105M
    // Skip escaped characters.  Escaped newlines will already be processed by
1923
105M
    // getAndAdvanceChar.
1924
105M
    if (C == '\\')
1925
276k
      C = getAndAdvanceChar(CurPtr, Result);
1926
105M
1927
105M
    if (C == '\n' || 
C == '\r'105M
|| // Newline.
1928
105M
        
(105M
C == 0105M
&&
CurPtr-1 == BufferEnd41
)) { // End of file.
1929
68
      if (!isLexingRawMode() && 
!LangOpts.AsmPreprocessor7
)
1930
5
        Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 1;
1931
68
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
1932
68
      return true;
1933
68
    }
1934
105M
1935
105M
    if (C == 0) {
1936
19
      if (isCodeCompletionPoint(CurPtr-1)) {
1937
10
        if (ParsingFilename)
1938
3
          codeCompleteIncludedFile(AfterQuote, CurPtr - 1, /*IsAngled=*/false);
1939
7
        else
1940
7
          PP->CodeCompleteNaturalLanguage();
1941
10
        FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
1942
10
        cutOffLexing();
1943
10
        return true;
1944
10
      }
1945
9
1946
9
      NulCharacter = CurPtr-1;
1947
9
    }
1948
105M
    C = getAndAdvanceChar(CurPtr, Result);
1949
105M
  }
1950
9.65M
1951
9.65M
  // If we are in C++11, lex the optional ud-suffix.
1952
9.65M
  
if (9.65M
getLangOpts().CPlusPlus9.65M
)
1953
5.50M
    CurPtr = LexUDSuffix(Result, CurPtr, true);
1954
9.65M
1955
9.65M
  // If a nul character existed in the string, warn about it.
1956
9.65M
  if (NulCharacter && 
!isLexingRawMode()8
)
1957
2
    Diag(NulCharacter, diag::null_in_char_or_string) << 1;
1958
9.65M
1959
9.65M
  // Update the location of the token as well as the BufferPtr instance var.
1960
9.65M
  const char *TokStart = BufferPtr;
1961
9.65M
  FormTokenWithChars(Result, CurPtr, Kind);
1962
9.65M
  Result.setLiteralData(TokStart);
1963
9.65M
  return true;
1964
9.65M
}
1965
1966
/// LexRawStringLiteral - Lex the remainder of a raw string literal, after
1967
/// having lexed R", LR", u8R", uR", or UR".
1968
bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
1969
465
                                tok::TokenKind Kind) {
1970
465
  // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3:
1971
465
  //  Between the initial and final double quote characters of the raw string,
1972
465
  //  any transformations performed in phases 1 and 2 (trigraphs,
1973
465
  //  universal-character-names, and line splicing) are reverted.
1974
465
1975
465
  if (!isLexingRawMode())
1976
104
    Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal);
1977
465
1978
465
  unsigned PrefixLen = 0;
1979
465
1980
1.28k
  while (PrefixLen != 16 && 
isRawStringDelimBody(CurPtr[PrefixLen])1.28k
)
1981
819
    ++PrefixLen;
1982
465
1983
465
  // If the last character was not a '(', then we didn't lex a valid delimiter.
1984
465
  if (CurPtr[PrefixLen] != '(') {
1985
1
    if (!isLexingRawMode()) {
1986
1
      const char *PrefixEnd = &CurPtr[PrefixLen];
1987
1
      if (PrefixLen == 16) {
1988
1
        Diag(PrefixEnd, diag::err_raw_delim_too_long);
1989
1
      } else {
1990
0
        Diag(PrefixEnd, diag::err_invalid_char_raw_delim)
1991
0
          << StringRef(PrefixEnd, 1);
1992
0
      }
1993
1
    }
1994
1
1995
1
    // Search for the next '"' in hopes of salvaging the lexer. Unfortunately,
1996
1
    // it's possible the '"' was intended to be part of the raw string, but
1997
1
    // there's not much we can do about that.
1998
59
    while (true) {
1999
59
      char C = *CurPtr++;
2000
59
2001
59
      if (C == '"')
2002
1
        break;
2003
58
      if (C == 0 && 
CurPtr-1 == BufferEnd0
) {
2004
0
        --CurPtr;
2005
0
        break;
2006
0
      }
2007
58
    }
2008
1
2009
1
    FormTokenWithChars(Result, CurPtr, tok::unknown);
2010
1
    return true;
2011
1
  }
2012
464
2013
464
  // Save prefix and move CurPtr past it
2014
464
  const char *Prefix = CurPtr;
2015
464
  CurPtr += PrefixLen + 1; // skip over prefix and '('
2016
464
2017
8.17k
  while (true) {
2018
8.17k
    char C = *CurPtr++;
2019
8.17k
2020
8.17k
    if (C == ')') {
2021
468
      // Check for prefix match and closing quote.
2022
468
      if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && 
CurPtr[PrefixLen] == '"'460
) {
2023
460
        CurPtr += PrefixLen + 1; // skip over prefix and '"'
2024
460
        break;
2025
460
      }
2026
7.70k
    } else if (C == 0 && 
CurPtr-1 == BufferEnd4
) { // End of file.
2027
4
      if (!isLexingRawMode())
2028
1
        Diag(BufferPtr, diag::err_unterminated_raw_string)
2029
1
          << StringRef(Prefix, PrefixLen);
2030
4
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2031
4
      return true;
2032
4
    }
2033
8.17k
  }
2034
464
2035
464
  // If we are in C++11, lex the optional ud-suffix.
2036
464
  
if (460
getLangOpts().CPlusPlus460
)
2037
460
    CurPtr = LexUDSuffix(Result, CurPtr, true);
2038
460
2039
460
  // Update the location of token as well as BufferPtr.
2040
460
  const char *TokStart = BufferPtr;
2041
460
  FormTokenWithChars(Result, CurPtr, Kind);
2042
460
  Result.setLiteralData(TokStart);
2043
460
  return true;
2044
464
}
2045
2046
/// LexAngledStringLiteral - Lex the remainder of an angled string literal,
2047
/// after having lexed the '<' character.  This is used for #include filenames.
2048
2.04M
bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
2049
2.04M
  // Does this string contain the \0 character?
2050
2.04M
  const char *NulCharacter = nullptr;
2051
2.04M
  const char *AfterLessPos = CurPtr;
2052
2.04M
  char C = getAndAdvanceChar(CurPtr, Result);
2053
40.4M
  while (C != '>') {
2054
38.3M
    // Skip escaped characters.  Escaped newlines will already be processed by
2055
38.3M
    // getAndAdvanceChar.
2056
38.3M
    if (C == '\\')
2057
6
      C = getAndAdvanceChar(CurPtr, Result);
2058
38.3M
2059
38.3M
    if (C == '\n' || 
C == '\r'38.3M
|| // Newline.
2060
38.3M
        
(38.3M
C == 038.3M
&&
(CurPtr - 1 == BufferEnd)10
)) { // End of file.
2061
10
      // If the filename is unterminated, then it must just be a lone <
2062
10
      // character.  Return this as such.
2063
10
      FormTokenWithChars(Result, AfterLessPos, tok::less);
2064
10
      return true;
2065
10
    }
2066
38.3M
2067
38.3M
    if (C == 0) {
2068
7
      if (isCodeCompletionPoint(CurPtr - 1)) {
2069
6
        codeCompleteIncludedFile(AfterLessPos, CurPtr - 1, /*IsAngled=*/true);
2070
6
        cutOffLexing();
2071
6
        FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
2072
6
        return true;
2073
6
      }
2074
1
      NulCharacter = CurPtr-1;
2075
1
    }
2076
38.3M
    C = getAndAdvanceChar(CurPtr, Result);
2077
38.3M
  }
2078
2.04M
2079
2.04M
  // If a nul character existed in the string, warn about it.
2080
2.04M
  
if (2.04M
NulCharacter2.04M
&&
!isLexingRawMode()1
)
2081
1
    Diag(NulCharacter, diag::null_in_char_or_string) << 1;
2082
2.04M
2083
2.04M
  // Update the location of token as well as BufferPtr.
2084
2.04M
  const char *TokStart = BufferPtr;
2085
2.04M
  FormTokenWithChars(Result, CurPtr, tok::header_name);
2086
2.04M
  Result.setLiteralData(TokStart);
2087
2.04M
  return true;
2088
2.04M
}
2089
2090
void Lexer::codeCompleteIncludedFile(const char *PathStart,
2091
                                     const char *CompletionPoint,
2092
9
                                     bool IsAngled) {
2093
9
  // Completion only applies to the filename, after the last slash.
2094
9
  StringRef PartialPath(PathStart, CompletionPoint - PathStart);
2095
9
  auto Slash = PartialPath.find_last_of(LangOpts.MSVCCompat ? 
"/\\"1
:
"/"8
);
2096
9
  StringRef Dir =
2097
9
      (Slash == StringRef::npos) ? 
""5
:
PartialPath.take_front(Slash)4
;
2098
9
  const char *StartOfFilename =
2099
9
      (Slash == StringRef::npos) ? 
PathStart5
:
PathStart + Slash + 14
;
2100
9
  // Code completion filter range is the filename only, up to completion point.
2101
9
  PP->setCodeCompletionIdentifierInfo(&PP->getIdentifierTable().get(
2102
9
      StringRef(StartOfFilename, CompletionPoint - StartOfFilename)));
2103
9
  // We should replace the characters up to the closing quote, if any.
2104
69
  while (CompletionPoint < BufferEnd) {
2105
69
    char Next = *(CompletionPoint + 1);
2106
69
    if (Next == 0 || Next == '\r' || Next == '\n')
2107
0
      break;
2108
69
    ++CompletionPoint;
2109
69
    if (Next == (IsAngled ? 
'>'51
:
'"'18
))
2110
9
      break;
2111
69
  }
2112
9
  PP->setCodeCompletionTokenRange(
2113
9
      FileLoc.getLocWithOffset(StartOfFilename - BufferStart),
2114
9
      FileLoc.getLocWithOffset(CompletionPoint - BufferStart));
2115
9
  PP->CodeCompleteIncludedFile(Dir, IsAngled);
2116
9
}
2117
2118
/// LexCharConstant - Lex the remainder of a character constant, after having
2119
/// lexed either ' or L' or u8' or u' or U'.
2120
bool Lexer::LexCharConstant(Token &Result, const char *CurPtr,
2121
1.47M
                            tok::TokenKind Kind) {
2122
1.47M
  // Does this character contain the \0 character?
2123
1.47M
  const char *NulCharacter = nullptr;
2124
1.47M
2125
1.47M
  if (!isLexingRawMode()) {
2126
1.42M
    if (Kind == tok::utf16_char_constant || 
Kind == tok::utf32_char_constant1.42M
)
2127
153
      Diag(BufferPtr, getLangOpts().CPlusPlus
2128
153
                          ? 
diag::warn_cxx98_compat_unicode_literal139
2129
153
                          : 
diag::warn_c99_compat_unicode_literal14
);
2130
1.42M
    else if (Kind == tok::utf8_char_constant)
2131
30
      Diag(BufferPtr, diag::warn_cxx14_compat_u8_character_literal);
2132
1.42M
  }
2133
1.47M
2134
1.47M
  char C = getAndAdvanceChar(CurPtr, Result);
2135
1.47M
  if (C == '\'') {
2136
28
    if (!isLexingRawMode() && 
!LangOpts.AsmPreprocessor2
)
2137
0
      Diag(BufferPtr, diag::ext_empty_character);
2138
28
    FormTokenWithChars(Result, CurPtr, tok::unknown);
2139
28
    return true;
2140
28
  }
2141
1.47M
2142
6.95M
  
while (1.47M
C != '\'') {
2143
5.47M
    // Skip escaped characters.
2144
5.47M
    if (C == '\\')
2145
17.7k
      C = getAndAdvanceChar(CurPtr, Result);
2146
5.47M
2147
5.47M
    if (C == '\n' || 
C == '\r'5.47M
|| // Newline.
2148
5.47M
        
(5.47M
C == 05.47M
&&
CurPtr-1 == BufferEnd19
)) { // End of file.
2149
50
      if (!isLexingRawMode() && 
!LangOpts.AsmPreprocessor6
)
2150
3
        Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 0;
2151
50
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2152
50
      return true;
2153
50
    }
2154
5.47M
2155
5.47M
    if (C == 0) {
2156
14
      if (isCodeCompletionPoint(CurPtr-1)) {
2157
6
        PP->CodeCompleteNaturalLanguage();
2158
6
        FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2159
6
        cutOffLexing();
2160
6
        return true;
2161
6
      }
2162
8
2163
8
      NulCharacter = CurPtr-1;
2164
8
    }
2165
5.47M
    C = getAndAdvanceChar(CurPtr, Result);
2166
5.47M
  }
2167
1.47M
2168
1.47M
  // If we are in C++11, lex the optional ud-suffix.
2169
1.47M
  
if (1.47M
getLangOpts().CPlusPlus1.47M
)
2170
258k
    CurPtr = LexUDSuffix(Result, CurPtr, false);
2171
1.47M
2172
1.47M
  // If a nul character existed in the character, warn about it.
2173
1.47M
  if (NulCharacter && 
!isLexingRawMode()8
)
2174
2
    Diag(NulCharacter, diag::null_in_char_or_string) << 0;
2175
1.47M
2176
1.47M
  // Update the location of token as well as BufferPtr.
2177
1.47M
  const char *TokStart = BufferPtr;
2178
1.47M
  FormTokenWithChars(Result, CurPtr, Kind);
2179
1.47M
  Result.setLiteralData(TokStart);
2180
1.47M
  return true;
2181
1.47M
}
2182
2183
/// SkipWhitespace - Efficiently skip over a series of whitespace characters.
2184
/// Update BufferPtr to point to the next non-whitespace character and return.
2185
///
2186
/// This method forms a token and returns true if KeepWhitespaceMode is enabled.
2187
bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr,
2188
270M
                           bool &TokAtPhysicalStartOfLine) {
2189
270M
  // Whitespace - Skip it, then return the token after the whitespace.
2190
270M
  bool SawNewline = isVerticalWhitespace(CurPtr[-1]);
2191
270M
2192
270M
  unsigned char Char = *CurPtr;
2193
270M
2194
270M
  // Skip consecutive spaces efficiently.
2195
295M
  while (
true295M
) {
2196
295M
    // Skip horizontal whitespace very aggressively.
2197
1.97G
    while (isHorizontalWhitespace(Char))
2198
1.67G
      Char = *++CurPtr;
2199
295M
2200
295M
    // Otherwise if we have something other than whitespace, we're done.
2201
295M
    if (!isVerticalWhitespace(Char))
2202
270M
      break;
2203
24.6M
2204
24.6M
    if (ParsingPreprocessorDirective) {
2205
5.36k
      // End of preprocessor directive line, let LexTokenInternal handle this.
2206
5.36k
      BufferPtr = CurPtr;
2207
5.36k
      return false;
2208
5.36k
    }
2209
24.6M
2210
24.6M
    // OK, but handle newline.
2211
24.6M
    SawNewline = true;
2212
24.6M
    Char = *++CurPtr;
2213
24.6M
  }
2214
270M
2215
270M
  // If the client wants us to return whitespace, return it now.
2216
270M
  
if (270M
isKeepWhitespaceMode()270M
) {
2217
55.9k
    FormTokenWithChars(Result, CurPtr, tok::unknown);
2218
55.9k
    if (SawNewline) {
2219
55.2k
      IsAtStartOfLine = true;
2220
55.2k
      IsAtPhysicalStartOfLine = true;
2221
55.2k
    }
2222
55.9k
    // FIXME: The next token will not have LeadingSpace set.
2223
55.9k
    return true;
2224
55.9k
  }
2225
270M
2226
270M
  // If this isn't immediately after a newline, there is leading space.
2227
270M
  char PrevChar = CurPtr[-1];
2228
270M
  bool HasLeadingSpace = !isVerticalWhitespace(PrevChar);
2229
270M
2230
270M
  Result.setFlagValue(Token::LeadingSpace, HasLeadingSpace);
2231
270M
  if (SawNewline) {
2232
265M
    Result.setFlag(Token::StartOfLine);
2233
265M
    TokAtPhysicalStartOfLine = true;
2234
265M
  }
2235
270M
2236
270M
  BufferPtr = CurPtr;
2237
270M
  return false;
2238
270M
}
2239
2240
/// We have just read the // characters from input.  Skip until we find the
2241
/// newline character that terminates the comment.  Then update BufferPtr and
2242
/// return.
2243
///
2244
/// If we're in KeepCommentMode or any CommentHandler has inserted
2245
/// some tokens, this will store the first token and return true.
2246
bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
2247
19.9M
                            bool &TokAtPhysicalStartOfLine) {
2248
19.9M
  // If Line comments aren't explicitly enabled for this language, emit an
2249
19.9M
  // extension warning.
2250
19.9M
  if (!LangOpts.LineComment && 
!isLexingRawMode()5.58k
) {
2251
2.06k
    Diag(BufferPtr, diag::ext_line_comment);
2252
2.06k
2253
2.06k
    // Mark them enabled so we only emit one warning for this translation
2254
2.06k
    // unit.
2255
2.06k
    LangOpts.LineComment = true;
2256
2.06k
  }
2257
19.9M
2258
19.9M
  // Scan over the body of the comment.  The common case, when scanning, is that
2259
19.9M
  // the comment contains normal ascii characters with nothing interesting in
2260
19.9M
  // them.  As such, optimize for this case with the inner loop.
2261
19.9M
  //
2262
19.9M
  // This loop terminates with CurPtr pointing at the newline (or end of buffer)
2263
19.9M
  // character that ends the line comment.
2264
19.9M
  char C;
2265
20.0M
  while (
true20.0M
) {
2266
20.0M
    C = *CurPtr;
2267
20.0M
    // Skip over characters in the fast loop.
2268
1.02G
    while (C != 0 &&                // Potentially EOF.
2269
1.02G
           
C != '\n'1.02G
&&
C != '\r'1.00G
) // Newline or DOS-style newline.
2270
1.00G
      C = *++CurPtr;
2271
20.0M
2272
20.0M
    const char *NextLine = CurPtr;
2273
20.0M
    if (C != 0) {
2274
20.0M
      // We found a newline, see if it's escaped.
2275
20.0M
      const char *EscapePtr = CurPtr-1;
2276
20.0M
      bool HasSpace = false;
2277
20.0M
      while (isHorizontalWhitespace(*EscapePtr)) { // Skip whitespace.
2278
51.8k
        --EscapePtr;
2279
51.8k
        HasSpace = true;
2280
51.8k
      }
2281
20.0M
2282
20.0M
      if (*EscapePtr == '\\')
2283
94.7k
        // Escaped newline.
2284
94.7k
        CurPtr = EscapePtr;
2285
19.9M
      else if (EscapePtr[0] == '/' && 
EscapePtr[-1] == '?'2.47M
&&
2286
19.9M
               
EscapePtr[-2] == '?'6
&&
LangOpts.Trigraphs6
)
2287
3
        // Trigraph-escaped newline.
2288
3
        CurPtr = EscapePtr-2;
2289
19.9M
      else
2290
19.9M
        break; // This is a newline, we're done.
2291
94.7k
2292
94.7k
      // If there was space between the backslash and newline, warn about it.
2293
94.7k
      if (HasSpace && 
!isLexingRawMode()8
)
2294
6
        Diag(EscapePtr, diag::backslash_newline_space);
2295
94.7k
    }
2296
20.0M
2297
20.0M
    // Otherwise, this is a hard case.  Fall back on getAndAdvanceChar to
2298
20.0M
    // properly decode the character.  Read it in raw mode to avoid emitting
2299
20.0M
    // diagnostics about things like trigraphs.  If we see an escaped newline,
2300
20.0M
    // we'll handle it below.
2301
20.0M
    const char *OldPtr = CurPtr;
2302
98.5k
    bool OldRawMode = isLexingRawMode();
2303
98.5k
    LexingRawMode = true;
2304
98.5k
    C = getAndAdvanceChar(CurPtr, Result);
2305
98.5k
    LexingRawMode = OldRawMode;
2306
98.5k
2307
98.5k
    // If we only read only one character, then no special handling is needed.
2308
98.5k
    // We're done and can skip forward to the newline.
2309
98.5k
    if (C != 0 && 
CurPtr == OldPtr+194.7k
) {
2310
0
      CurPtr = NextLine;
2311
0
      break;
2312
0
    }
2313
98.5k
2314
98.5k
    // If we read multiple characters, and one of those characters was a \r or
2315
98.5k
    // \n, then we had an escaped newline within the comment.  Emit diagnostic
2316
98.5k
    // unless the next line is also a // comment.
2317
98.5k
    if (CurPtr != OldPtr + 1 && 
C != '/'94.7k
&&
2318
98.5k
        
(4.02k
CurPtr == BufferEnd + 14.02k
||
CurPtr[0] != '/'4.02k
)) {
2319
8.04k
      for (; OldPtr != CurPtr; 
++OldPtr4.02k
)
2320
8.04k
        if (OldPtr[0] == '\n' || 
OldPtr[0] == '\r'4.02k
) {
2321
4.01k
          // Okay, we found a // comment that ends in a newline, if the next
2322
4.01k
          // line is also a // comment, but has spaces, don't emit a diagnostic.
2323
4.01k
          if (isWhitespace(C)) {
2324
3.84k
            const char *ForwardPtr = CurPtr;
2325
48.9k
            while (isWhitespace(*ForwardPtr))  // Skip whitespace.
2326
45.1k
              ++ForwardPtr;
2327
3.84k
            if (ForwardPtr[0] == '/' && 
ForwardPtr[1] == '/'3.45k
)
2328
3.45k
              break;
2329
563
          }
2330
563
2331
563
          if (!isLexingRawMode())
2332
513
            Diag(OldPtr-1, diag::ext_multi_line_line_comment);
2333
563
          break;
2334
563
        }
2335
4.01k
    }
2336
98.5k
2337
98.5k
    if (
C == '\r'98.5k
|| C == '\n' ||
CurPtr == BufferEnd + 198.5k
) {
2338
3.76k
      --CurPtr;
2339
3.76k
      break;
2340
3.76k
    }
2341
94.7k
2342
94.7k
    if (C == '\0' && 
isCodeCompletionPoint(CurPtr-1)9
) {
2343
8
      PP->CodeCompleteNaturalLanguage();
2344
8
      cutOffLexing();
2345
8
      return false;
2346
8
    }
2347
94.7k
  }
2348
19.9M
2349
19.9M
  // Found but did not consume the newline.  Notify comment handlers about the
2350
19.9M
  // comment unless we're in a #if 0 block.
2351
19.9M
  
if (19.9M
PP19.9M
&&
!isLexingRawMode()19.7M
&&
2352
19.9M
      PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
2353
14.2M
                                            getSourceLocation(CurPtr)))) {
2354
0
    BufferPtr = CurPtr;
2355
0
    return true; // A token has to be returned.
2356
0
  }
2357
19.9M
2358
19.9M
  // If we are returning comments as tokens, return this comment as a token.
2359
19.9M
  if (inKeepCommentMode())
2360
23.3k
    return SaveLineComment(Result, CurPtr);
2361
19.8M
2362
19.8M
  // If we are inside a preprocessor directive and we see the end of line,
2363
19.8M
  // return immediately, so that the lexer can return this as an EOD token.
2364
19.8M
  if (ParsingPreprocessorDirective || 
CurPtr == BufferEnd19.2M
) {
2365
686k
    BufferPtr = CurPtr;
2366
686k
    return false;
2367
686k
  }
2368
19.2M
2369
19.2M
  // Otherwise, eat the \n character.  We don't care if this is a \n\r or
2370
19.2M
  // \r\n sequence.  This is an efficiency hack (because we know the \n can't
2371
19.2M
  // contribute to another token), it isn't needed for correctness.  Note that
2372
19.2M
  // this is ok even in KeepWhitespaceMode, because we would have returned the
2373
19.2M
  /// comment above in that mode.
2374
19.2M
  ++CurPtr;
2375
19.2M
2376
19.2M
  // The next returned token is at the start of the line.
2377
19.2M
  Result.setFlag(Token::StartOfLine);
2378
19.2M
  TokAtPhysicalStartOfLine = true;
2379
19.2M
  // No leading whitespace seen so far.
2380
19.2M
  Result.clearFlag(Token::LeadingSpace);
2381
19.2M
  BufferPtr = CurPtr;
2382
19.2M
  return false;
2383
19.2M
}
2384
2385
/// If in save-comment mode, package up this Line comment in an appropriate
2386
/// way and return it.
2387
23.3k
bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) {
2388
23.3k
  // If we're not in a preprocessor directive, just return the // comment
2389
23.3k
  // directly.
2390
23.3k
  FormTokenWithChars(Result, CurPtr, tok::comment);
2391
23.3k
2392
23.3k
  if (!ParsingPreprocessorDirective || 
LexingRawMode2
)
2393
23.3k
    return true;
2394
2
2395
2
  // If this Line-style comment is in a macro definition, transmogrify it into
2396
2
  // a C-style block comment.
2397
2
  bool Invalid = false;
2398
2
  std::string Spelling = PP->getSpelling(Result, &Invalid);
2399
2
  if (Invalid)
2400
0
    return true;
2401
2
2402
2
  assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?");
2403
2
  Spelling[1] = '*';   // Change prefix to "/*".
2404
2
  Spelling += "*/";    // add suffix.
2405
2
2406
2
  Result.setKind(tok::comment);
2407
2
  PP->CreateString(Spelling, Result,
2408
2
                   Result.getLocation(), Result.getLocation());
2409
2
  return true;
2410
2
}
2411
2412
/// isBlockCommentEndOfEscapedNewLine - Return true if the specified newline
2413
/// character (either \\n or \\r) is part of an escaped newline sequence.  Issue
2414
/// a diagnostic if so.  We know that the newline is inside of a block comment.
2415
static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr,
2416
70.8k
                                                  Lexer *L) {
2417
70.8k
  assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');
2418
70.8k
2419
70.8k
  // Back up off the newline.
2420
70.8k
  --CurPtr;
2421
70.8k
2422
70.8k
  // If this is a two-character newline sequence, skip the other character.
2423
70.8k
  if (CurPtr[0] == '\n' || 
CurPtr[0] == '\r'10.7k
) {
2424
60.1k
    // \n\n or \r\r -> not escaped newline.
2425
60.1k
    if (CurPtr[0] == CurPtr[1])
2426
60.1k
      return false;
2427
0
    // \n\r or \r\n -> skip the newline.
2428
0
    --CurPtr;
2429
0
  }
2430
70.8k
2431
70.8k
  // If we have horizontal whitespace, skip over it.  We allow whitespace
2432
70.8k
  // between the slash and newline.
2433
70.8k
  bool HasSpace = false;
2434
10.7k
  while (isHorizontalWhitespace(*CurPtr) || 
*CurPtr == 010.7k
) {
2435
33
    --CurPtr;
2436
33
    HasSpace = true;
2437
33
  }
2438
10.7k
2439
10.7k
  // If we have a slash, we know this is an escaped newline.
2440
10.7k
  if (*CurPtr == '\\') {
2441
8
    if (CurPtr[-1] != '*') 
return false0
;
2442
10.7k
  } else {
2443
10.7k
    // It isn't a slash, is it the ?? / trigraph?
2444
10.7k
    if (CurPtr[0] != '/' || 
CurPtr[-1] != '?'1.70k
||
CurPtr[-2] != '?'5
||
2445
10.7k
        
CurPtr[-3] != '*'5
)
2446
10.7k
      return false;
2447
5
2448
5
    // This is the trigraph ending the comment.  Emit a stern warning!
2449
5
    CurPtr -= 2;
2450
5
2451
5
    // If no trigraphs are enabled, warn that we ignored this trigraph and
2452
5
    // ignore this * character.
2453
5
    if (!L->getLangOpts().Trigraphs) {
2454
0
      if (!L->isLexingRawMode())
2455
0
        L->Diag(CurPtr, diag::trigraph_ignored_block_comment);
2456
0
      return false;
2457
0
    }
2458
5
    if (!L->isLexingRawMode())
2459
5
      L->Diag(CurPtr, diag::trigraph_ends_block_comment);
2460
5
  }
2461
10.7k
2462
10.7k
  // Warn about having an escaped newline between the */ characters.
2463
10.7k
  
if (13
!L->isLexingRawMode()13
)
2464
10
    L->Diag(CurPtr, diag::escaped_newline_block_comment_end);
2465
13
2466
13
  // If there was space between the backslash and newline, warn about it.
2467
13
  if (HasSpace && !L->isLexingRawMode())
2468
10
    L->Diag(CurPtr, diag::backslash_newline_space);
2469
13
2470
13
  return true;
2471
10.7k
}
2472
2473
#ifdef __SSE2__
2474
#include <emmintrin.h>
2475
#elif __ALTIVEC__
2476
#include <altivec.h>
2477
#undef bool
2478
#endif
2479
2480
/// We have just read from input the / and * characters that started a comment.
2481
/// Read until we find the * and / characters that terminate the comment.
2482
/// Note that we don't bother decoding trigraphs or escaped newlines in block
2483
/// comments, because they cannot cause the comment to end.  The only thing
2484
/// that can happen is the comment could end with an escaped newline between
2485
/// the terminating * and /.
2486
///
2487
/// If we're in KeepCommentMode or any CommentHandler has inserted
2488
/// some tokens, this will store the first token and return true.
2489
bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
2490
44.3M
                             bool &TokAtPhysicalStartOfLine) {
2491
44.3M
  // Scan one character past where we should, looking for a '/' character.  Once
2492
44.3M
  // we find it, check to see if it was preceded by a *.  This common
2493
44.3M
  // optimization helps people who like to put a lot of * characters in their
2494
44.3M
  // comments.
2495
44.3M
2496
44.3M
  // The first character we get with newlines and trigraphs skipped to handle
2497
44.3M
  // the degenerate /*/ case below correctly if the * has an escaped newline
2498
44.3M
  // after it.
2499
44.3M
  unsigned CharSize;
2500
44.3M
  unsigned char C = getCharAndSize(CurPtr, CharSize);
2501
44.3M
  CurPtr += CharSize;
2502
44.3M
  if (C == 0 && 
CurPtr == BufferEnd+12
) {
2503
1
    if (!isLexingRawMode())
2504
0
      Diag(BufferPtr, diag::err_unterminated_block_comment);
2505
1
    --CurPtr;
2506
1
2507
1
    // KeepWhitespaceMode should return this broken comment as a token.  Since
2508
1
    // it isn't a well formed comment, just return it as an 'unknown' token.
2509
1
    if (isKeepWhitespaceMode()) {
2510
0
      FormTokenWithChars(Result, CurPtr, tok::unknown);
2511
0
      return true;
2512
0
    }
2513
1
2514
1
    BufferPtr = CurPtr;
2515
1
    return false;
2516
1
  }
2517
44.3M
2518
44.3M
  // Check to see if the first character after the '/*' is another /.  If so,
2519
44.3M
  // then this slash does not end the block comment, it is part of it.
2520
44.3M
  if (C == '/')
2521
577
    C = *CurPtr++;
2522
44.3M
2523
52.1M
  while (true) {
2524
52.1M
    // Skip over all non-interesting characters until we find end of buffer or a
2525
52.1M
    // (probably ending) '/' character.
2526
52.1M
    if (CurPtr + 24 < BufferEnd &&
2527
52.1M
        // If there is a code-completion point avoid the fast scan because it
2528
52.1M
        // doesn't check for '\0'.
2529
52.1M
        
!(51.6M
PP51.6M
&&
PP->getCodeCompletionFileLoc() == FileLoc51.6M
)) {
2530
51.6M
      // While not aligned to a 16-byte boundary.
2531
411M
      while (C != '/' && 
((intptr_t)CurPtr & 0x0F) != 0407M
)
2532
359M
        C = *CurPtr++;
2533
51.6M
2534
51.6M
      if (C == '/') 
goto FoundSlash4.04M
;
2535
47.6M
2536
47.6M
#ifdef __SSE2__
2537
47.6M
      __m128i Slashes = _mm_set1_epi8('/');
2538
407M
      while (CurPtr+16 <= BufferEnd) {
2539
407M
        int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr,
2540
407M
                                    Slashes));
2541
407M
        if (cmp != 0) {
2542
47.5M
          // Adjust the pointer to point directly after the first slash. It's
2543
47.5M
          // not necessary to set C here, it will be overwritten at the end of
2544
47.5M
          // the outer loop.
2545
47.5M
          CurPtr += llvm::countTrailingZeros<unsigned>(cmp) + 1;
2546
47.5M
          goto FoundSlash;
2547
47.5M
        }
2548
359M
        CurPtr += 16;
2549
359M
      }
2550
#elif __ALTIVEC__
2551
      __vector unsigned char Slashes = {
2552
        '/', '/', '/', '/',  '/', '/', '/', '/',
2553
        '/', '/', '/', '/',  '/', '/', '/', '/'
2554
      };
2555
      while (CurPtr + 16 <= BufferEnd &&
2556
             !vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes))
2557
        CurPtr += 16;
2558
#else
2559
      // Scan for '/' quickly.  Many block comments are very large.
2560
      while (CurPtr[0] != '/' &&
2561
             CurPtr[1] != '/' &&
2562
             CurPtr[2] != '/' &&
2563
             CurPtr[3] != '/' &&
2564
             CurPtr+4 < BufferEnd) {
2565
        CurPtr += 4;
2566
      }
2567
#endif
2568
2569
47.6M
      // It has to be one of the bytes scanned, increment to it and read one.
2570
47.6M
      C = *CurPtr++;
2571
101k
    }
2572
52.1M
2573
52.1M
    // Loop to scan the remainder.
2574
52.1M
    
while (540k
C != '/'7.93M
&&
C != '\0'7.38M
)
2575
7.38M
      C = *CurPtr++;
2576
540k
2577
540k
    if (C == '/') {
2578
52.1M
  FoundSlash:
2579
52.1M
      if (CurPtr[-2] == '*')  // We found the final */.  We're done!
2580
44.3M
        break;
2581
7.76M
2582
7.76M
      if ((CurPtr[-2] == '\n' || 
CurPtr[-2] == '\r'7.69M
)) {
2583
70.8k
        if (isEndOfBlockCommentWithEscapedNewLine(CurPtr-2, this)) {
2584
13
          // We found the final */, though it had an escaped newline between the
2585
13
          // * and /.  We're done!
2586
13
          break;
2587
13
        }
2588
7.76M
      }
2589
7.76M
      if (CurPtr[0] == '*' && 
CurPtr[1] != '/'576
) {
2590
10
        // If this is a /* inside of the comment, emit a warning.  Don't do this
2591
10
        // if this is a /*/, which will end the comment.  This misses cases with
2592
10
        // embedded escaped newlines, but oh well.
2593
10
        if (!isLexingRawMode())
2594
9
          Diag(CurPtr-1, diag::warn_nested_block_comment);
2595
10
      }
2596
7.76M
    } else 
if (14
C == 014
&&
CurPtr == BufferEnd+114
) {
2597
2
      if (!isLexingRawMode())
2598
2
        Diag(BufferPtr, diag::err_unterminated_block_comment);
2599
2
      // Note: the user probably forgot a */.  We could continue immediately
2600
2
      // after the /*, but this would involve lexing a lot of what really is the
2601
2
      // comment, which surely would confuse the parser.
2602
2
      --CurPtr;
2603
2
2604
2
      // KeepWhitespaceMode should return this broken comment as a token.  Since
2605
2
      // it isn't a well formed comment, just return it as an 'unknown' token.
2606
2
      if (isKeepWhitespaceMode()) {
2607
0
        FormTokenWithChars(Result, CurPtr, tok::unknown);
2608
0
        return true;
2609
0
      }
2610
2
2611
2
      BufferPtr = CurPtr;
2612
2
      return false;
2613
12
    } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
2614
9
      PP->CodeCompleteNaturalLanguage();
2615
9
      cutOffLexing();
2616
9
      return false;
2617
9
    }
2618
7.76M
2619
7.76M
    C = *CurPtr++;
2620
7.76M
  }
2621
44.3M
2622
44.3M
  // Notify comment handlers about the comment unless we're in a #if 0 block.
2623
44.3M
  
if (44.3M
PP44.3M
&&
!isLexingRawMode()44.3M
&&
2624
44.3M
      PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
2625
39.9M
                                            getSourceLocation(CurPtr)))) {
2626
0
    BufferPtr = CurPtr;
2627
0
    return true; // A token has to be returned.
2628
0
  }
2629
44.3M
2630
44.3M
  // If we are returning comments as tokens, return this comment as a token.
2631
44.3M
  if (inKeepCommentMode()) {
2632
1.81k
    FormTokenWithChars(Result, CurPtr, tok::comment);
2633
1.81k
    return true;
2634
1.81k
  }
2635
44.3M
2636
44.3M
  // It is common for the tokens immediately after a /**/ comment to be
2637
44.3M
  // whitespace.  Instead of going through the big switch, handle it
2638
44.3M
  // efficiently now.  This is safe even in KeepWhitespaceMode because we would
2639
44.3M
  // have already returned above with the comment as a token.
2640
44.3M
  if (isHorizontalWhitespace(*CurPtr)) {
2641
165k
    SkipWhitespace(Result, CurPtr+1, TokAtPhysicalStartOfLine);
2642
165k
    return false;
2643
165k
  }
2644
44.1M
2645
44.1M
  // Otherwise, just return so that the next character will be lexed as a token.
2646
44.1M
  BufferPtr = CurPtr;
2647
44.1M
  Result.setFlag(Token::LeadingSpace);
2648
44.1M
  return false;
2649
44.1M
}
2650
2651
//===----------------------------------------------------------------------===//
2652
// Primary Lexing Entry Points
2653
//===----------------------------------------------------------------------===//
2654
2655
/// ReadToEndOfLine - Read the rest of the current preprocessor line as an
2656
/// uninterpreted string.  This switches the lexer out of directive mode.
2657
82.9k
void Lexer::ReadToEndOfLine(SmallVectorImpl<char> *Result) {
2658
82.9k
  assert(ParsingPreprocessorDirective && ParsingFilename == false &&
2659
82.9k
         "Must be in a preprocessing directive!");
2660
82.9k
  Token Tmp;
2661
82.9k
  Tmp.startToken();
2662
82.9k
2663
82.9k
  // CurPtr - Cache BufferPtr in an automatic variable.
2664
82.9k
  const char *CurPtr = BufferPtr;
2665
1.66M
  while (true) {
2666
1.66M
    char Char = getAndAdvanceChar(CurPtr, Tmp);
2667
1.66M
    switch (Char) {
2668
1.58M
    default:
2669
1.58M
      if (Result)
2670
3.51k
        Result->push_back(Char);
2671
1.58M
      break;
2672
18
    case 0:  // Null.
2673
18
      // Found end of file?
2674
18
      if (CurPtr-1 != BufferEnd) {
2675
18
        if (isCodeCompletionPoint(CurPtr-1)) {
2676
18
          PP->CodeCompleteNaturalLanguage();
2677
18
          cutOffLexing();
2678
18
          return;
2679
18
        }
2680
0
2681
0
        // Nope, normal character, continue.
2682
0
        if (Result)
2683
0
          Result->push_back(Char);
2684
0
        break;
2685
0
      }
2686
0
      // FALL THROUGH.
2687
0
      LLVM_FALLTHROUGH;
2688
82.9k
    case '\r':
2689
82.9k
    case '\n':
2690
82.9k
      // Okay, we found the end of the line. First, back up past the \0, \r, \n.
2691
82.9k
      assert(CurPtr[-1] == Char && "Trigraphs for newline?");
2692
82.9k
      BufferPtr = CurPtr-1;
2693
82.9k
2694
82.9k
      // Next, lex the character, which should handle the EOD transition.
2695
82.9k
      Lex(Tmp);
2696
82.9k
      if (Tmp.is(tok::code_completion)) {
2697
0
        if (PP)
2698
0
          PP->CodeCompleteNaturalLanguage();
2699
0
        Lex(Tmp);
2700
0
      }
2701
82.9k
      assert(Tmp.is(tok::eod) && "Unexpected token!");
2702
82.9k
2703
82.9k
      // Finally, we're done;
2704
82.9k
      return;
2705
1.66M
    }
2706
1.66M
  }
2707
82.9k
}
2708
2709
/// LexEndOfFile - CurPtr points to the end of this file.  Handle this
2710
/// condition, reporting diagnostics and handling other edge cases as required.
2711
/// This returns true if Result contains a token, false if PP.Lex should be
2712
/// called again.
2713
1.62M
bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
2714
1.62M
  // If we hit the end of the file while parsing a preprocessor directive,
2715
1.62M
  // end the preprocessor directive first.  The next token returned will
2716
1.62M
  // then be the end of file.
2717
1.62M
  if (ParsingPreprocessorDirective) {
2718
1.75k
    // Done parsing the "line".
2719
1.75k
    ParsingPreprocessorDirective = false;
2720
1.75k
    // Update the location of token as well as BufferPtr.
2721
1.75k
    FormTokenWithChars(Result, CurPtr, tok::eod);
2722
1.75k
2723
1.75k
    // Restore comment saving mode, in case it was disabled for directive.
2724
1.75k
    if (PP)
2725
1.75k
      resetExtendedTokenMode();
2726
1.75k
    return true;  // Have a token.
2727
1.75k
  }
2728
1.61M
2729
1.61M
  // If we are in raw mode, return this event as an EOF token.  Let the caller
2730
1.61M
  // that put us in raw mode handle the event.
2731
1.61M
  if (isLexingRawMode()) {
2732
95.3k
    Result.startToken();
2733
95.3k
    BufferPtr = BufferEnd;
2734
95.3k
    FormTokenWithChars(Result, BufferEnd, tok::eof);
2735
95.3k
    return true;
2736
95.3k
  }
2737
1.52M
2738
1.52M
  if (PP->isRecordingPreamble() && 
PP->isInPrimaryFile()251
) {
2739
89
    PP->setRecordedPreambleConditionalStack(ConditionalStack);
2740
89
    ConditionalStack.clear();
2741
89
  }
2742
1.52M
2743
1.52M
  // Issue diagnostics for unterminated #if and missing newline.
2744
1.52M
2745
1.52M
  // If we are in a #if directive, emit an error.
2746
1.52M
  while (!ConditionalStack.empty()) {
2747
8
    if (PP->getCodeCompletionFileLoc() != FileLoc)
2748
8
      PP->Diag(ConditionalStack.back().IfLoc,
2749
8
               diag::err_pp_unterminated_conditional);
2750
8
    ConditionalStack.pop_back();
2751
8
  }
2752
1.52M
2753
1.52M
  // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue
2754
1.52M
  // a pedwarn.
2755
1.52M
  if (CurPtr != BufferStart && 
(1.51M
CurPtr[-1] != '\n'1.51M
&&
CurPtr[-1] != '\r'12.6k
)) {
2756
12.6k
    DiagnosticsEngine &Diags = PP->getDiagnostics();
2757
12.6k
    SourceLocation EndLoc = getSourceLocation(BufferEnd);
2758
12.6k
    unsigned DiagID;
2759
12.6k
2760
12.6k
    if (LangOpts.CPlusPlus11) {
2761
9.90k
      // C++11 [lex.phases] 2.2 p2
2762
9.90k
      // Prefer the C++98 pedantic compatibility warning over the generic,
2763
9.90k
      // non-extension, user-requested "missing newline at EOF" warning.
2764
9.90k
      if (!Diags.isIgnored(diag::warn_cxx98_compat_no_newline_eof, EndLoc)) {
2765
2
        DiagID = diag::warn_cxx98_compat_no_newline_eof;
2766
9.90k
      } else {
2767
9.90k
        DiagID = diag::warn_no_newline_eof;
2768
9.90k
      }
2769
9.90k
    } else {
2770
2.70k
      DiagID = diag::ext_no_newline_eof;
2771
2.70k
    }
2772
12.6k
2773
12.6k
    Diag(BufferEnd, DiagID)
2774
12.6k
      << FixItHint::CreateInsertion(EndLoc, "\n");
2775
12.6k
  }
2776
1.52M
2777
1.52M
  BufferPtr = CurPtr;
2778
1.52M
2779
1.52M
  // Finally, let the preprocessor handle this.
2780
1.52M
  return PP->HandleEndOfFile(Result, isPragmaLexer());
2781
1.52M
}
2782
2783
/// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from
2784
/// the specified lexer will return a tok::l_paren token, 0 if it is something
2785
/// else and 2 if there are no more tokens in the buffer controlled by the
2786
/// lexer.
2787
6.04M
unsigned Lexer::isNextPPTokenLParen() {
2788
6.04M
  assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?");
2789
6.04M
2790
6.04M
  // Switch to 'skipping' mode.  This will ensure that we can lex a token
2791
6.04M
  // without emitting diagnostics, disables macro expansion, and will cause EOF
2792
6.04M
  // to return an EOF token instead of popping the include stack.
2793
6.04M
  LexingRawMode = true;
2794
6.04M
2795
6.04M
  // Save state that can be changed while lexing so that we can restore it.
2796
6.04M
  const char *TmpBufferPtr = BufferPtr;
2797
6.04M
  bool inPPDirectiveMode = ParsingPreprocessorDirective;
2798
6.04M
  bool atStartOfLine = IsAtStartOfLine;
2799
6.04M
  bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
2800
6.04M
  bool leadingSpace = HasLeadingSpace;
2801
6.04M
2802
6.04M
  Token Tok;
2803
6.04M
  Lex(Tok);
2804
6.04M
2805
6.04M
  // Restore state that may have changed.
2806
6.04M
  BufferPtr = TmpBufferPtr;
2807
6.04M
  ParsingPreprocessorDirective = inPPDirectiveMode;
2808
6.04M
  HasLeadingSpace = leadingSpace;
2809
6.04M
  IsAtStartOfLine = atStartOfLine;
2810
6.04M
  IsAtPhysicalStartOfLine = atPhysicalStartOfLine;
2811
6.04M
2812
6.04M
  // Restore the lexer back to non-skipping mode.
2813
6.04M
  LexingRawMode = false;
2814
6.04M
2815
6.04M
  if (Tok.is(tok::eof))
2816
3
    return 2;
2817
6.04M
  return Tok.is(tok::l_paren);
2818
6.04M
}
2819
2820
/// Find the end of a version control conflict marker.
2821
static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd,
2822
10
                                   ConflictMarkerKind CMK) {
2823
10
  const char *Terminator = CMK == CMK_Perforce ? 
"<<<<\n"5
:
">>>>>>>"5
;
2824
10
  size_t TermLen = CMK == CMK_Perforce ? 
55
:
75
;
2825
10
  auto RestOfBuffer = StringRef(CurPtr, BufferEnd - CurPtr).substr(TermLen);
2826
10
  size_t Pos = RestOfBuffer.find(Terminator);
2827
11
  while (Pos != StringRef::npos) {
2828
8
    // Must occur at start of line.
2829
8
    if (Pos == 0 ||
2830
8
        
(7
RestOfBuffer[Pos - 1] != '\r'7
&&
RestOfBuffer[Pos - 1] != '\n'7
)) {
2831
1
      RestOfBuffer = RestOfBuffer.substr(Pos+TermLen);
2832
1
      Pos = RestOfBuffer.find(Terminator);
2833
1
      continue;
2834
1
    }
2835
7
    return RestOfBuffer.data()+Pos;
2836
7
  }
2837
10
  
return nullptr3
;
2838
10
}
2839
2840
/// IsStartOfConflictMarker - If the specified pointer is the start of a version
2841
/// control conflict marker like '<<<<<<<', recognize it as such, emit an error
2842
/// and recover nicely.  This returns true if it is a conflict marker and false
2843
/// if not.
2844
7.34k
bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
2845
7.34k
  // Only a conflict marker if it starts at the beginning of a line.
2846
7.34k
  if (CurPtr != BufferStart &&
2847
7.34k
      CurPtr[-1] != '\n' && 
CurPtr[-1] != '\r'7.32k
)
2848
7.32k
    return false;
2849
28
2850
28
  // Check to see if we have <<<<<<< or >>>>.
2851
28
  if (!StringRef(CurPtr, BufferEnd - CurPtr).startswith("<<<<<<<") &&
2852
28
      
!StringRef(CurPtr, BufferEnd - CurPtr).startswith(">>>> ")19
)
2853
14
    return false;
2854
14
2855
14
  // If we have a situation where we don't care about conflict markers, ignore
2856
14
  // it.
2857
14
  if (CurrentConflictMarkerState || isLexingRawMode())
2858
9
    return false;
2859
5
2860
5
  ConflictMarkerKind Kind = *CurPtr == '<' ? 
CMK_Normal3
:
CMK_Perforce2
;
2861
5
2862
5
  // Check to see if there is an ending marker somewhere in the buffer at the
2863
5
  // start of a line to terminate this conflict marker.
2864
5
  if (FindConflictEnd(CurPtr, BufferEnd, Kind)) {
2865
4
    // We found a match.  We are really in a conflict marker.
2866
4
    // Diagnose this, and ignore to the end of line.
2867
4
    Diag(CurPtr, diag::err_conflict_marker);
2868
4
    CurrentConflictMarkerState = Kind;
2869
4
2870
4
    // Skip ahead to the end of line.  We know this exists because the
2871
4
    // end-of-conflict marker starts with \r or \n.
2872
76
    while (*CurPtr != '\r' && *CurPtr != '\n') {
2873
72
      assert(CurPtr != BufferEnd && "Didn't find end of line");
2874
72
      ++CurPtr;
2875
72
    }
2876
4
    BufferPtr = CurPtr;
2877
4
    return true;
2878
4
  }
2879
1
2880
1
  // No end of conflict marker found.
2881
1
  return false;
2882
1
}
2883
2884
/// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if
2885
/// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it
2886
/// is the end of a conflict marker.  Handle it by ignoring up until the end of
2887
/// the line.  This returns true if it is a conflict marker and false if not.
2888
7.40k
bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) {
2889
7.40k
  // Only a conflict marker if it starts at the beginning of a line.
2890
7.40k
  if (CurPtr != BufferStart &&
2891
7.40k
      CurPtr[-1] != '\n' && 
CurPtr[-1] != '\r'7.35k
)
2892
7.35k
    return false;
2893
45
2894
45
  // If we have a situation where we don't care about conflict markers, ignore
2895
45
  // it.
2896
45
  if (!CurrentConflictMarkerState || 
isLexingRawMode()5
)
2897
40
    return false;
2898
5
2899
5
  // Check to see if we have the marker (4 characters in a row).
2900
20
  
for (unsigned i = 1; 5
i != 4;
++i15
)
2901
15
    if (CurPtr[i] != CurPtr[0])
2902
0
      return false;
2903
5
2904
5
  // If we do have it, search for the end of the conflict marker.  This could
2905
5
  // fail if it got skipped with a '#if 0' or something.  Note that CurPtr might
2906
5
  // be the end of conflict marker.
2907
5
  if (const char *End = FindConflictEnd(CurPtr, BufferEnd,
2908
3
                                        CurrentConflictMarkerState)) {
2909
3
    CurPtr = End;
2910
3
2911
3
    // Skip ahead to the end of line.
2912
37
    while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n')
2913
34
      ++CurPtr;
2914
3
2915
3
    BufferPtr = CurPtr;
2916
3
2917
3
    // No longer in the conflict marker.
2918
3
    CurrentConflictMarkerState = CMK_None;
2919
3
    return true;
2920
3
  }
2921
2
2922
2
  return false;
2923
2
}
2924
2925
static const char *findPlaceholderEnd(const char *CurPtr,
2926
43
                                      const char *BufferEnd) {
2927
43
  if (CurPtr == BufferEnd)
2928
0
    return nullptr;
2929
43
  BufferEnd -= 1; // Scan until the second last character.
2930
404
  for (; CurPtr != BufferEnd; 
++CurPtr361
) {
2931
404
    if (CurPtr[0] == '#' && 
CurPtr[1] == '>'43
)
2932
43
      return CurPtr + 2;
2933
404
  }
2934
43
  
return nullptr0
;
2935
43
}
2936
2937
45
bool Lexer::lexEditorPlaceholder(Token &Result, const char *CurPtr) {
2938
45
  assert(CurPtr[-1] == '<' && CurPtr[0] == '#' && "Not a placeholder!");
2939
45
  if (!PP || !PP->getPreprocessorOpts().LexEditorPlaceholders || 
LexingRawMode44
)
2940
2
    return false;
2941
43
  const char *End = findPlaceholderEnd(CurPtr + 1, BufferEnd);
2942
43
  if (!End)
2943
0
    return false;
2944
43
  const char *Start = CurPtr - 1;
2945
43
  if (!LangOpts.AllowEditorPlaceholders)
2946
22
    Diag(Start, diag::err_placeholder_in_source);
2947
43
  Result.startToken();
2948
43
  FormTokenWithChars(Result, End, tok::raw_identifier);
2949
43
  Result.setRawIdentifierData(Start);
2950
43
  PP->LookUpIdentifierInfo(Result);
2951
43
  Result.setFlag(Token::IsEditorPlaceholder);
2952
43
  BufferPtr = End;
2953
43
  return true;
2954
43
}
2955
2956
371M
bool Lexer::isCodeCompletionPoint(const char *CurPtr) const {
2957
371M
  if (
PP371M
&& PP->isCodeCompletionEnabled()) {
2958
998k
    SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart);
2959
998k
    return Loc == PP->getCodeCompletionLoc();
2960
998k
  }
2961
370M
2962
370M
  return false;
2963
370M
}
2964
2965
uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
2966
941
                           Token *Result) {
2967
941
  unsigned CharSize;
2968
941
  char Kind = getCharAndSize(StartPtr, CharSize);
2969
941
2970
941
  unsigned NumHexDigits;
2971
941
  if (Kind == 'u')
2972
259
    NumHexDigits = 4;
2973
682
  else if (Kind == 'U')
2974
36
    NumHexDigits = 8;
2975
646
  else
2976
646
    return 0;
2977
295
2978
295
  if (!LangOpts.CPlusPlus && 
!LangOpts.C99142
) {
2979
5
    if (Result && 
!isLexingRawMode()3
)
2980
3
      Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89);
2981
5
    return 0;
2982
5
  }
2983
290
2984
290
  const char *CurPtr = StartPtr + CharSize;
2985
290
  const char *KindLoc = &CurPtr[-1];
2986
290
2987
290
  uint32_t CodePoint = 0;
2988
1.52k
  for (unsigned i = 0; i < NumHexDigits; 
++i1.23k
) {
2989
1.25k
    char C = getCharAndSize(CurPtr, CharSize);
2990
1.25k
2991
1.25k
    unsigned Value = llvm::hexDigitValue(C);
2992
1.25k
    if (Value == -1U) {
2993
21
      if (Result && 
!isLexingRawMode()18
) {
2994
18
        if (i == 0) {
2995
6
          Diag(BufferPtr, diag::warn_ucn_escape_no_digits)
2996
6
            << StringRef(KindLoc, 1);
2997
12
        } else {
2998
12
          Diag(BufferPtr, diag::warn_ucn_escape_incomplete);
2999
12
3000
12
          // If the user wrote \U1234, suggest a fixit to \u.
3001
12
          if (i == 4 && 
NumHexDigits == 83
) {
3002
3
            CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1);
3003
3
            Diag(KindLoc, diag::note_ucn_four_not_eight)
3004
3
              << FixItHint::CreateReplacement(URange, "u");
3005
3
          }
3006
12
        }
3007
18
      }
3008
21
3009
21
      return 0;
3010
21
    }
3011
1.23k
3012
1.23k
    CodePoint <<= 4;
3013
1.23k
    CodePoint += Value;
3014
1.23k
3015
1.23k
    CurPtr += CharSize;
3016
1.23k
  }
3017
290
3018
290
  
if (269
Result269
) {
3019
120
    Result->setFlag(Token::HasUCN);
3020
120
    if (CurPtr - StartPtr == (ptrdiff_t)NumHexDigits + 2)
3021
0
      StartPtr = CurPtr;
3022
120
    else
3023
784
      
while (120
StartPtr != CurPtr)
3024
664
        (void)getAndAdvanceChar(StartPtr, *Result);
3025
149
  } else {
3026
149
    StartPtr = CurPtr;
3027
149
  }
3028
269
3029
269
  // Don't apply C family restrictions to UCNs in assembly mode
3030
269
  if (LangOpts.AsmPreprocessor)
3031
6
    return CodePoint;
3032
263
3033
263
  // C99 6.4.3p2: A universal character name shall not specify a character whose
3034
263
  //   short identifier is less than 00A0 other than 0024 ($), 0040 (@), or
3035
263
  //   0060 (`), nor one in the range D800 through DFFF inclusive.)
3036
263
  // C++11 [lex.charset]p2: If the hexadecimal value for a
3037
263
  //   universal-character-name corresponds to a surrogate code point (in the
3038
263
  //   range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally,
3039
263
  //   if the hexadecimal value for a universal-character-name outside the
3040
263
  //   c-char-sequence, s-char-sequence, or r-char-sequence of a character or
3041
263
  //   string literal corresponds to a control character (in either of the
3042
263
  //   ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the
3043
263
  //   basic source character set, the program is ill-formed.
3044
263
  if (CodePoint < 0xA0) {
3045
27
    if (CodePoint == 0x24 || 
CodePoint == 0x4024
||
CodePoint == 0x6024
)
3046
3
      return CodePoint;
3047
24
3048
24
    // We don't use isLexingRawMode() here because we need to warn about bad
3049
24
    // UCNs even when skipping preprocessing tokens in a #if block.
3050
24
    if (Result && 
PP23
) {
3051
23
      if (CodePoint < 0x20 || 
CodePoint >= 0x7F15
)
3052
16
        Diag(BufferPtr, diag::err_ucn_control_character);
3053
7
      else {
3054
7
        char C = static_cast<char>(CodePoint);
3055
7
        Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1);
3056
7
      }
3057
23
    }
3058
24
3059
24
    return 0;
3060
236
  } else if (CodePoint >= 0xD800 && 
CodePoint <= 0xDFFF30
) {
3061
4
    // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't.
3062
4
    // We don't use isLexingRawMode() here because we need to diagnose bad
3063
4
    // UCNs even when skipping preprocessing tokens in a #if block.
3064
4
    if (Result && PP) {
3065
4
      if (LangOpts.CPlusPlus && 
!LangOpts.CPlusPlus112
)
3066
1
        Diag(BufferPtr, diag::warn_ucn_escape_surrogate);
3067
3
      else
3068
3
        Diag(BufferPtr, diag::err_ucn_escape_invalid);
3069
4
    }
3070
4
    return 0;
3071
4
  }
3072
232
3073
232
  return CodePoint;
3074
232
}
3075
3076
bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C,
3077
194
                                   const char *CurPtr) {
3078
194
  static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
3079
194
      UnicodeWhitespaceCharRanges);
3080
194
  if (!isLexingRawMode() && 
!PP->isPreprocessedOutput()169
&&
3081
194
      
UnicodeWhitespaceChars.contains(C)155
) {
3082
6
    Diag(BufferPtr, diag::ext_unicode_whitespace)
3083
6
      << makeCharRange(*this, BufferPtr, CurPtr);
3084
6
3085
6
    Result.setFlag(Token::LeadingSpace);
3086
6
    return true;
3087
6
  }
3088
188
  return false;
3089
188
}
3090
3091
188
bool Lexer::LexUnicode(Token &Result, uint32_t C, const char *CurPtr) {
3092
188
  if (isAllowedIDChar(C, LangOpts) && 
isAllowedInitiallyIDChar(C, LangOpts)125
) {
3093
119
    if (!isLexingRawMode() && 
!ParsingPreprocessorDirective106
&&
3094
119
        
!PP->isPreprocessedOutput()91
) {
3095
82
      maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C,
3096
82
                                makeCharRange(*this, BufferPtr, CurPtr),
3097
82
                                /*IsFirst=*/true);
3098
82
      maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), C,
3099
82
                                 makeCharRange(*this, BufferPtr, CurPtr));
3100
82
    }
3101
119
3102
119
    MIOpt.ReadToken();
3103
119
    return LexIdentifier(Result, CurPtr);
3104
119
  }
3105
69
3106
69
  if (!isLexingRawMode() && 
!ParsingPreprocessorDirective57
&&
3107
69
      
!PP->isPreprocessedOutput()45
&&
3108
69
      
!isASCII(*BufferPtr)41
&&
!isAllowedIDChar(C, LangOpts)25
) {
3109
22
    // Non-ASCII characters tend to creep into source code unintentionally.
3110
22
    // Instead of letting the parser complain about the unknown token,
3111
22
    // just drop the character.
3112
22
    // Note that we can /only/ do this when the non-ASCII character is actually
3113
22
    // spelled as Unicode, not written as a UCN. The standard requires that
3114
22
    // we not throw away any possible preprocessor tokens, but there's a
3115
22
    // loophole in the mapping of Unicode characters to basic character set
3116
22
    // characters that allows us to map these particular characters to, say,
3117
22
    // whitespace.
3118
22
    Diag(BufferPtr, diag::err_non_ascii)
3119
22
      << FixItHint::CreateRemoval(makeCharRange(*this, BufferPtr, CurPtr));
3120
22
3121
22
    BufferPtr = CurPtr;
3122
22
    return false;
3123
22
  }
3124
47
3125
47
  // Otherwise, we have an explicit UCN or a character that's unlikely to show
3126
47
  // up by accident.
3127
47
  MIOpt.ReadToken();
3128
47
  FormTokenWithChars(Result, CurPtr, tok::unknown);
3129
47
  return true;
3130
47
}
3131
3132
18.1M
void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) {
3133
18.1M
  IsAtStartOfLine = Result.isAtStartOfLine();
3134
18.1M
  HasLeadingSpace = Result.hasLeadingSpace();
3135
18.1M
  HasLeadingEmptyMacro = Result.hasLeadingEmptyMacro();
3136
18.1M
  // Note that this doesn't affect IsAtPhysicalStartOfLine.
3137
18.1M
}
3138
3139
1.98G
bool Lexer::Lex(Token &Result) {
3140
1.98G
  // Start a new token.
3141
1.98G
  Result.startToken();
3142
1.98G
3143
1.98G
  // Set up misc whitespace flags for LexTokenInternal.
3144
1.98G
  if (IsAtStartOfLine) {
3145
99.8M
    Result.setFlag(Token::StartOfLine);
3146
99.8M
    IsAtStartOfLine = false;
3147
99.8M
  }
3148
1.98G
3149
1.98G
  if (HasLeadingSpace) {
3150
812k
    Result.setFlag(Token::LeadingSpace);
3151
812k
    HasLeadingSpace = false;
3152
812k
  }
3153
1.98G
3154
1.98G
  if (HasLeadingEmptyMacro) {
3155
1.15M
    Result.setFlag(Token::LeadingEmptyMacro);
3156
1.15M
    HasLeadingEmptyMacro = false;
3157
1.15M
  }
3158
1.98G
3159
1.98G
  bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
3160
1.98G
  IsAtPhysicalStartOfLine = false;
3161
1.98G
  bool isRawLex = isLexingRawMode();
3162
1.98G
  (void) isRawLex;
3163
1.98G
  bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine);
3164
1.98G
  // (After the LexTokenInternal call, the lexer might be destroyed.)
3165
1.98G
  assert((returnedToken || !isRawLex) && "Raw lex must succeed");
3166
1.98G
  return returnedToken;
3167
1.98G
}
3168
3169
/// LexTokenInternal - This implements a simple C family lexer.  It is an
3170
/// extremely performance critical piece of code.  This assumes that the buffer
3171
/// has a null character at the end of the file.  This returns a preprocessing
3172
/// token, not a normal token, as such, it is an internal interface.  It assumes
3173
/// that the Flags of result have been cleared before calling this.
3174
1.98G
bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) {
3175
2.30G
LexNextToken:
3176
2.30G
  // New token, can't need cleaning yet.
3177
2.30G
  Result.clearFlag(Token::NeedsCleaning);
3178
2.30G
  Result.setIdentifierInfo(nullptr);
3179
2.30G
3180
2.30G
  // CurPtr - Cache BufferPtr in an automatic variable.
3181
2.30G
  const char *CurPtr = BufferPtr;
3182
2.30G
3183
2.30G
  // Small amounts of horizontal whitespace is very common between tokens.
3184
2.30G
  if ((*CurPtr == ' ') || 
(*CurPtr == '\t')1.67G
) {
3185
644M
    ++CurPtr;
3186
1.99G
    while ((*CurPtr == ' ') || 
(*CurPtr == '\t')657M
)
3187
1.35G
      ++CurPtr;
3188
644M
3189
644M
    // If we are keeping whitespace and other tokens, just return what we just
3190
644M
    // skipped.  The next lexer invocation will return the token after the
3191
644M
    // whitespace.
3192
644M
    if (isKeepWhitespaceMode()) {
3193
225k
      FormTokenWithChars(Result, CurPtr, tok::unknown);
3194
225k
      // FIXME: The next token will not have LeadingSpace set.
3195
225k
      return true;
3196
225k
    }
3197
643M
3198
643M
    BufferPtr = CurPtr;
3199
643M
    Result.setFlag(Token::LeadingSpace);
3200
643M
  }
3201
2.30G
3202
2.30G
  unsigned SizeTmp, SizeTmp2;   // Temporaries for use in cases below.
3203
2.30G
3204
2.30G
  // Read a character, advancing over it.
3205
2.30G
  char Char = getAndAdvanceChar(CurPtr, Result);
3206
2.30G
  tok::TokenKind Kind;
3207
2.30G
3208
2.30G
  switch (Char) {
3209
1.62M
  case 0:  // Null.
3210
1.62M
    // Found end of file?
3211
1.62M
    if (CurPtr-1 == BufferEnd)
3212
1.62M
      return LexEndOfFile(Result, CurPtr-1);
3213
1.06k
3214
1.06k
    // Check if we are performing code completion.
3215
1.06k
    if (isCodeCompletionPoint(CurPtr-1)) {
3216
1.06k
      // Return the code-completion token.
3217
1.06k
      Result.startToken();
3218
1.06k
      FormTokenWithChars(Result, CurPtr, tok::code_completion);
3219
1.06k
      return true;
3220
1.06k
    }
3221
5
3222
5
    if (!isLexingRawMode())
3223
2
      Diag(CurPtr-1, diag::null_in_file);
3224
5
    Result.setFlag(Token::LeadingSpace);
3225
5
    if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3226
0
      return true; // KeepWhitespaceMode
3227
5
3228
5
    // We know the lexer hasn't changed, so just try again with this lexer.
3229
5
    // (We manually eliminate the tail call to avoid recursion.)
3230
5
    goto LexNextToken;
3231
5
3232
5
  case 26:  // DOS & CP/M EOF: "^Z".
3233
1
    // If we're in Microsoft extensions mode, treat this as end of file.
3234
1
    if (LangOpts.MicrosoftExt) {
3235
1
      if (!isLexingRawMode())
3236
1
        Diag(CurPtr-1, diag::ext_ctrl_z_eof_microsoft);
3237
1
      return LexEndOfFile(Result, CurPtr-1);
3238
1
    }
3239
0
3240
0
    // If Microsoft extensions are disabled, this is just random garbage.
3241
0
    Kind = tok::unknown;
3242
0
    break;
3243
0
3244
1.83k
  case '\r':
3245
1.83k
    if (CurPtr[0] == '\n')
3246
1.83k
      (void)getAndAdvanceChar(CurPtr, Result);
3247
1.83k
    LLVM_FALLTHROUGH;
3248
331M
  case '\n':
3249
331M
    // If we are inside a preprocessor directive and we see the end of line,
3250
331M
    // we know we are done with the directive, so return an EOD token.
3251
331M
    if (ParsingPreprocessorDirective) {
3252
67.7M
      // Done parsing the "line".
3253
67.7M
      ParsingPreprocessorDirective = false;
3254
67.7M
3255
67.7M
      // Restore comment saving mode, in case it was disabled for directive.
3256
67.7M
      if (PP)
3257
67.7M
        resetExtendedTokenMode();
3258
67.7M
3259
67.7M
      // Since we consumed a newline, we are back at the start of a line.
3260
67.7M
      IsAtStartOfLine = true;
3261
67.7M
      IsAtPhysicalStartOfLine = true;
3262
67.7M
3263
67.7M
      Kind = tok::eod;
3264
67.7M
      break;
3265
67.7M
    }
3266
263M
3267
263M
    // No leading whitespace seen so far.
3268
263M
    Result.clearFlag(Token::LeadingSpace);
3269
263M
3270
263M
    if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3271
55.2k
      return true; // KeepWhitespaceMode
3272
263M
3273
263M
    // We only saw whitespace, so just try again with this lexer.
3274
263M
    // (We manually eliminate the tail call to avoid recursion.)
3275
263M
    goto LexNextToken;
3276
263M
  case ' ':
3277
4.29M
  case '\t':
3278
4.29M
  case '\f':
3279
4.29M
  case '\v':
3280
6.94M
  SkipHorizontalWhitespace:
3281
6.94M
    Result.setFlag(Token::LeadingSpace);
3282
6.94M
    if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3283
700
      return true; // KeepWhitespaceMode
3284
26.8M
3285
26.8M
  SkipIgnoredUnits:
3286
26.8M
    CurPtr = BufferPtr;
3287
26.8M
3288
26.8M
    // If the next token is obviously a // or /* */ comment, skip it efficiently
3289
26.8M
    // too (without going through the big switch stmt).
3290
26.8M
    if (CurPtr[0] == '/' && 
CurPtr[1] == '/'14.6M
&&
!inKeepCommentMode()14.6M
&&
3291
26.8M
        
LangOpts.LineComment14.6M
&&
3292
26.8M
        
(14.6M
LangOpts.CPlusPlus14.6M
||
!LangOpts.TraditionalCPP9.80M
)) {
3293
14.6M
      if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
3294
0
        return true; // There is a token to return.
3295
14.6M
      goto SkipIgnoredUnits;
3296
14.6M
    } else 
if (12.2M
CurPtr[0] == '/'12.2M
&&
CurPtr[1] == '*'14.6k
&&
!inKeepCommentMode()12.0k
) {
3297
12.0k
      if (SkipBlockComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
3298
0
        return true; // There is a token to return.
3299
12.0k
      goto SkipIgnoredUnits;
3300
12.2M
    } else if (isHorizontalWhitespace(*CurPtr)) {
3301
2.64M
      goto SkipHorizontalWhitespace;
3302
2.64M
    }
3303
9.59M
    // We only saw whitespace, so just try again with this lexer.
3304
9.59M
    // (We manually eliminate the tail call to avoid recursion.)
3305
9.59M
    goto LexNextToken;
3306
9.59M
3307
9.59M
  // C99 6.4.4.1: Integer Constants.
3308
9.59M
  // C99 6.4.4.2: Floating Constants.
3309
72.4M
  case '0': case '1': case '2': case '3': case '4':
3310
72.4M
  case '5': case '6': case '7': case '8': case '9':
3311
72.4M
    // Notify MIOpt that we read a non-whitespace/non-comment token.
3312
72.4M
    MIOpt.ReadToken();
3313
72.4M
    return LexNumericConstant(Result, CurPtr);
3314
72.4M
3315
72.4M
  case 'u':   // Identifier (uber) or C11/C++11 UTF-8 or UTF-16 string literal
3316
9.72M
    // Notify MIOpt that we read a non-whitespace/non-comment token.
3317
9.72M
    MIOpt.ReadToken();
3318
9.72M
3319
9.72M
    if (LangOpts.CPlusPlus11 || 
LangOpts.C115.96M
) {
3320
9.19M
      Char = getCharAndSize(CurPtr, SizeTmp);
3321
9.19M
3322
9.19M
      // UTF-16 string literal
3323
9.19M
      if (Char == '"')
3324
176
        return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3325
176
                                tok::utf16_string_literal);
3326
9.19M
3327
9.19M
      // UTF-16 character constant
3328
9.19M
      if (Char == '\'')
3329
100
        return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3330
100
                               tok::utf16_char_constant);
3331
9.19M
3332
9.19M
      // UTF-16 raw string literal
3333
9.19M
      if (Char == 'R' && 
LangOpts.CPlusPlus1130
&&
3334
9.19M
          
getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"'22
)
3335
20
        return LexRawStringLiteral(Result,
3336
20
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3337
20
                                           SizeTmp2, Result),
3338
20
                               tok::utf16_string_literal);
3339
9.19M
3340
9.19M
      if (Char == '8') {
3341
5.03k
        char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2);
3342
5.03k
3343
5.03k
        // UTF-8 string literal
3344
5.03k
        if (Char2 == '"')
3345
126
          return LexStringLiteral(Result,
3346
126
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3347
126
                                           SizeTmp2, Result),
3348
126
                               tok::utf8_string_literal);
3349
4.90k
        if (Char2 == '\'' && 
LangOpts.CPlusPlus1749
)
3350
37
          return LexCharConstant(
3351
37
              Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3352
37
                                  SizeTmp2, Result),
3353
37
              tok::utf8_char_constant);
3354
4.87k
3355
4.87k
        if (Char2 == 'R' && 
LangOpts.CPlusPlus1134
) {
3356
26
          unsigned SizeTmp3;
3357
26
          char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
3358
26
          // UTF-8 raw string literal
3359
26
          if (Char3 == '"') {
3360
24
            return LexRawStringLiteral(Result,
3361
24
                   ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3362
24
                                           SizeTmp2, Result),
3363
24
                               SizeTmp3, Result),
3364
24
                   tok::utf8_string_literal);
3365
24
          }
3366
9.72M
        }
3367
4.87k
      }
3368
9.19M
    }
3369
9.72M
3370
9.72M
    // treat u like the start of an identifier.
3371
9.72M
    return LexIdentifier(Result, CurPtr);
3372
9.72M
3373
9.72M
  case 'U':   // Identifier (Uber) or C11/C++11 UTF-32 string literal
3374
2.76M
    // Notify MIOpt that we read a non-whitespace/non-comment token.
3375
2.76M
    MIOpt.ReadToken();
3376
2.76M
3377
2.76M
    if (LangOpts.CPlusPlus11 || 
LangOpts.C112.09M
) {
3378
2.72M
      Char = getCharAndSize(CurPtr, SizeTmp);
3379
2.72M
3380
2.72M
      // UTF-32 string literal
3381
2.72M
      if (Char == '"')
3382
175
        return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3383
175
                                tok::utf32_string_literal);
3384
2.72M
3385
2.72M
      // UTF-32 character constant
3386
2.72M
      if (Char == '\'')
3387
91
        return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3388
91
                               tok::utf32_char_constant);
3389
2.72M
3390
2.72M
      // UTF-32 raw string literal
3391
2.72M
      if (Char == 'R' && 
LangOpts.CPlusPlus1177.7k
&&
3392
2.72M
          
getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"'7.51k
)
3393
22
        return LexRawStringLiteral(Result,
3394
22
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3395
22
                                           SizeTmp2, Result),
3396
22
                               tok::utf32_string_literal);
3397
2.76M
    }
3398
2.76M
3399
2.76M
    // treat U like the start of an identifier.
3400
2.76M
    return LexIdentifier(Result, CurPtr);
3401
2.76M
3402
2.76M
  case 'R': // Identifier or C++0x raw string literal
3403
734k
    // Notify MIOpt that we read a non-whitespace/non-comment token.
3404
734k
    MIOpt.ReadToken();
3405
734k
3406
734k
    if (LangOpts.CPlusPlus11) {
3407
181k
      Char = getCharAndSize(CurPtr, SizeTmp);
3408
181k
3409
181k
      if (Char == '"')
3410
381
        return LexRawStringLiteral(Result,
3411
381
                                   ConsumeChar(CurPtr, SizeTmp, Result),
3412
381
                                   tok::string_literal);
3413
733k
    }
3414
733k
3415
733k
    // treat R like the start of an identifier.
3416
733k
    return LexIdentifier(Result, CurPtr);
3417
733k
3418
798k
  case 'L':   // Identifier (Loony) or wide literal (L'x' or L"xyz").
3419
798k
    // Notify MIOpt that we read a non-whitespace/non-comment token.
3420
798k
    MIOpt.ReadToken();
3421
798k
    Char = getCharAndSize(CurPtr, SizeTmp);
3422
798k
3423
798k
    // Wide string literal.
3424
798k
    if (Char == '"')
3425
1.70k
      return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3426
1.70k
                              tok::wide_string_literal);
3427
796k
3428
796k
    // Wide raw string literal.
3429
796k
    if (LangOpts.CPlusPlus11 && 
Char == 'R'287k
&&
3430
796k
        
getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"'119
)
3431
18
      return LexRawStringLiteral(Result,
3432
18
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3433
18
                                           SizeTmp2, Result),
3434
18
                               tok::wide_string_literal);
3435
796k
3436
796k
    // Wide character constant.
3437
796k
    if (Char == '\'')
3438
1.06k
      return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3439
1.06k
                             tok::wide_char_constant);
3440
795k
    // FALL THROUGH, treating L like the start of an identifier.
3441
795k
    LLVM_FALLTHROUGH;
3442
795k
3443
795k
  // C99 6.4.2: Identifiers.
3444
947M
  case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
3445
947M
  case 'H': case 'I': case 'J': case 'K':    /*'L'*/case 'M': case 'N':
3446
947M
  case 'O': case 'P': case 'Q':    /*'R'*/case 'S': case 'T':    /*'U'*/
3447
947M
  case 'V': case 'W': case 'X': case 'Y': case 'Z':
3448
947M
  case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
3449
947M
  case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
3450
947M
  case 'o': case 'p': case 'q': case 'r': case 's': case 't':    /*'u'*/
3451
947M
  case 'v': case 'w': case 'x': case 'y': case 'z':
3452
947M
  case '_':
3453
947M
    // Notify MIOpt that we read a non-whitespace/non-comment token.
3454
947M
    MIOpt.ReadToken();
3455
947M
    return LexIdentifier(Result, CurPtr);
3456
947M
3457
947M
  case '$':   // $ in identifiers.
3458
29.8k
    if (LangOpts.DollarIdents) {
3459
29.7k
      if (!isLexingRawMode())
3460
29.5k
        Diag(CurPtr-1, diag::ext_dollar_in_identifier);
3461
29.7k
      // Notify MIOpt that we read a non-whitespace/non-comment token.
3462
29.7k
      MIOpt.ReadToken();
3463
29.7k
      return LexIdentifier(Result, CurPtr);
3464
29.7k
    }
3465
13
3466
13
    Kind = tok::unknown;
3467
13
    break;
3468
13
3469
13
  // C99 6.4.4: Character Constants.
3470
1.47M
  case '\'':
3471
1.47M
    // Notify MIOpt that we read a non-whitespace/non-comment token.
3472
1.47M
    MIOpt.ReadToken();
3473
1.47M
    return LexCharConstant(Result, CurPtr, tok::char_constant);
3474
13
3475
13
  // C99 6.4.5: String Literals.
3476
9.65M
  case '"':
3477
9.65M
    // Notify MIOpt that we read a non-whitespace/non-comment token.
3478
9.65M
    MIOpt.ReadToken();
3479
9.65M
    return LexStringLiteral(Result, CurPtr,
3480
9.65M
                            ParsingFilename ? 
tok::header_name87.8k
3481
9.65M
                                            : 
tok::string_literal9.56M
);
3482
13
3483
13
  // C99 6.4.6: Punctuators.
3484
308k
  case '?':
3485
308k
    Kind = tok::question;
3486
308k
    break;
3487
2.34M
  case '[':
3488
2.34M
    Kind = tok::l_square;
3489
2.34M
    break;
3490
2.34M
  case ']':
3491
2.34M
    Kind = tok::r_square;
3492
2.34M
    break;
3493
183M
  case '(':
3494
183M
    Kind = tok::l_paren;
3495
183M
    break;
3496
197M
  case ')':
3497
197M
    Kind = tok::r_paren;
3498
197M
    break;
3499
13.6M
  case '{':
3500
13.6M
    Kind = tok::l_brace;
3501
13.6M
    break;
3502
13.6M
  case '}':
3503
13.6M
    Kind = tok::r_brace;
3504
13.6M
    break;
3505
5.72M
  case '.':
3506
5.72M
    Char = getCharAndSize(CurPtr, SizeTmp);
3507
5.72M
    if (Char >= '0' && 
Char <= '9'3.79M
) {
3508
6.28k
      // Notify MIOpt that we read a non-whitespace/non-comment token.
3509
6.28k
      MIOpt.ReadToken();
3510
6.28k
3511
6.28k
      return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
3512
5.72M
    } else if (LangOpts.CPlusPlus && 
Char == '*'4.14M
) {
3513
26.4k
      Kind = tok::periodstar;
3514
26.4k
      CurPtr += SizeTmp;
3515
5.69M
    } else if (Char == '.' &&
3516
5.69M
               
getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.'1.82M
) {
3517
1.81M
      Kind = tok::ellipsis;
3518
1.81M
      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3519
1.81M
                           SizeTmp2, Result);
3520
3.87M
    } else {
3521
3.87M
      Kind = tok::period;
3522
3.87M
    }
3523
5.72M
    
break5.72M
;
3524
5.72M
  case '&':
3525
5.69M
    Char = getCharAndSize(CurPtr, SizeTmp);
3526
5.69M
    if (Char == '&') {
3527
2.14M
      Kind = tok::ampamp;
3528
2.14M
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3529
3.55M
    } else if (Char == '=') {
3530
32.3k
      Kind = tok::ampequal;
3531
32.3k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3532
3.51M
    } else {
3533
3.51M
      Kind = tok::amp;
3534
3.51M
    }
3535
5.69M
    break;
3536
14.9M
  case '*':
3537
14.9M
    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
3538
14.3k
      Kind = tok::starequal;
3539
14.3k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3540
14.9M
    } else {
3541
14.9M
      Kind = tok::star;
3542
14.9M
    }
3543
14.9M
    break;
3544
5.72M
  case '+':
3545
2.51M
    Char = getCharAndSize(CurPtr, SizeTmp);
3546
2.51M
    if (Char == '+') {
3547
728k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3548
728k
      Kind = tok::plusplus;
3549
1.78M
    } else if (Char == '=') {
3550
161k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3551
161k
      Kind = tok::plusequal;
3552
1.62M
    } else {
3553
1.62M
      Kind = tok::plus;
3554
1.62M
    }
3555
2.51M
    break;
3556
6.48M
  case '-':
3557
6.48M
    Char = getCharAndSize(CurPtr, SizeTmp);
3558
6.48M
    if (Char == '-') {      // --
3559
203k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3560
203k
      Kind = tok::minusminus;
3561
6.28M
    } else if (Char == '>' && 
LangOpts.CPlusPlus1.15M
&&
3562
6.28M
               
getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*'992k
) { // C++ ->*
3563
4.01k
      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3564
4.01k
                           SizeTmp2, Result);
3565
4.01k
      Kind = tok::arrowstar;
3566
6.28M
    } else if (Char == '>') {   // ->
3567
1.14M
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3568
1.14M
      Kind = tok::arrow;
3569
5.13M
    } else if (Char == '=') {   // -=
3570
84.1k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3571
84.1k
      Kind = tok::minusequal;
3572
5.04M
    } else {
3573
5.04M
      Kind = tok::minus;
3574
5.04M
    }
3575
6.48M
    break;
3576
5.72M
  case '~':
3577
303k
    Kind = tok::tilde;
3578
303k
    break;
3579
5.72M
  case '!':
3580
2.40M
    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
3581
516k
      Kind = tok::exclaimequal;
3582
516k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3583
1.89M
    } else {
3584
1.89M
      Kind = tok::exclaim;
3585
1.89M
    }
3586
2.40M
    break;
3587
50.5M
  case '/':
3588
50.5M
    // 6.4.9: Comments
3589
50.5M
    Char = getCharAndSize(CurPtr, SizeTmp);
3590
50.5M
    if (Char == '/') {         // Line comment.
3591
5.32M
      // Even if Line comments are disabled (e.g. in C89 mode), we generally
3592
5.32M
      // want to lex this as a comment.  There is one problem with this though,
3593
5.32M
      // that in one particular corner case, this can change the behavior of the
3594
5.32M
      // resultant program.  For example, In  "foo //**/ bar", C89 would lex
3595
5.32M
      // this as "foo / bar" and languages with Line comments would lex it as
3596
5.32M
      // "foo".  Check to see if the character after the second slash is a '*'.
3597
5.32M
      // If so, we will lex that as a "/" instead of the start of a comment.
3598
5.32M
      // However, we never do this if we are just preprocessing.
3599
5.32M
      bool TreatAsComment = LangOpts.LineComment &&
3600
5.32M
                            
(5.31M
LangOpts.CPlusPlus5.31M
||
!LangOpts.TraditionalCPP2.40M
);
3601
5.32M
      if (!TreatAsComment)
3602
5.67k
        if (!(PP && 
PP->isPreprocessedOutput()5.39k
))
3603
5.58k
          TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*';
3604
5.32M
3605
5.32M
      if (TreatAsComment) {
3606
5.32M
        if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3607
5.32M
                            TokAtPhysicalStartOfLine))
3608
23.3k
          return true; // There is a token to return.
3609
5.29M
3610
5.29M
        // It is common for the tokens immediately after a // comment to be
3611
5.29M
        // whitespace (indentation for the next line).  Instead of going through
3612
5.29M
        // the big switch, handle it efficiently now.
3613
5.29M
        goto SkipIgnoredUnits;
3614
5.29M
      }
3615
5.32M
    }
3616
45.1M
3617
45.1M
    if (Char == '*') {  // /**/ comment.
3618
44.3M
      if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3619
44.3M
                           TokAtPhysicalStartOfLine))
3620
1.81k
        return true; // There is a token to return.
3621
44.3M
3622
44.3M
      // We only saw whitespace, so just try again with this lexer.
3623
44.3M
      // (We manually eliminate the tail call to avoid recursion.)
3624
44.3M
      goto LexNextToken;
3625
44.3M
    }
3626
841k
3627
841k
    if (Char == '=') {
3628
8.32k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3629
8.32k
      Kind = tok::slashequal;
3630
832k
    } else {
3631
832k
      Kind = tok::slash;
3632
832k
    }
3633
841k
    break;
3634
841k
  case '%':
3635
57.3k
    Char = getCharAndSize(CurPtr, SizeTmp);
3636
57.3k
    if (Char == '=') {
3637
3.18k
      Kind = tok::percentequal;
3638
3.18k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3639
54.1k
    } else if (LangOpts.Digraphs && 
Char == '>'49.5k
) {
3640
10
      Kind = tok::r_brace;                             // '%>' -> '}'
3641
10
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3642
54.1k
    } else if (LangOpts.Digraphs && 
Char == ':'49.4k
) {
3643
15
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3644
15
      Char = getCharAndSize(CurPtr, SizeTmp);
3645
15
      if (Char == '%' && 
getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':'0
) {
3646
0
        Kind = tok::hashhash;                          // '%:%:' -> '##'
3647
0
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3648
0
                             SizeTmp2, Result);
3649
15
      } else if (Char == '@' && 
LangOpts.MicrosoftExt0
) {// %:@ -> #@ -> Charize
3650
0
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3651
0
        if (!isLexingRawMode())
3652
0
          Diag(BufferPtr, diag::ext_charize_microsoft);
3653
0
        Kind = tok::hashat;
3654
15
      } else {                                         // '%:' -> '#'
3655
15
        // We parsed a # character.  If this occurs at the start of the line,
3656
15
        // it's actually the start of a preprocessing directive.  Callback to
3657
15
        // the preprocessor to handle it.
3658
15
        // TODO: -fpreprocessed mode??
3659
15
        if (TokAtPhysicalStartOfLine && !LexingRawMode && 
!Is_PragmaLexer12
)
3660
12
          goto HandleDirective;
3661
3
3662
3
        Kind = tok::hash;
3663
3
      }
3664
54.1k
    } else {
3665
54.1k
      Kind = tok::percent;
3666
54.1k
    }
3667
57.3k
    
break57.3k
;
3668
20.3M
  case '<':
3669
20.3M
    Char = getCharAndSize(CurPtr, SizeTmp);
3670
20.3M
    if (ParsingFilename) {
3671
2.04M
      return LexAngledStringLiteral(Result, CurPtr);
3672
18.2M
    } else if (Char == '<') {
3673
939k
      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
3674
939k
      if (After == '=') {
3675
5.27k
        Kind = tok::lesslessequal;
3676
5.27k
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3677
5.27k
                             SizeTmp2, Result);
3678
934k
      } else if (After == '<' && 
IsStartOfConflictMarker(CurPtr-1)167
) {
3679
2
        // If this is actually a '<<<<<<<' version control conflict marker,
3680
2
        // recognize it as such and recover nicely.
3681
2
        goto LexNextToken;
3682
934k
      } else if (After == '<' && 
HandleEndOfConflictMarker(CurPtr-1)165
) {
3683
0
        // If this is '<<<<' and we're in a Perforce-style conflict marker,
3684
0
        // ignore it.
3685
0
        goto LexNextToken;
3686
934k
      } else if (LangOpts.CUDA && 
After == '<'68
) {
3687
68
        Kind = tok::lesslessless;
3688
68
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3689
68
                             SizeTmp2, Result);
3690
934k
      } else {
3691
934k
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3692
934k
        Kind = tok::lessless;
3693
934k
      }
3694
17.3M
    } else if (Char == '=') {
3695
176k
      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
3696
176k
      if (After == '>') {
3697
1.26k
        if (getLangOpts().CPlusPlus2a) {
3698
871
          if (!isLexingRawMode())
3699
813
            Diag(BufferPtr, diag::warn_cxx17_compat_spaceship);
3700
871
          CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3701
871
                               SizeTmp2, Result);
3702
871
          Kind = tok::spaceship;
3703
871
          break;
3704
871
        }
3705
392
        // Suggest adding a space between the '<=' and the '>' to avoid a
3706
392
        // change in semantics if this turns up in C++ <=17 mode.
3707
392
        if (getLangOpts().CPlusPlus && !isLexingRawMode()) {
3708
9
          Diag(BufferPtr, diag::warn_cxx2a_compat_spaceship)
3709
9
            << FixItHint::CreateInsertion(
3710
9
                   getSourceLocation(CurPtr + SizeTmp, SizeTmp2), " ");
3711
9
        }
3712
392
      }
3713
176k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3714
175k
      Kind = tok::lessequal;
3715
17.1M
    } else if (LangOpts.Digraphs && 
Char == ':'16.9M
) { // '<:' -> '['
3716
82
      if (LangOpts.CPlusPlus11 &&
3717
82
          
getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':'51
) {
3718
36
        // C++0x [lex.pptoken]p3:
3719
36
        //  Otherwise, if the next three characters are <:: and the subsequent
3720
36
        //  character is neither : nor >, the < is treated as a preprocessor
3721
36
        //  token by itself and not as the first character of the alternative
3722
36
        //  token <:.
3723
36
        unsigned SizeTmp3;
3724
36
        char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
3725
36
        if (After != ':' && 
After != '>'35
) {
3726
34
          Kind = tok::less;
3727
34
          if (!isLexingRawMode())
3728
31
            Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon);
3729
34
          break;
3730
34
        }
3731
48
      }
3732
48
3733
48
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3734
48
      Kind = tok::l_square;
3735
17.1M
    } else if (LangOpts.Digraphs && 
Char == '%'16.9M
) { // '<%' -> '{'
3736
9
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3737
9
      Kind = tok::l_brace;
3738
17.1M
    } else if (Char == '#' && /*Not a trigraph*/ 
SizeTmp == 147
&&
3739
17.1M
               
lexEditorPlaceholder(Result, CurPtr)45
) {
3740
43
      return true;
3741
17.1M
    } else {
3742
17.1M
      Kind = tok::less;
3743
17.1M
    }
3744
20.3M
    
break18.2M
;
3745
20.3M
  case '>':
3746
16.0M
    Char = getCharAndSize(CurPtr, SizeTmp);
3747
16.0M
    if (Char == '=') {
3748
1.08M
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3749
1.08M
      Kind = tok::greaterequal;
3750
14.9M
    } else if (Char == '>') {
3751
273k
      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
3752
273k
      if (After == '=') {
3753
4.51k
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3754
4.51k
                             SizeTmp2, Result);
3755
4.51k
        Kind = tok::greatergreaterequal;
3756
268k
      } else if (After == '>' && 
IsStartOfConflictMarker(CurPtr-1)7.18k
) {
3757
2
        // If this is actually a '>>>>' conflict marker, recognize it as such
3758
2
        // and recover nicely.
3759
2
        goto LexNextToken;
3760
268k
      } else if (After == '>' && 
HandleEndOfConflictMarker(CurPtr-1)7.18k
) {
3761
0
        // If this is '>>>>>>>' and we're in a conflict marker, ignore it.
3762
0
        goto LexNextToken;
3763
268k
      } else if (LangOpts.CUDA && 
After == '>'84
) {
3764
82
        Kind = tok::greatergreatergreater;
3765
82
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3766
82
                             SizeTmp2, Result);
3767
268k
      } else {
3768
268k
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3769
268k
        Kind = tok::greatergreater;
3770
268k
      }
3771
14.6M
    } else {
3772
14.6M
      Kind = tok::greater;
3773
14.6M
    }
3774
16.0M
    
break16.0M
;
3775
16.0M
  case '^':
3776
212k
    Char = getCharAndSize(CurPtr, SizeTmp);
3777
212k
    if (Char == '=') {
3778
19.1k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3779
19.1k
      Kind = tok::caretequal;
3780
193k
    } else if (LangOpts.OpenCL && 
Char == '^'345
) {
3781
1
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3782
1
      Kind = tok::caretcaret;
3783
193k
    } else {
3784
193k
      Kind = tok::caret;
3785
193k
    }
3786
212k
    break;
3787
16.0M
  case '|':
3788
1.56M
    Char = getCharAndSize(CurPtr, SizeTmp);
3789
1.56M
    if (Char == '=') {
3790
57.0k
      Kind = tok::pipeequal;
3791
57.0k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3792
1.50M
    } else if (Char == '|') {
3793
1.02M
      // If this is '|||||||' and we're in a conflict marker, ignore it.
3794
1.02M
      if (CurPtr[1] == '|' && 
HandleEndOfConflictMarker(CurPtr-1)19
)
3795
1
        goto LexNextToken;
3796
1.02M
      Kind = tok::pipepipe;
3797
1.02M
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3798
1.02M
    } else {
3799
482k
      Kind = tok::pipe;
3800
482k
    }
3801
1.56M
    
break1.56M
;
3802
12.2M
  case ':':
3803
12.2M
    Char = getCharAndSize(CurPtr, SizeTmp);
3804
12.2M
    if (LangOpts.Digraphs && 
Char == '>'12.1M
) {
3805
21
      Kind = tok::r_square; // ':>' -> ']'
3806
21
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3807
12.2M
    } else if ((LangOpts.CPlusPlus ||
3808
12.2M
                
LangOpts.DoubleSquareBracketAttributes2.60M
) &&
3809
12.2M
               
Char == ':'9.66M
) {
3810
7.34M
      Kind = tok::coloncolon;
3811
7.34M
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3812
7.34M
    } else {
3813
4.92M
      Kind = tok::colon;
3814
4.92M
    }
3815
12.2M
    break;
3816
44.6M
  case ';':
3817
44.6M
    Kind = tok::semi;
3818
44.6M
    break;
3819
40.5M
  case '=':
3820
40.5M
    Char = getCharAndSize(CurPtr, SizeTmp);
3821
40.5M
    if (Char == '=') {
3822
867k
      // If this is '====' and we're in a conflict marker, ignore it.
3823
867k
      if (CurPtr[1] == '=' && 
HandleEndOfConflictMarker(CurPtr-1)38
)
3824
2
        goto LexNextToken;
3825
867k
3826
867k
      Kind = tok::equalequal;
3827
867k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3828
39.6M
    } else {
3829
39.6M
      Kind = tok::equal;
3830
39.6M
    }
3831
40.5M
    
break40.5M
;
3832
94.1M
  case ',':
3833
94.1M
    Kind = tok::comma;
3834
94.1M
    break;
3835
186M
  case '#':
3836
186M
    Char = getCharAndSize(CurPtr, SizeTmp);
3837
186M
    if (Char == '#') {
3838
242k
      Kind = tok::hashhash;
3839
242k
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3840
186M
    } else if (Char == '@' && 
LangOpts.MicrosoftExt3
) { // #@ -> Charize
3841
3
      Kind = tok::hashat;
3842
3
      if (!isLexingRawMode())
3843
3
        Diag(BufferPtr, diag::ext_charize_microsoft);
3844
3
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3845
186M
    } else {
3846
186M
      // We parsed a # character.  If this occurs at the start of the line,
3847
186M
      // it's actually the start of a preprocessing directive.  Callback to
3848
186M
      // the preprocessor to handle it.
3849
186M
      // TODO: -fpreprocessed mode??
3850
186M
      if (TokAtPhysicalStartOfLine && 
!LexingRawMode186M
&&
!Is_PragmaLexer48.9M
)
3851
48.9M
        goto HandleDirective;
3852
137M
3853
137M
      Kind = tok::hash;
3854
137M
    }
3855
186M
    
break137M
;
3856
186M
3857
186M
  case '@':
3858
1.71M
    // Objective C support.
3859
1.71M
    if (CurPtr[-1] == '@' && LangOpts.ObjC)
3860
1.71M
      Kind = tok::at;
3861
1.47k
    else
3862
1.47k
      Kind = tok::unknown;
3863
1.71M
    break;
3864
186M
3865
186M
  // UCNs (C99 6.4.3, C++11 [lex.charset]p2)
3866
186M
  case '\\':
3867
777
    if (!LangOpts.AsmPreprocessor) {
3868
767
      if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) {
3869
93
        if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
3870
0
          if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3871
0
            return true; // KeepWhitespaceMode
3872
0
3873
0
          // We only saw whitespace, so just try again with this lexer.
3874
0
          // (We manually eliminate the tail call to avoid recursion.)
3875
0
          goto LexNextToken;
3876
0
        }
3877
93
3878
93
        return LexUnicode(Result, CodePoint, CurPtr);
3879
93
      }
3880
767
    }
3881
684
3882
684
    Kind = tok::unknown;
3883
684
    break;
3884
684
3885
684
  default: {
3886
332
    if (isASCII(Char)) {
3887
165
      Kind = tok::unknown;
3888
165
      break;
3889
165
    }
3890
167
3891
167
    llvm::UTF32 CodePoint;
3892
167
3893
167
    // We can't just reset CurPtr to BufferPtr because BufferPtr may point to
3894
167
    // an escaped newline.
3895
167
    --CurPtr;
3896
167
    llvm::ConversionResult Status =
3897
167
        llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr,
3898
167
                                  (const llvm::UTF8 *)BufferEnd,
3899
167
                                  &CodePoint,
3900
167
                                  llvm::strictConversion);
3901
167
    if (Status == llvm::conversionOK) {
3902
101
      if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
3903
6
        if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3904
0
          return true; // KeepWhitespaceMode
3905
6
3906
6
        // We only saw whitespace, so just try again with this lexer.
3907
6
        // (We manually eliminate the tail call to avoid recursion.)
3908
6
        goto LexNextToken;
3909
6
      }
3910
95
      return LexUnicode(Result, CodePoint, CurPtr);
3911
95
    }
3912
66
3913
66
    if (isLexingRawMode() || 
ParsingPreprocessorDirective4
||
3914
66
        
PP->isPreprocessedOutput()2
) {
3915
65
      ++CurPtr;
3916
65
      Kind = tok::unknown;
3917
65
      break;
3918
65
    }
3919
1
3920
1
    // Non-ASCII characters tend to creep into source code unintentionally.
3921
1
    // Instead of letting the parser complain about the unknown token,
3922
1
    // just diagnose the invalid UTF-8, then drop the character.
3923
1
    Diag(CurPtr, diag::err_invalid_utf8);
3924
1
3925
1
    BufferPtr = CurPtr+1;
3926
1
    // We're pretending the character didn't exist, so just try again with
3927
1
    // this lexer.
3928
1
    // (We manually eliminate the tail call to avoid recursion.)
3929
1
    goto LexNextToken;
3930
1
  }
3931
886M
  }
3932
886M
3933
886M
  // Notify MIOpt that we read a non-whitespace/non-comment token.
3934
886M
  MIOpt.ReadToken();
3935
886M
3936
886M
  // Update the location of token as well as BufferPtr.
3937
886M
  FormTokenWithChars(Result, CurPtr, Kind);
3938
886M
  return true;
3939
48.9M
3940
48.9M
HandleDirective:
3941
48.9M
  // We parsed a # character and it's the start of a preprocessing directive.
3942
48.9M
3943
48.9M
  FormTokenWithChars(Result, CurPtr, tok::hash);
3944
48.9M
  PP->HandleDirective(Result);
3945
48.9M
3946
48.9M
  if (PP->hadModuleLoaderFatalFailure()) {
3947
2
    // With a fatal failure in the module loader, we abort parsing.
3948
2
    assert(Result.is(tok::eof) && "Preprocessor did not set tok:eof");
3949
2
    return true;
3950
2
  }
3951
48.9M
3952
48.9M
  // We parsed the directive; lex a token with the new state.
3953
48.9M
  return false;
3954
48.9M
}