Coverage Report

Created: 2023-05-31 04:38

/Users/buildslave/jenkins/workspace/coverage/llvm-project/clang/lib/AST/CommentLexer.cpp
Line
Count
Source (jump to first uncovered line)
1
//===--- CommentLexer.cpp -------------------------------------------------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
9
#include "clang/AST/CommentLexer.h"
10
#include "clang/AST/CommentCommandTraits.h"
11
#include "clang/AST/CommentDiagnostic.h"
12
#include "clang/Basic/CharInfo.h"
13
#include "llvm/ADT/StringExtras.h"
14
#include "llvm/ADT/StringSwitch.h"
15
#include "llvm/Support/ConvertUTF.h"
16
#include "llvm/Support/ErrorHandling.h"
17
18
namespace clang {
19
namespace comments {
20
21
0
void Token::dump(const Lexer &L, const SourceManager &SM) const {
22
0
  llvm::errs() << "comments::Token Kind=" << Kind << " ";
23
0
  Loc.print(llvm::errs(), SM);
24
0
  llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
25
0
}
26
27
341
static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
28
341
  return isLetter(C);
29
341
}
30
31
165
static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
32
165
  return isDigit(C);
33
165
}
34
35
101
static inline bool isHTMLHexCharacterReferenceCharacter(char C) {
36
101
  return isHexDigit(C);
37
101
}
38
39
static inline StringRef convertCodePointToUTF8(
40
                                      llvm::BumpPtrAllocator &Allocator,
41
36
                                      unsigned CodePoint) {
42
36
  char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
43
36
  char *ResolvedPtr = Resolved;
44
36
  if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
45
36
    return StringRef(Resolved, ResolvedPtr - Resolved);
46
0
  else
47
0
    return StringRef();
48
36
}
49
50
namespace {
51
52
#include "clang/AST/CommentHTMLTags.inc"
53
#include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
54
55
} // end anonymous namespace
56
57
52
StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
58
  // Fast path, first check a few most widely used named character references.
59
52
  return llvm::StringSwitch<StringRef>(Name)
60
52
      .Case("amp", "&")
61
52
      .Case("lt", "<")
62
52
      .Case("gt", ">")
63
52
      .Case("quot", "\"")
64
52
      .Case("apos", "\'")
65
      // Slow path.
66
52
      .Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
67
52
}
68
69
17
StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
70
17
  unsigned CodePoint = 0;
71
67
  for (unsigned i = 0, e = Name.size(); i != e; 
++i50
) {
72
50
    assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
73
50
    CodePoint *= 10;
74
50
    CodePoint += Name[i] - '0';
75
50
  }
76
17
  return convertCodePointToUTF8(Allocator, CodePoint);
77
17
}
78
79
19
StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
80
19
  unsigned CodePoint = 0;
81
57
  for (unsigned i = 0, e = Name.size(); i != e; 
++i38
) {
82
38
    CodePoint *= 16;
83
38
    const char C = Name[i];
84
38
    assert(isHTMLHexCharacterReferenceCharacter(C));
85
38
    CodePoint += llvm::hexDigitValue(C);
86
38
  }
87
19
  return convertCodePointToUTF8(Allocator, CodePoint);
88
19
}
89
90
2.12k
void Lexer::skipLineStartingDecorations() {
91
  // This function should be called only for C comments
92
2.12k
  assert(CommentState == LCS_InsideCComment);
93
94
2.12k
  if (BufferPtr == CommentEnd)
95
381
    return;
96
97
1.74k
  const char *NewBufferPtr = BufferPtr;
98
3.37k
  while (isHorizontalWhitespace(*NewBufferPtr))
99
1.80k
    if (++NewBufferPtr == CommentEnd)
100
183
      return;
101
1.56k
  if (*NewBufferPtr == '*')
102
1.07k
    BufferPtr = NewBufferPtr + 1;
103
1.56k
}
104
105
namespace {
106
/// Returns pointer to the first newline character in the string.
107
2.41k
const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
108
66.5k
  for ( ; BufferPtr != BufferEnd; 
++BufferPtr64.1k
) {
109
64.2k
    if (isVerticalWhitespace(*BufferPtr))
110
126
      return BufferPtr;
111
64.2k
  }
112
2.29k
  return BufferEnd;
113
2.41k
}
114
115
3.80k
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
116
3.80k
  if (BufferPtr == BufferEnd)
117
1.67k
    return BufferPtr;
118
119
2.13k
  if (*BufferPtr == '\n')
120
2.11k
    BufferPtr++;
121
15
  else {
122
15
    assert(*BufferPtr == '\r');
123
15
    BufferPtr++;
124
15
    if (BufferPtr != BufferEnd && 
*BufferPtr == '\n'13
)
125
8
      BufferPtr++;
126
15
  }
127
2.13k
  return BufferPtr;
128
2.13k
}
129
130
const char *skipNamedCharacterReference(const char *BufferPtr,
131
54
                                        const char *BufferEnd) {
132
243
  for ( ; BufferPtr != BufferEnd; 
++BufferPtr189
) {
133
242
    if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
134
53
      return BufferPtr;
135
242
  }
136
1
  return BufferEnd;
137
54
}
138
139
const char *skipDecimalCharacterReference(const char *BufferPtr,
140
19
                                          const char *BufferEnd) {
141
73
  for ( ; BufferPtr != BufferEnd; 
++BufferPtr54
) {
142
72
    if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
143
18
      return BufferPtr;
144
72
  }
145
1
  return BufferEnd;
146
19
}
147
148
const char *skipHexCharacterReference(const char *BufferPtr,
149
23
                                      const char *BufferEnd) {
150
65
  for ( ; BufferPtr != BufferEnd; 
++BufferPtr42
) {
151
63
    if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
152
21
      return BufferPtr;
153
63
  }
154
2
  return BufferEnd;
155
23
}
156
157
8.68k
bool isHTMLIdentifierStartingCharacter(char C) {
158
8.68k
  return isLetter(C);
159
8.68k
}
160
161
21.9k
bool isHTMLIdentifierCharacter(char C) {
162
21.9k
  return isAlphanumeric(C);
163
21.9k
}
164
165
4.99k
const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
166
21.4k
  for ( ; BufferPtr != BufferEnd; 
++BufferPtr16.4k
) {
167
21.4k
    if (!isHTMLIdentifierCharacter(*BufferPtr))
168
4.98k
      return BufferPtr;
169
21.4k
  }
170
11
  return BufferEnd;
171
4.99k
}
172
173
/// Skip HTML string quoted in single or double quotes.  Escaping quotes inside
174
/// string allowed.
175
///
176
/// Returns pointer to closing quote.
177
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
178
76
{
179
76
  const char Quote = *BufferPtr;
180
76
  assert(Quote == '\"' || Quote == '\'');
181
182
76
  BufferPtr++;
183
494
  for ( ; BufferPtr != BufferEnd; 
++BufferPtr418
) {
184
488
    const char C = *BufferPtr;
185
488
    if (C == Quote && 
BufferPtr[-1] != '\\'76
)
186
70
      return BufferPtr;
187
488
  }
188
6
  return BufferEnd;
189
76
}
190
191
2.43k
const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
192
2.89k
  for ( ; BufferPtr != BufferEnd; 
++BufferPtr462
) {
193
2.62k
    if (!isWhitespace(*BufferPtr))
194
2.16k
      return BufferPtr;
195
2.62k
  }
196
272
  return BufferEnd;
197
2.43k
}
198
199
280
bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
200
280
  return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
201
280
}
202
203
11.9k
bool isCommandNameStartCharacter(char C) {
204
11.9k
  return isLetter(C);
205
11.9k
}
206
207
76.0k
bool isCommandNameCharacter(char C) {
208
76.0k
  return isAlphanumeric(C);
209
76.0k
}
210
211
11.9k
const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
212
76.9k
  for ( ; BufferPtr != BufferEnd; 
++BufferPtr65.0k
) {
213
76.0k
    if (!isCommandNameCharacter(*BufferPtr))
214
11.0k
      return BufferPtr;
215
76.0k
  }
216
901
  return BufferEnd;
217
11.9k
}
218
219
/// Return the one past end pointer for BCPL comments.
220
/// Handles newlines escaped with backslash or trigraph for backslahs.
221
31.6k
const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
222
31.6k
  const char *CurPtr = BufferPtr;
223
31.6k
  while (CurPtr != BufferEnd) {
224
1.12M
    while (!isVerticalWhitespace(*CurPtr)) {
225
1.09M
      CurPtr++;
226
1.09M
      if (CurPtr == BufferEnd)
227
3.86k
        return BufferEnd;
228
1.09M
    }
229
    // We found a newline, check if it is escaped.
230
27.7k
    const char *EscapePtr = CurPtr - 1;
231
27.7k
    while(isHorizontalWhitespace(*EscapePtr))
232
12
      EscapePtr--;
233
234
27.7k
    if (*EscapePtr == '\\' ||
235
27.7k
        
(27.7k
EscapePtr - 2 >= BufferPtr27.7k
&&
EscapePtr[0] == '/'21.9k
&&
236
27.7k
         
EscapePtr[-1] == '?'3
&&
EscapePtr[-2] == '?'3
)) {
237
      // We found an escaped newline.
238
9
      CurPtr = skipNewline(CurPtr, BufferEnd);
239
9
    } else
240
27.7k
      return CurPtr; // Not an escaped newline.
241
27.7k
  }
242
42
  return BufferEnd;
243
31.6k
}
244
245
/// Return the one past end pointer for C comments.
246
/// Very dumb, does not handle escaped newlines or trigraphs.
247
771
const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
248
43.6k
  for ( ; BufferPtr != BufferEnd; 
++BufferPtr42.8k
) {
249
43.6k
    if (*BufferPtr == '*') {
250
1.86k
      assert(BufferPtr + 1 != BufferEnd);
251
1.86k
      if (*(BufferPtr + 1) == '/')
252
771
        return BufferPtr;
253
1.86k
    }
254
43.6k
  }
255
0
  llvm_unreachable("buffer end hit before '*/' was seen");
256
0
}
257
258
} // end anonymous namespace
259
260
void Lexer::formTokenWithChars(Token &Result, const char *TokEnd,
261
102k
                               tok::TokenKind Kind) {
262
102k
  const unsigned TokLen = TokEnd - BufferPtr;
263
102k
  Result.setLocation(getSourceLocation(BufferPtr));
264
102k
  Result.setKind(Kind);
265
102k
  Result.setLength(TokLen);
266
102k
#ifndef NDEBUG
267
102k
  Result.TextPtr = "<UNSET>";
268
102k
  Result.IntVal = 7;
269
102k
#endif
270
102k
  BufferPtr = TokEnd;
271
102k
}
272
273
40.3k
const char *Lexer::skipTextToken() {
274
40.3k
  const char *TokenPtr = BufferPtr;
275
40.3k
  assert(TokenPtr < CommentEnd);
276
40.3k
  StringRef TokStartSymbols = ParseCommands ? 
"\n\r\\@\"&<"40.1k
:
"\n\r"155
;
277
278
40.3k
again:
279
40.3k
  size_t End =
280
40.3k
      StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of(TokStartSymbols);
281
40.3k
  if (End == StringRef::npos)
282
22.8k
    return CommentEnd;
283
284
  // Doxygen doesn't recognize any commands in a one-line double quotation.
285
  // If we don't find an ending quotation mark, we pretend it never began.
286
17.4k
  if (*(TokenPtr + End) == '\"') {
287
67
    TokenPtr += End + 1;
288
67
    End = StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of("\n\r\"");
289
67
    if (End != StringRef::npos && 
*(TokenPtr + End) == '\"'53
)
290
53
      TokenPtr += End + 1;
291
67
    goto again;
292
67
  }
293
17.4k
  return TokenPtr + End;
294
17.4k
}
295
296
62.2k
void Lexer::lexCommentText(Token &T) {
297
62.2k
  assert(CommentState == LCS_InsideBCPLComment ||
298
62.2k
         CommentState == LCS_InsideCComment);
299
300
  // Handles lexing non-command text, i.e. text and newline.
301
62.2k
  auto HandleNonCommandToken = [&]() -> void {
302
42.4k
    assert(State == LS_Normal);
303
304
42.4k
    const char *TokenPtr = BufferPtr;
305
42.4k
    assert(TokenPtr < CommentEnd);
306
42.4k
    switch (*TokenPtr) {
307
2.08k
      case '\n':
308
2.08k
      case '\r':
309
2.08k
          TokenPtr = skipNewline(TokenPtr, CommentEnd);
310
2.08k
          formTokenWithChars(T, TokenPtr, tok::newline);
311
312
2.08k
          if (CommentState == LCS_InsideCComment)
313
2.07k
            skipLineStartingDecorations();
314
2.08k
          return;
315
316
40.3k
      default:
317
40.3k
        return formTextToken(T, skipTextToken());
318
42.4k
    }
319
42.4k
  };
320
321
62.2k
  if (!ParseCommands)
322
173
    return HandleNonCommandToken();
323
324
62.1k
  switch (State) {
325
59.3k
  case LS_Normal:
326
59.3k
    break;
327
288
  case LS_VerbatimBlockFirstLine:
328
288
    lexVerbatimBlockFirstLine(T);
329
288
    return;
330
1.73k
  case LS_VerbatimBlockBody:
331
1.73k
    lexVerbatimBlockBody(T);
332
1.73k
    return;
333
151
  case LS_VerbatimLineText:
334
151
    lexVerbatimLineText(T);
335
151
    return;
336
508
  case LS_HTMLStartTag:
337
508
    lexHTMLStartTag(T);
338
508
    return;
339
118
  case LS_HTMLEndTag:
340
118
    lexHTMLEndTag(T);
341
118
    return;
342
62.1k
  }
343
344
59.3k
  assert(State == LS_Normal);
345
59.3k
  const char *TokenPtr = BufferPtr;
346
59.3k
  assert(TokenPtr < CommentEnd);
347
59.3k
  switch(*TokenPtr) {
348
11.4k
    case '\\':
349
12.0k
    case '@': {
350
      // Commands that start with a backslash and commands that start with
351
      // 'at' have equivalent semantics.  But we keep information about the
352
      // exact syntax in AST for comments.
353
12.0k
      tok::TokenKind CommandKind =
354
12.0k
          (*TokenPtr == '@') ? 
tok::at_command596
:
tok::backslash_command11.4k
;
355
12.0k
      TokenPtr++;
356
12.0k
      if (TokenPtr == CommentEnd) {
357
12
        formTextToken(T, TokenPtr);
358
12
        return;
359
12
      }
360
12.0k
      char C = *TokenPtr;
361
12.0k
      switch (C) {
362
11.9k
      default:
363
11.9k
        break;
364
365
11.9k
      
case '\\': 6
case '@': 12
case '&': 18
case '$':
366
50
      
case '#': 31
case '<': 37
case '>': 43
case '%':
367
68
      
case '\"': 56
case '.': 62
case ':':
368
        // This is one of \\ \@ \& \$ etc escape sequences.
369
68
        TokenPtr++;
370
68
        if (C == ':' && 
TokenPtr != CommentEnd6
&&
*TokenPtr == ':'6
) {
371
          // This is the \:: escape sequence.
372
6
          TokenPtr++;
373
6
        }
374
68
        StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
375
68
        formTokenWithChars(T, TokenPtr, tok::text);
376
68
        T.setText(UnescapedText);
377
68
        return;
378
12.0k
      }
379
380
      // Don't make zero-length commands.
381
11.9k
      if (!isCommandNameStartCharacter(*TokenPtr)) {
382
12
        formTextToken(T, TokenPtr);
383
12
        return;
384
12
      }
385
386
11.9k
      TokenPtr = skipCommandName(TokenPtr, CommentEnd);
387
11.9k
      unsigned Length = TokenPtr - (BufferPtr + 1);
388
389
      // Hardcoded support for lexing LaTeX formula commands
390
      // \f$ \f( \f) \f[ \f] \f{ \f} as a single command.
391
11.9k
      if (Length == 1 && 
TokenPtr[-1] == 'f'2.01k
&&
TokenPtr != CommentEnd34
) {
392
34
        C = *TokenPtr;
393
34
        if (C == '$' || 
C == '('17
||
C == ')'12
||
C == '['12
||
C == ']'7
||
394
34
            
C == '{'7
||
C == '}'2
) {
395
32
          TokenPtr++;
396
32
          Length++;
397
32
        }
398
34
      }
399
400
11.9k
      StringRef CommandName(BufferPtr + 1, Length);
401
402
11.9k
      const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
403
11.9k
      if (!Info) {
404
314
        if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
405
11
          StringRef CorrectedName = Info->Name;
406
11
          SourceLocation Loc = getSourceLocation(BufferPtr);
407
11
          SourceLocation EndLoc = getSourceLocation(TokenPtr);
408
11
          SourceRange FullRange = SourceRange(Loc, EndLoc);
409
11
          SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc);
410
11
          Diag(Loc, diag::warn_correct_comment_command_name)
411
11
            << FullRange << CommandName << CorrectedName
412
11
            << FixItHint::CreateReplacement(CommandRange, CorrectedName);
413
303
        } else {
414
303
          formTokenWithChars(T, TokenPtr, tok::unknown_command);
415
303
          T.setUnknownCommandName(CommandName);
416
303
          Diag(T.getLocation(), diag::warn_unknown_comment_command_name)
417
303
              << SourceRange(T.getLocation(), T.getEndLocation());
418
303
          return;
419
303
        }
420
314
      }
421
11.6k
      if (Info->IsVerbatimBlockCommand) {
422
300
        setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
423
300
        return;
424
300
      }
425
11.3k
      if (Info->IsVerbatimLineCommand) {
426
155
        setupAndLexVerbatimLine(T, TokenPtr, Info);
427
155
        return;
428
155
      }
429
11.2k
      formTokenWithChars(T, TokenPtr, CommandKind);
430
11.2k
      T.setCommandID(Info->getID());
431
11.2k
      return;
432
11.3k
    }
433
434
100
    case '&':
435
100
      lexHTMLCharacterReference(T);
436
100
      return;
437
438
4.90k
    case '<': {
439
4.90k
      TokenPtr++;
440
4.90k
      if (TokenPtr == CommentEnd) {
441
1
        formTextToken(T, TokenPtr);
442
1
        return;
443
1
      }
444
4.90k
      const char C = *TokenPtr;
445
4.90k
      if (isHTMLIdentifierStartingCharacter(C))
446
3.39k
        setupAndLexHTMLStartTag(T);
447
1.50k
      else if (C == '/')
448
1.48k
        setupAndLexHTMLEndTag(T);
449
22
      else
450
22
        formTextToken(T, TokenPtr);
451
4.90k
      return;
452
4.90k
    }
453
454
42.2k
    default:
455
42.2k
      return HandleNonCommandToken();
456
59.3k
  }
457
59.3k
}
458
459
void Lexer::setupAndLexVerbatimBlock(Token &T,
460
                                     const char *TextBegin,
461
300
                                     char Marker, const CommandInfo *Info) {
462
300
  assert(Info->IsVerbatimBlockCommand);
463
464
300
  VerbatimBlockEndCommandName.clear();
465
300
  VerbatimBlockEndCommandName.append(Marker == '\\' ? 
"\\"294
:
"@"6
);
466
300
  VerbatimBlockEndCommandName.append(Info->EndCommandName);
467
468
300
  formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
469
300
  T.setVerbatimBlockID(Info->getID());
470
471
  // If there is a newline following the verbatim opening command, skip the
472
  // newline so that we don't create an tok::verbatim_block_line with empty
473
  // text content.
474
300
  if (BufferPtr != CommentEnd &&
475
300
      
isVerticalWhitespace(*BufferPtr)198
) {
476
10
    BufferPtr = skipNewline(BufferPtr, CommentEnd);
477
10
    State = LS_VerbatimBlockBody;
478
10
    return;
479
10
  }
480
481
290
  State = LS_VerbatimBlockFirstLine;
482
290
}
483
484
2.02k
void Lexer::lexVerbatimBlockFirstLine(Token &T) {
485
2.26k
again:
486
2.26k
  assert(BufferPtr < CommentEnd);
487
488
  // FIXME: It would be better to scan the text once, finding either the block
489
  // end command or newline.
490
  //
491
  // Extract current line.
492
2.26k
  const char *Newline = findNewline(BufferPtr, CommentEnd);
493
2.26k
  StringRef Line(BufferPtr, Newline - BufferPtr);
494
495
  // Look for end command in current line.
496
2.26k
  size_t Pos = Line.find(VerbatimBlockEndCommandName);
497
2.26k
  const char *TextEnd;
498
2.26k
  const char *NextLine;
499
2.26k
  if (Pos == StringRef::npos) {
500
    // Current line is completely verbatim.
501
1.69k
    TextEnd = Newline;
502
1.69k
    NextLine = skipNewline(Newline, CommentEnd);
503
1.69k
  } else 
if (567
Pos == 0567
) {
504
    // Current line contains just an end command.
505
287
    const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
506
287
    StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
507
287
    formTokenWithChars(T, End, tok::verbatim_block_end);
508
287
    T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
509
287
    State = LS_Normal;
510
287
    return;
511
287
  } else {
512
    // There is some text, followed by end command.  Extract text first.
513
280
    TextEnd = BufferPtr + Pos;
514
280
    NextLine = TextEnd;
515
    // If there is only whitespace before end command, skip whitespace.
516
280
    if (isWhitespace(BufferPtr, TextEnd)) {
517
243
      BufferPtr = TextEnd;
518
243
      goto again;
519
243
    }
520
280
  }
521
522
1.73k
  StringRef Text(BufferPtr, TextEnd - BufferPtr);
523
1.73k
  formTokenWithChars(T, NextLine, tok::verbatim_block_line);
524
1.73k
  T.setVerbatimBlockText(Text);
525
526
1.73k
  State = LS_VerbatimBlockBody;
527
1.73k
}
528
529
1.73k
void Lexer::lexVerbatimBlockBody(Token &T) {
530
1.73k
  assert(State == LS_VerbatimBlockBody);
531
532
1.73k
  if (CommentState == LCS_InsideCComment)
533
50
    skipLineStartingDecorations();
534
535
1.73k
  if (BufferPtr == CommentEnd) {
536
4
    formTokenWithChars(T, BufferPtr, tok::verbatim_block_line);
537
4
    T.setVerbatimBlockText("");
538
4
    return;
539
4
  }
540
541
1.73k
  lexVerbatimBlockFirstLine(T);
542
1.73k
}
543
544
void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
545
155
                                    const CommandInfo *Info) {
546
155
  assert(Info->IsVerbatimLineCommand);
547
155
  formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
548
155
  T.setVerbatimLineID(Info->getID());
549
550
155
  State = LS_VerbatimLineText;
551
155
}
552
553
151
void Lexer::lexVerbatimLineText(Token &T) {
554
151
  assert(State == LS_VerbatimLineText);
555
556
  // Extract current line.
557
151
  const char *Newline = findNewline(BufferPtr, CommentEnd);
558
151
  StringRef Text(BufferPtr, Newline - BufferPtr);
559
151
  formTokenWithChars(T, Newline, tok::verbatim_line_text);
560
151
  T.setVerbatimLineText(Text);
561
562
151
  State = LS_Normal;
563
151
}
564
565
100
void Lexer::lexHTMLCharacterReference(Token &T) {
566
100
  const char *TokenPtr = BufferPtr;
567
100
  assert(*TokenPtr == '&');
568
100
  TokenPtr++;
569
100
  if (TokenPtr == CommentEnd) {
570
1
    formTextToken(T, TokenPtr);
571
1
    return;
572
1
  }
573
99
  const char *NamePtr;
574
99
  bool isNamed = false;
575
99
  bool isDecimal = false;
576
99
  char C = *TokenPtr;
577
99
  if (isHTMLNamedCharacterReferenceCharacter(C)) {
578
54
    NamePtr = TokenPtr;
579
54
    TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
580
54
    isNamed = true;
581
54
  } else 
if (45
C == '#'45
) {
582
44
    TokenPtr++;
583
44
    if (TokenPtr == CommentEnd) {
584
1
      formTextToken(T, TokenPtr);
585
1
      return;
586
1
    }
587
43
    C = *TokenPtr;
588
43
    if (isHTMLDecimalCharacterReferenceCharacter(C)) {
589
19
      NamePtr = TokenPtr;
590
19
      TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
591
19
      isDecimal = true;
592
24
    } else if (C == 'x' || 
C == 'X'7
) {
593
23
      TokenPtr++;
594
23
      NamePtr = TokenPtr;
595
23
      TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
596
23
    } else {
597
1
      formTextToken(T, TokenPtr);
598
1
      return;
599
1
    }
600
43
  } else {
601
1
    formTextToken(T, TokenPtr);
602
1
    return;
603
1
  }
604
96
  if (NamePtr == TokenPtr || 
TokenPtr == CommentEnd94
||
605
96
      
*TokenPtr != ';'91
) {
606
8
    formTextToken(T, TokenPtr);
607
8
    return;
608
8
  }
609
88
  StringRef Name(NamePtr, TokenPtr - NamePtr);
610
88
  TokenPtr++; // Skip semicolon.
611
88
  StringRef Resolved;
612
88
  if (isNamed)
613
52
    Resolved = resolveHTMLNamedCharacterReference(Name);
614
36
  else if (isDecimal)
615
17
    Resolved = resolveHTMLDecimalCharacterReference(Name);
616
19
  else
617
19
    Resolved = resolveHTMLHexCharacterReference(Name);
618
619
88
  if (Resolved.empty()) {
620
0
    formTextToken(T, TokenPtr);
621
0
    return;
622
0
  }
623
88
  formTokenWithChars(T, TokenPtr, tok::text);
624
88
  T.setText(Resolved);
625
88
}
626
627
3.39k
void Lexer::setupAndLexHTMLStartTag(Token &T) {
628
3.39k
  assert(BufferPtr[0] == '<' &&
629
3.39k
         isHTMLIdentifierStartingCharacter(BufferPtr[1]));
630
3.39k
  const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
631
3.39k
  StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
632
3.39k
  if (!isHTMLTagName(Name)) {
633
3.13k
    formTextToken(T, TagNameEnd);
634
3.13k
    return;
635
3.13k
  }
636
637
263
  formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
638
263
  T.setHTMLTagStartName(Name);
639
640
263
  BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
641
642
263
  const char C = *BufferPtr;
643
263
  if (BufferPtr != CommentEnd &&
644
263
      
(260
C == '>'260
||
C == '/'145
||
isHTMLIdentifierStartingCharacter(C)112
))
645
250
    State = LS_HTMLStartTag;
646
263
}
647
648
508
void Lexer::lexHTMLStartTag(Token &T) {
649
508
  assert(State == LS_HTMLStartTag);
650
651
508
  const char *TokenPtr = BufferPtr;
652
508
  char C = *TokenPtr;
653
508
  if (isHTMLIdentifierCharacter(C)) {
654
114
    TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
655
114
    StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
656
114
    formTokenWithChars(T, TokenPtr, tok::html_ident);
657
114
    T.setHTMLIdent(Ident);
658
394
  } else {
659
394
    switch (C) {
660
100
    case '=':
661
100
      TokenPtr++;
662
100
      formTokenWithChars(T, TokenPtr, tok::html_equals);
663
100
      break;
664
71
    case '\"':
665
76
    case '\'': {
666
76
      const char *OpenQuote = TokenPtr;
667
76
      TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
668
76
      const char *ClosingQuote = TokenPtr;
669
76
      if (TokenPtr != CommentEnd) // Skip closing quote.
670
70
        TokenPtr++;
671
76
      formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
672
76
      T.setHTMLQuotedString(StringRef(OpenQuote + 1,
673
76
                                      ClosingQuote - (OpenQuote + 1)));
674
76
      break;
675
71
    }
676
169
    case '>':
677
169
      TokenPtr++;
678
169
      formTokenWithChars(T, TokenPtr, tok::html_greater);
679
169
      State = LS_Normal;
680
169
      return;
681
49
    case '/':
682
49
      TokenPtr++;
683
49
      if (TokenPtr != CommentEnd && *TokenPtr == '>') {
684
47
        TokenPtr++;
685
47
        formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
686
47
      } else
687
2
        formTextToken(T, TokenPtr);
688
689
49
      State = LS_Normal;
690
49
      return;
691
394
    }
692
394
  }
693
694
  // Now look ahead and return to normal state if we don't see any HTML tokens
695
  // ahead.
696
290
  BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
697
290
  if (BufferPtr == CommentEnd) {
698
23
    State = LS_Normal;
699
23
    return;
700
23
  }
701
702
267
  C = *BufferPtr;
703
267
  if (!isHTMLIdentifierStartingCharacter(C) &&
704
267
      
C != '='255
&&
C != '\"'155
&&
C != '\''84
&&
C != '>'79
&&
C != '/'25
) {
705
9
    State = LS_Normal;
706
9
    return;
707
9
  }
708
267
}
709
710
1.48k
void Lexer::setupAndLexHTMLEndTag(Token &T) {
711
1.48k
  assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
712
713
1.48k
  const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
714
1.48k
  const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
715
1.48k
  StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
716
1.48k
  if (!isHTMLTagName(Name)) {
717
1.36k
    formTextToken(T, TagNameEnd);
718
1.36k
    return;
719
1.36k
  }
720
721
120
  const char *End = skipWhitespace(TagNameEnd, CommentEnd);
722
723
120
  formTokenWithChars(T, End, tok::html_end_tag);
724
120
  T.setHTMLTagEndName(Name);
725
726
120
  if (BufferPtr != CommentEnd && 
*BufferPtr == '>'118
)
727
118
    State = LS_HTMLEndTag;
728
120
}
729
730
118
void Lexer::lexHTMLEndTag(Token &T) {
731
118
  assert(BufferPtr != CommentEnd && *BufferPtr == '>');
732
733
118
  formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
734
118
  State = LS_Normal;
735
118
}
736
737
Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
738
             const CommandTraits &Traits, SourceLocation FileLoc,
739
             const char *BufferStart, const char *BufferEnd, bool ParseCommands)
740
    : Allocator(Allocator), Diags(Diags), Traits(Traits),
741
      BufferStart(BufferStart), BufferEnd(BufferEnd), BufferPtr(BufferStart),
742
      FileLoc(FileLoc), ParseCommands(ParseCommands),
743
4.73k
      CommentState(LCS_BeforeComment), State(LS_Normal) {}
744
745
102k
void Lexer::lex(Token &T) {
746
166k
again:
747
166k
  switch (CommentState) {
748
39.9k
  case LCS_BeforeComment:
749
39.9k
    if (BufferPtr == BufferEnd) {
750
7.54k
      formTokenWithChars(T, BufferPtr, tok::eof);
751
7.54k
      return;
752
7.54k
    }
753
754
32.3k
    assert(*BufferPtr == '/');
755
32.3k
    BufferPtr++; // Skip first slash.
756
32.3k
    switch(*BufferPtr) {
757
31.6k
    case '/': { // BCPL comment.
758
31.6k
      BufferPtr++; // Skip second slash.
759
760
31.6k
      if (BufferPtr != BufferEnd) {
761
        // Skip Doxygen magic marker, if it is present.
762
        // It might be missing because of a typo //< or /*<, or because we
763
        // merged this non-Doxygen comment into a bunch of Doxygen comments
764
        // around it: /** ... */ /* ... */ /** ... */
765
31.6k
        const char C = *BufferPtr;
766
31.6k
        if (C == '/' || 
C == '!'303
)
767
31.3k
          BufferPtr++;
768
31.6k
      }
769
770
      // Skip less-than symbol that marks trailing comments.
771
      // Skip it even if the comment is not a Doxygen one, because //< and /*<
772
      // are frequent typos.
773
31.6k
      if (BufferPtr != BufferEnd && 
*BufferPtr == '<'31.5k
)
774
138
        BufferPtr++;
775
776
31.6k
      CommentState = LCS_InsideBCPLComment;
777
31.6k
      if (State != LS_VerbatimBlockBody && 
State != LS_VerbatimBlockFirstLine29.9k
)
778
29.8k
        State = LS_Normal;
779
31.6k
      CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
780
31.6k
      goto again;
781
0
    }
782
771
    case '*': { // C comment.
783
771
      BufferPtr++; // Skip star.
784
785
      // Skip Doxygen magic marker.
786
771
      const char C = *BufferPtr;
787
771
      if ((C == '*' && 
*(BufferPtr + 1) != '/'620
) ||
C == '!'152
)
788
746
        BufferPtr++;
789
790
      // Skip less-than symbol that marks trailing comments.
791
771
      if (BufferPtr != BufferEnd && *BufferPtr == '<')
792
40
        BufferPtr++;
793
794
771
      CommentState = LCS_InsideCComment;
795
771
      State = LS_Normal;
796
771
      CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
797
771
      goto again;
798
0
    }
799
0
    default:
800
0
      llvm_unreachable("second character of comment should be '/' or '*'");
801
32.3k
    }
802
803
32.3k
  case LCS_BetweenComments: {
804
    // Consecutive comments are extracted only if there is only whitespace
805
    // between them.  So we can search for the start of the next comment.
806
32.3k
    const char *EndWhitespace = BufferPtr;
807
60.2k
    while(EndWhitespace != BufferEnd && 
*EndWhitespace != '/'55.6k
)
808
27.9k
      EndWhitespace++;
809
810
    // Turn any whitespace between comments (and there is only whitespace
811
    // between them -- guaranteed by comment extraction) into a newline.  We
812
    // have two newlines between C comments in total (first one was synthesized
813
    // after a comment).
814
32.3k
    formTokenWithChars(T, EndWhitespace, tok::newline);
815
816
32.3k
    CommentState = LCS_BeforeComment;
817
32.3k
    break;
818
32.3k
  }
819
820
88.1k
  case LCS_InsideBCPLComment:
821
94.5k
  case LCS_InsideCComment:
822
94.5k
    if (BufferPtr != CommentEnd) {
823
62.2k
      lexCommentText(T);
824
62.2k
      break;
825
62.2k
    } else {
826
      // Skip C comment closing sequence.
827
32.3k
      if (CommentState == LCS_InsideCComment) {
828
768
        assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
829
768
        BufferPtr += 2;
830
768
        assert(BufferPtr <= BufferEnd);
831
832
        // Synthenize newline just after the C comment, regardless if there is
833
        // actually a newline.
834
768
        formTokenWithChars(T, BufferPtr, tok::newline);
835
836
768
        CommentState = LCS_BetweenComments;
837
768
        break;
838
31.5k
      } else {
839
        // Don't synthesized a newline after BCPL comment.
840
31.5k
        CommentState = LCS_BetweenComments;
841
31.5k
        goto again;
842
31.5k
      }
843
32.3k
    }
844
166k
  }
845
166k
}
846
847
StringRef Lexer::getSpelling(const Token &Tok,
848
155
                             const SourceManager &SourceMgr) const {
849
155
  SourceLocation Loc = Tok.getLocation();
850
155
  std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
851
852
155
  bool InvalidTemp = false;
853
155
  StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
854
155
  if (InvalidTemp)
855
0
    return StringRef();
856
857
155
  const char *Begin = File.data() + LocInfo.second;
858
155
  return StringRef(Begin, Tok.getLength());
859
155
}
860
861
} // end namespace comments
862
} // end namespace clang