Coverage Report

Created: 2022-01-18 06:27

/Users/buildslave/jenkins/workspace/coverage/llvm-project/clang/lib/AST/CommentLexer.cpp
Line
Count
Source (jump to first uncovered line)
1
//===--- CommentLexer.cpp -------------------------------------------------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
9
#include "clang/AST/CommentLexer.h"
10
#include "clang/AST/CommentCommandTraits.h"
11
#include "clang/AST/CommentDiagnostic.h"
12
#include "clang/Basic/CharInfo.h"
13
#include "llvm/ADT/StringExtras.h"
14
#include "llvm/ADT/StringSwitch.h"
15
#include "llvm/Support/ConvertUTF.h"
16
#include "llvm/Support/ErrorHandling.h"
17
18
namespace clang {
19
namespace comments {
20
21
0
void Token::dump(const Lexer &L, const SourceManager &SM) const {
22
0
  llvm::errs() << "comments::Token Kind=" << Kind << " ";
23
0
  Loc.print(llvm::errs(), SM);
24
0
  llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
25
0
}
26
27
341
static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
28
341
  return isLetter(C);
29
341
}
30
31
165
static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
32
165
  return isDigit(C);
33
165
}
34
35
101
static inline bool isHTMLHexCharacterReferenceCharacter(char C) {
36
101
  return isHexDigit(C);
37
101
}
38
39
/// Encode \p CodePoint as UTF-8 into storage obtained from \p Allocator.
///
/// \returns the encoded bytes, or an empty StringRef if
/// llvm::ConvertCodePointToUTF8 reports failure.
static inline StringRef convertCodePointToUTF8(
                                      llvm::BumpPtrAllocator &Allocator,
                                      unsigned CodePoint) {
  char *const Begin =
      Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
  // The converter moves Cursor forward; the distance from Begin is the length
  // of the encoded sequence.
  char *Cursor = Begin;
  if (!llvm::ConvertCodePointToUTF8(CodePoint, Cursor))
    return StringRef();
  return StringRef(Begin, Cursor - Begin);
}
49
50
namespace {
51
52
#include "clang/AST/CommentHTMLTags.inc"
53
#include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
54
55
} // end anonymous namespace
56
57
52
/// Map the name of an HTML named character reference (the part between '&'
/// and ';') to its replacement text.
///
/// A handful of very common names are matched directly; everything else is
/// answered by translateHTMLNamedCharacterReferenceToUTF8.
StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
  // The table lookup is performed unconditionally because its result is
  // passed to the switch as the default value.
  const StringRef FromTable = translateHTMLNamedCharacterReferenceToUTF8(Name);

  llvm::StringSwitch<StringRef> Common(Name);
  Common.Case("amp", "&");
  Common.Case("lt", "<");
  Common.Case("gt", ">");
  Common.Case("quot", "\"");
  Common.Case("apos", "\'");
  return Common.Default(FromTable);
}
68
69
17
/// Resolve the digits of a decimal HTML character reference (the "65" in
/// "&#65;") to the UTF-8 encoding of that code point.
///
/// \param Name the digit sequence only; every character must be a decimal
/// digit.
/// \returns the UTF-8 text allocated from the lexer's allocator, or an empty
/// StringRef if the value lies beyond the Unicode range or cannot be encoded.
StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
  // Largest Unicode code point.  Keeping the accumulator at or below this
  // bound before each step guarantees "CodePoint * 10 + 9" cannot wrap around.
  const unsigned MaxCodePoint = 0x10FFFF;

  unsigned CodePoint = 0;
  for (unsigned i = 0, e = Name.size(); i != e; ++i) {
    assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
    CodePoint *= 10;
    CodePoint += Name[i] - '0';
    // Reject out-of-range values right away instead of letting a long digit
    // string overflow and alias some unrelated, valid code point.
    if (CodePoint > MaxCodePoint)
      return StringRef();
  }
  return convertCodePointToUTF8(Allocator, CodePoint);
}
78
79
19
/// Resolve the digits of a hexadecimal HTML character reference (the "41" in
/// "&#x41;") to the UTF-8 encoding of that code point.
///
/// \param Name the digit sequence only; every character must be a hexadecimal
/// digit.
/// \returns the UTF-8 text allocated from the lexer's allocator, or an empty
/// StringRef if the value lies beyond the Unicode range or cannot be encoded.
StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
  // Largest Unicode code point.  Keeping the accumulator at or below this
  // bound before each step guarantees "CodePoint * 16 + 15" cannot wrap around.
  const unsigned MaxCodePoint = 0x10FFFF;

  unsigned CodePoint = 0;
  for (unsigned i = 0, e = Name.size(); i != e; ++i) {
    CodePoint *= 16;
    const char C = Name[i];
    assert(isHTMLHexCharacterReferenceCharacter(C));
    CodePoint += llvm::hexDigitValue(C);
    // Reject out-of-range values right away instead of letting a long digit
    // string overflow and alias some unrelated, valid code point.
    if (CodePoint > MaxCodePoint)
      return StringRef();
  }
  return convertCodePointToUTF8(Allocator, CodePoint);
}
89
90
1.82k
void Lexer::skipLineStartingDecorations() {
91
  // This function should be called only for C comments
92
1.82k
  assert(CommentState == LCS_InsideCComment);
93
94
1.82k
  if (BufferPtr == CommentEnd)
95
381
    return;
96
97
1.43k
  const char *NewBufferPtr = BufferPtr;
98
2.78k
  while (isHorizontalWhitespace(*NewBufferPtr))
99
1.49k
    if (++NewBufferPtr == CommentEnd)
100
154
      return;
101
1.28k
  if (*NewBufferPtr == '*')
102
799
    BufferPtr = NewBufferPtr + 1;
103
1.28k
}
104
105
namespace {
106
/// Returns pointer to the first newline character in the string.
107
326
const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
108
5.21k
  for ( ; BufferPtr != BufferEnd; 
++BufferPtr4.89k
) {
109
5.02k
    if (isVerticalWhitespace(*BufferPtr))
110
132
      return BufferPtr;
111
5.02k
  }
112
194
  return BufferEnd;
113
326
}
114
115
1.85k
/// Step over one line terminator starting at \p BufferPtr: "\n", "\r" or
/// "\r\n".
///
/// \returns the position after the terminator, or \p BufferPtr unchanged when
/// it already equals \p BufferEnd.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  const char First = *BufferPtr;
  ++BufferPtr;
  if (First == '\n')
    return BufferPtr;

  // Anything other than '\n' has to be a carriage return, optionally followed
  // by a line feed that belongs to the same terminator.
  assert(First == '\r');
  if (BufferPtr != BufferEnd && *BufferPtr == '\n')
    ++BufferPtr;
  return BufferPtr;
}
129
130
const char *skipNamedCharacterReference(const char *BufferPtr,
131
54
                                        const char *BufferEnd) {
132
243
  for ( ; BufferPtr != BufferEnd; 
++BufferPtr189
) {
133
242
    if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
134
53
      return BufferPtr;
135
242
  }
136
1
  return BufferEnd;
137
54
}
138
139
const char *skipDecimalCharacterReference(const char *BufferPtr,
140
19
                                          const char *BufferEnd) {
141
73
  for ( ; BufferPtr != BufferEnd; 
++BufferPtr54
) {
142
72
    if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
143
18
      return BufferPtr;
144
72
  }
145
1
  return BufferEnd;
146
19
}
147
148
const char *skipHexCharacterReference(const char *BufferPtr,
149
23
                                      const char *BufferEnd) {
150
65
  for ( ; BufferPtr != BufferEnd; 
++BufferPtr42
) {
151
63
    if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
152
21
      return BufferPtr;
153
63
  }
154
2
  return BufferEnd;
155
23
}
156
157
794
bool isHTMLIdentifierStartingCharacter(char C) {
158
794
  return isLetter(C);
159
794
}
160
161
1.41k
bool isHTMLIdentifierCharacter(char C) {
162
1.41k
  return isAlphanumeric(C);
163
1.41k
}
164
165
388
const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
166
1.07k
  for ( ; BufferPtr != BufferEnd; 
++BufferPtr683
) {
167
1.06k
    if (!isHTMLIdentifierCharacter(*BufferPtr))
168
377
      return BufferPtr;
169
1.06k
  }
170
11
  return BufferEnd;
171
388
}
172
173
/// Skip an HTML string delimited by single or double quotes.  \p BufferPtr
/// must point at the opening quote.  A quote character that is immediately
/// preceded by a backslash does not terminate the string.
///
/// \returns a pointer to the closing quote, or \p BufferEnd if the string is
/// not terminated.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Quote = *BufferPtr;
  assert(Quote == '\"' || Quote == '\'');

  // Start right after the opening quote, so looking one character back is
  // always within the string.
  for (const char *Cur = BufferPtr + 1; Cur != BufferEnd; ++Cur) {
    if (*Cur != Quote)
      continue;
    if (Cur[-1] != '\\')
      return Cur;
  }
  return BufferEnd;
}
190
191
652
const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
192
832
  for ( ; BufferPtr != BufferEnd; 
++BufferPtr180
) {
193
784
    if (!isWhitespace(*BufferPtr))
194
604
      return BufferPtr;
195
784
  }
196
48
  return BufferEnd;
197
652
}
198
199
56
bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
200
56
  return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
201
56
}
202
203
3.13k
bool isCommandNameStartCharacter(char C) {
204
3.13k
  return isLetter(C);
205
3.13k
}
206
207
19.2k
bool isCommandNameCharacter(char C) {
208
19.2k
  return isAlphanumeric(C);
209
19.2k
}
210
211
3.12k
const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
212
19.5k
  for ( ; BufferPtr != BufferEnd; 
++BufferPtr16.3k
) {
213
19.2k
    if (!isCommandNameCharacter(*BufferPtr))
214
2.89k
      return BufferPtr;
215
19.2k
  }
216
229
  return BufferEnd;
217
3.12k
}
218
219
/// Return the one past end pointer for BCPL comments.
220
/// Handles newlines escaped with backslash or trigraph for backslahs.
221
3.23k
const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
222
3.23k
  const char *CurPtr = BufferPtr;
223
3.24k
  while (CurPtr != BufferEnd) {
224
54.9k
    while (!isVerticalWhitespace(*CurPtr)) {
225
53.6k
      CurPtr++;
226
53.6k
      if (CurPtr == BufferEnd)
227
1.92k
        return BufferEnd;
228
53.6k
    }
229
    // We found a newline, check if it is escaped.
230
1.28k
    const char *EscapePtr = CurPtr - 1;
231
1.30k
    while(isHorizontalWhitespace(*EscapePtr))
232
12
      EscapePtr--;
233
234
1.28k
    if (*EscapePtr == '\\' ||
235
1.28k
        
(1.28k
EscapePtr - 2 >= BufferPtr1.28k
&&
EscapePtr[0] == '/'1.11k
&&
236
1.28k
         
EscapePtr[-1] == '?'3
&&
EscapePtr[-2] == '?'3
)) {
237
      // We found an escaped newline.
238
9
      CurPtr = skipNewline(CurPtr, BufferEnd);
239
9
    } else
240
1.27k
      return CurPtr; // Not an escaped newline.
241
1.28k
  }
242
26
  return BufferEnd;
243
3.23k
}
244
245
/// Return the one past end pointer for C comments.
246
/// Very dumb, does not handle escaped newlines or trigraphs.
247
718
const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
248
31.6k
  for ( ; BufferPtr != BufferEnd; 
++BufferPtr30.9k
) {
249
31.6k
    if (*BufferPtr == '*') {
250
1.53k
      assert(BufferPtr + 1 != BufferEnd);
251
1.53k
      if (*(BufferPtr + 1) == '/')
252
718
        return BufferPtr;
253
1.53k
    }
254
31.6k
  }
255
0
  llvm_unreachable("buffer end hit before '*/' was seen");
256
0
}
257
258
} // end anonymous namespace
259
260
void Lexer::formTokenWithChars(Token &Result, const char *TokEnd,
261
21.0k
                               tok::TokenKind Kind) {
262
21.0k
  const unsigned TokLen = TokEnd - BufferPtr;
263
21.0k
  Result.setLocation(getSourceLocation(BufferPtr));
264
21.0k
  Result.setKind(Kind);
265
21.0k
  Result.setLength(TokLen);
266
21.0k
#ifndef NDEBUG
267
21.0k
  Result.TextPtr = "<UNSET>";
268
21.0k
  Result.IntVal = 7;
269
21.0k
#endif
270
21.0k
  BufferPtr = TokEnd;
271
21.0k
}
272
273
6.49k
const char *Lexer::skipTextToken() {
274
6.49k
  const char *TokenPtr = BufferPtr;
275
6.49k
  assert(TokenPtr < CommentEnd);
276
6.49k
  StringRef TokStartSymbols = ParseCommands ? 
"\n\r\\@\"&<"6.46k
:
"\n\r"28
;
277
278
6.52k
again:
279
6.52k
  size_t End =
280
6.52k
      StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of(TokStartSymbols);
281
6.52k
  if (End == StringRef::npos)
282
2.74k
    return CommentEnd;
283
284
  // Doxygen doesn't recognize any commands in a one-line double quotation.
285
  // If we don't find an ending quotation mark, we pretend it never began.
286
3.78k
  if (*(TokenPtr + End) == '\"') {
287
31
    TokenPtr += End + 1;
288
31
    End = StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of("\n\r\"");
289
31
    if (End != StringRef::npos && 
*(TokenPtr + End) == '\"'17
)
290
17
      TokenPtr += End + 1;
291
31
    goto again;
292
31
  }
293
3.75k
  return TokenPtr + End;
294
3.78k
}
295
296
12.6k
/// Lex a single token from the body of the current comment into \p T.
///
/// Requires the lexer to be inside a BCPL or C comment with BufferPtr before
/// CommentEnd.  When command parsing is disabled only newline and text tokens
/// are produced.  Otherwise the current State dispatches to the verbatim/HTML
/// sub-lexers, and in LS_Normal the first character selects between a command,
/// an HTML character reference, an HTML tag and ordinary text.
void Lexer::lexCommentText(Token &T) {
  assert(CommentState == LCS_InsideBCPLComment ||
         CommentState == LCS_InsideCComment);

  // Lexes anything that is not a command: a newline or a run of text.
  auto HandleNonCommandToken = [&]() -> void {
    assert(State == LS_Normal);

    const char *const Start = BufferPtr;
    assert(Start < CommentEnd);
    if (*Start != '\n' && *Start != '\r') {
      formTextToken(T, skipTextToken());
      return;
    }

    formTokenWithChars(T, skipNewline(Start, CommentEnd), tok::newline);

    // In a C comment the following line may start with a decorative '*'.
    if (CommentState == LCS_InsideCComment)
      skipLineStartingDecorations();
  };

  if (!ParseCommands) {
    HandleNonCommandToken();
    return;
  }

  // Any state other than LS_Normal is handled by a dedicated sub-lexer.
  switch (State) {
  case LS_Normal:
    break;
  case LS_VerbatimBlockFirstLine:
    return lexVerbatimBlockFirstLine(T);
  case LS_VerbatimBlockBody:
    return lexVerbatimBlockBody(T);
  case LS_VerbatimLineText:
    return lexVerbatimLineText(T);
  case LS_HTMLStartTag:
    return lexHTMLStartTag(T);
  case LS_HTMLEndTag:
    return lexHTMLEndTag(T);
  }

  assert(State == LS_Normal);
  const char *TokenPtr = BufferPtr;
  assert(TokenPtr < CommentEnd);
  const char First = *TokenPtr;

  if (First == '&') {
    lexHTMLCharacterReference(T);
    return;
  }

  if (First == '<') {
    ++TokenPtr;
    if (TokenPtr == CommentEnd) {
      formTextToken(T, TokenPtr);
      return;
    }
    const char Next = *TokenPtr;
    if (isHTMLIdentifierStartingCharacter(Next))
      setupAndLexHTMLStartTag(T);
    else if (Next == '/')
      setupAndLexHTMLEndTag(T);
    else
      formTextToken(T, TokenPtr);
    return;
  }

  if (First != '\\' && First != '@') {
    HandleNonCommandToken();
    return;
  }

  // Commands that start with a backslash and commands that start with 'at'
  // have equivalent semantics.  But we keep information about the exact
  // syntax in AST for comments.
  const tok::TokenKind CommandKind =
      (First == '@') ? tok::at_command : tok::backslash_command;
  ++TokenPtr;
  if (TokenPtr == CommentEnd) {
    formTextToken(T, TokenPtr);
    return;
  }

  const char C = *TokenPtr;
  switch (C) {
  case '\\': case '@': case '&': case '$':
  case '#':  case '<': case '>': case '%':
  case '\"': case '.': case ':': {
    // This is one of \\ \@ \& \$ etc escape sequences.
    ++TokenPtr;
    // "\::" is an escape sequence as well; consume the second colon.
    if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':')
      ++TokenPtr;

    // The token text is the escaped character(s) without the marker.
    const char *const TextBegin = BufferPtr + 1;
    StringRef UnescapedText(TextBegin, TokenPtr - TextBegin);
    formTokenWithChars(T, TokenPtr, tok::text);
    T.setText(UnescapedText);
    return;
  }
  default:
    break;
  }

  // Don't make zero-length commands.
  if (!isCommandNameStartCharacter(*TokenPtr)) {
    formTextToken(T, TokenPtr);
    return;
  }

  const char *const NameBegin = BufferPtr + 1;
  TokenPtr = skipCommandName(TokenPtr, CommentEnd);
  unsigned Length = TokenPtr - NameBegin;

  // Hardcoded support for lexing LaTeX formula commands
  // \f$ \f( \f) \f[ \f] \f{ \f} as a single command.
  if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
    switch (*TokenPtr) {
    case '$': case '(': case ')': case '[': case ']': case '{': case '}':
      ++TokenPtr;
      ++Length;
      break;
    default:
      break;
    }
  }

  StringRef CommandName(NameBegin, Length);

  const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
  if (!Info) {
    Info = Traits.getTypoCorrectCommandInfo(CommandName);
    if (!Info) {
      // Neither a known command nor a correctable typo.
      formTokenWithChars(T, TokenPtr, tok::unknown_command);
      T.setUnknownCommandName(CommandName);
      Diag(T.getLocation(), diag::warn_unknown_comment_command_name)
          << SourceRange(T.getLocation(), T.getEndLocation());
      return;
    }

    // Diagnose the typo with a fix-it and continue lexing as if the corrected
    // command had been written.
    StringRef CorrectedName = Info->Name;
    SourceLocation Loc = getSourceLocation(BufferPtr);
    SourceLocation EndLoc = getSourceLocation(TokenPtr);
    SourceRange FullRange = SourceRange(Loc, EndLoc);
    SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc);
    Diag(Loc, diag::warn_correct_comment_command_name)
        << FullRange << CommandName << CorrectedName
        << FixItHint::CreateReplacement(CommandRange, CorrectedName);
  }

  if (Info->IsVerbatimBlockCommand) {
    // *BufferPtr is still the marker character ('\\' or '@') here.
    setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
    return;
  }
  if (Info->IsVerbatimLineCommand) {
    setupAndLexVerbatimLine(T, TokenPtr, Info);
    return;
  }

  formTokenWithChars(T, TokenPtr, CommandKind);
  T.setCommandID(Info->getID());
}
458
459
void Lexer::setupAndLexVerbatimBlock(Token &T,
460
                                     const char *TextBegin,
461
76
                                     char Marker, const CommandInfo *Info) {
462
76
  assert(Info->IsVerbatimBlockCommand);
463
464
0
  VerbatimBlockEndCommandName.clear();
465
76
  VerbatimBlockEndCommandName.append(Marker == '\\' ? 
"\\"70
:
"@"6
);
466
76
  VerbatimBlockEndCommandName.append(Info->EndCommandName);
467
468
76
  formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
469
76
  T.setVerbatimBlockID(Info->getID());
470
471
  // If there is a newline following the verbatim opening command, skip the
472
  // newline so that we don't create an tok::verbatim_block_line with empty
473
  // text content.
474
76
  if (BufferPtr != CommentEnd &&
475
76
      
isVerticalWhitespace(*BufferPtr)58
) {
476
10
    BufferPtr = skipNewline(BufferPtr, CommentEnd);
477
10
    State = LS_VerbatimBlockBody;
478
10
    return;
479
10
  }
480
481
66
  State = LS_VerbatimBlockFirstLine;
482
66
}
483
484
154
/// Lex one token from the current line of a verbatim block: either a
/// verbatim_block_line with the line's text or, when the end command is the
/// next thing on the line, a verbatim_block_end token.
void Lexer::lexVerbatimBlockFirstLine(Token &T) {
  for (;;) {
    assert(BufferPtr < CommentEnd);

    // FIXME: It would be better to scan the text once, finding either the
    // block end command or newline.
    //
    // Extract current line.
    const char *const Newline = findNewline(BufferPtr, CommentEnd);
    const StringRef Line(BufferPtr, Newline - BufferPtr);

    // Look for end command in current line.
    const size_t Pos = Line.find(VerbatimBlockEndCommandName);

    if (Pos == 0) {
      // The end command is right here.  Its name is the spelling without the
      // leading marker character.
      const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
      StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
      formTokenWithChars(T, End, tok::verbatim_block_end);
      T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
      State = LS_Normal;
      return;
    }

    const char *TextEnd;
    const char *NextLine;
    if (Pos == StringRef::npos) {
      // Current line is completely verbatim; the token swallows the newline.
      TextEnd = Newline;
      NextLine = skipNewline(Newline, CommentEnd);
    } else {
      // There is some text, followed by end command.  Extract text first.
      TextEnd = BufferPtr + Pos;
      NextLine = TextEnd;
      // If there is only whitespace before end command, skip whitespace and
      // lex the end command on the next iteration.
      if (isWhitespace(BufferPtr, TextEnd)) {
        BufferPtr = TextEnd;
        continue;
      }
    }

    StringRef Text(BufferPtr, TextEnd - BufferPtr);
    formTokenWithChars(T, NextLine, tok::verbatim_block_line);
    T.setVerbatimBlockText(Text);

    State = LS_VerbatimBlockBody;
    return;
  }
}
528
529
94
/// Lex the next line inside a verbatim block.
///
/// In a C comment the leading line decoration is skipped first.  If the
/// comment ends here, an empty verbatim_block_line token is produced;
/// otherwise the line is lexed by lexVerbatimBlockFirstLine.
void Lexer::lexVerbatimBlockBody(Token &T) {
  assert(State == LS_VerbatimBlockBody);

  if (CommentState == LCS_InsideCComment)
    skipLineStartingDecorations();

  if (BufferPtr != CommentEnd) {
    lexVerbatimBlockFirstLine(T);
    return;
  }

  // Nothing is left in this comment.
  formTokenWithChars(T, BufferPtr, tok::verbatim_block_line);
  T.setVerbatimBlockText("");
}
543
544
/// Emit the verbatim_line_name token for a verbatim line command and arrange
/// for the rest of the line to be lexed as its text.
///
/// \param T receives the name token, which ends at \p TextBegin.
/// \param TextBegin first character after the command name.
/// \param Info description of the command; must be a verbatim line command.
void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
                                    const CommandInfo *Info) {
  assert(Info->IsVerbatimLineCommand);

  formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
  T.setVerbatimLineID(Info->getID());

  // The next token is the remainder of this line.
  State = LS_VerbatimLineText;
}
552
553
153
/// Lex everything up to (but not including) the next newline, or up to
/// CommentEnd, as a verbatim_line_text token and return to normal lexing.
void Lexer::lexVerbatimLineText(Token &T) {
  assert(State == LS_VerbatimLineText);

  // The text is the remainder of the current line.
  const char *const LineEnd = findNewline(BufferPtr, CommentEnd);
  const StringRef Text(BufferPtr, LineEnd - BufferPtr);
  formTokenWithChars(T, LineEnd, tok::verbatim_line_text);
  T.setVerbatimLineText(Text);

  State = LS_Normal;
}
564
565
100
/// Lex an HTML character reference starting at the '&' under BufferPtr.
///
/// Named ("&amp;"), decimal ("&#65;") and hexadecimal ("&#x41;") forms are
/// recognized.  A well-formed reference that resolves to non-empty text
/// becomes a text token carrying that text; everything else scanned so far is
/// handed to formTextToken unchanged.
void Lexer::lexHTMLCharacterReference(Token &T) {
  enum ReferenceKind { RK_Named, RK_Decimal, RK_Hex };

  const char *TokenPtr = BufferPtr;
  assert(*TokenPtr == '&');
  ++TokenPtr;
  if (TokenPtr == CommentEnd) {
    formTextToken(T, TokenPtr);
    return;
  }

  ReferenceKind RefKind;
  const char *NamePtr;
  if (isHTMLNamedCharacterReferenceCharacter(*TokenPtr)) {
    RefKind = RK_Named;
    NamePtr = TokenPtr;
    TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
  } else if (*TokenPtr != '#') {
    formTextToken(T, TokenPtr);
    return;
  } else {
    // Numeric reference: "&#" followed by digits or by 'x'/'X' and hex digits.
    ++TokenPtr;
    if (TokenPtr == CommentEnd) {
      formTextToken(T, TokenPtr);
      return;
    }
    const char C = *TokenPtr;
    if (isHTMLDecimalCharacterReferenceCharacter(C)) {
      RefKind = RK_Decimal;
      NamePtr = TokenPtr;
      TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
    } else if (C == 'x' || C == 'X') {
      RefKind = RK_Hex;
      ++TokenPtr;
      NamePtr = TokenPtr;
      TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
    } else {
      formTextToken(T, TokenPtr);
      return;
    }
  }

  // The name must be non-empty and must be terminated by a semicolon.
  const bool IsTerminated =
      NamePtr != TokenPtr && TokenPtr != CommentEnd && *TokenPtr == ';';
  if (!IsTerminated) {
    formTextToken(T, TokenPtr);
    return;
  }

  const StringRef Name(NamePtr, TokenPtr - NamePtr);
  ++TokenPtr; // Skip semicolon.

  StringRef Resolved;
  switch (RefKind) {
  case RK_Named:
    Resolved = resolveHTMLNamedCharacterReference(Name);
    break;
  case RK_Decimal:
    Resolved = resolveHTMLDecimalCharacterReference(Name);
    break;
  case RK_Hex:
    Resolved = resolveHTMLHexCharacterReference(Name);
    break;
  }

  if (Resolved.empty()) {
    // The reference could not be resolved; keep its spelling as plain text.
    formTextToken(T, TokenPtr);
    return;
  }
  formTokenWithChars(T, TokenPtr, tok::text);
  T.setText(Resolved);
}
626
627
203
/// Lex "<name" as an html_start_tag token and, if the tag appears to
/// continue, switch to LS_HTMLStartTag so that attributes and the closing
/// '>' or "/>" are lexed as HTML tokens.
///
/// BufferPtr must point at '<' followed by an identifier-start character.
/// If the identifier is not a known HTML tag name, the characters scanned so
/// far are handed to formTextToken instead.
void Lexer::setupAndLexHTMLStartTag(Token &T) {
  assert(BufferPtr[0] == '<' &&
         isHTMLIdentifierStartingCharacter(BufferPtr[1]));
  const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
  StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
  if (!isHTMLTagName(Name)) {
    formTextToken(T, TagNameEnd);
    return;
  }

  formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
  T.setHTMLTagStartName(Name);

  BufferPtr = skipWhitespace(BufferPtr, CommentEnd);

  // Check for the end of the comment before looking at the next character:
  // CommentEnd is a one-past-the-end position and must not be dereferenced.
  if (BufferPtr == CommentEnd)
    return;

  // Stay in HTML mode only if what follows can continue the tag.
  const char C = *BufferPtr;
  if (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C))
    State = LS_HTMLStartTag;
}
647
648
356
/// Lex one token inside an HTML start tag: an attribute name, '=', a quoted
/// attribute value, the closing '>' or the self-closing "/>".
///
/// After an identifier, '=' or quoted string the lexer looks ahead (skipping
/// whitespace) and stays in LS_HTMLStartTag only if another HTML token can
/// follow; '>' and '/' always return the lexer to LS_Normal.
void Lexer::lexHTMLStartTag(Token &T) {
  assert(State == LS_HTMLStartTag);

  const char *TokenPtr = BufferPtr;
  char C = *TokenPtr;
  if (isHTMLIdentifierCharacter(C)) {
    TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
    StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
    formTokenWithChars(T, TokenPtr, tok::html_ident);
    T.setHTMLIdent(Ident);
  } else {
    switch (C) {
    case '=':
      TokenPtr++;
      formTokenWithChars(T, TokenPtr, tok::html_equals);
      break;
    case '\"':
    case '\'': {
      const char *OpenQuote = TokenPtr;
      TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
      const char *ClosingQuote = TokenPtr;
      if (TokenPtr != CommentEnd) // Skip closing quote.
        TokenPtr++;
      formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
      // The token's string value excludes the quotes themselves.
      T.setHTMLQuotedString(StringRef(OpenQuote + 1,
                                      ClosingQuote - (OpenQuote + 1)));
      break;
    }
    case '>':
      TokenPtr++;
      formTokenWithChars(T, TokenPtr, tok::html_greater);
      State = LS_Normal;
      return;
    case '/':
      TokenPtr++;
      if (TokenPtr != CommentEnd && *TokenPtr == '>') {
        TokenPtr++;
        formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
      } else
        formTextToken(T, TokenPtr);

      State = LS_Normal;
      return;
    }
  }

  // Now look ahead and return to normal state if we don't see any HTML tokens
  // ahead.
  BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
  if (BufferPtr == CommentEnd) {
    State = LS_Normal;
    return;
  }

  // '/' has to be accepted here as well: otherwise the "/>" that closes a tag
  // with attributes, e.g. <img src="x" />, would be lexed as plain text
  // instead of tok::html_slash_greater.
  C = *BufferPtr;
  if (!isHTMLIdentifierStartingCharacter(C) &&
      C != '=' && C != '\"' && C != '\'' && C != '>' && C != '/')
    State = LS_Normal;
}
709
710
99
/// Lex "</ name" (with optional whitespace around the name) as an
/// html_end_tag token and, if a '>' follows, switch to LS_HTMLEndTag so that
/// it is lexed next.
///
/// BufferPtr must point at "</".  If the identifier is not a known HTML tag
/// name, the characters scanned so far are handed to formTextToken instead.
void Lexer::setupAndLexHTMLEndTag(Token &T) {
  assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');

  const char *const NameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
  const char *const NameEnd = skipHTMLIdentifier(NameBegin, CommentEnd);
  const StringRef Name(NameBegin, NameEnd - NameBegin);
  if (!isHTMLTagName(Name)) {
    formTextToken(T, NameEnd);
    return;
  }

  // Whitespace after the name is part of the end tag token.
  formTokenWithChars(T, skipWhitespace(NameEnd, CommentEnd),
                     tok::html_end_tag);
  T.setHTMLTagEndName(Name);

  const bool GreaterFollows = BufferPtr != CommentEnd && *BufferPtr == '>';
  if (GreaterFollows)
    State = LS_HTMLEndTag;
}
729
730
94
/// Lex the '>' that closes an HTML end tag and return to normal lexing.
/// BufferPtr must point at that '>'.
void Lexer::lexHTMLEndTag(Token &T) {
  assert(BufferPtr != CommentEnd && *BufferPtr == '>');

  const char *const AfterGreater = BufferPtr + 1;
  formTokenWithChars(T, AfterGreater, tok::html_greater);
  State = LS_Normal;
}
736
737
/// Create a lexer over the text in [BufferStart, BufferEnd).
///
/// The lexer starts before the first comment (LCS_BeforeComment) in the
/// normal state, positioned at \p BufferStart.  \p ParseCommands selects
/// whether commands are recognized or only text and newline tokens are
/// produced.
Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
             const CommandTraits &Traits, SourceLocation FileLoc,
             const char *BufferStart, const char *BufferEnd, bool ParseCommands)
    : Allocator(Allocator),
      Diags(Diags),
      Traits(Traits),
      BufferStart(BufferStart),
      BufferEnd(BufferEnd),
      BufferPtr(BufferStart),
      FileLoc(FileLoc),
      ParseCommands(ParseCommands),
      CommentState(LCS_BeforeComment),
      State(LS_Normal) {
}
744
745
21.0k
/// Lex the next token into \p T.
///
/// The lexer walks a buffer holding one or more adjacent comments.  Depending
/// on CommentState it recognizes the opening of the next comment, lexes the
/// comment body, or produces the newline tokens that separate comments.  At
/// the end of the buffer a tok::eof token is produced.
void Lexer::lex(Token &T) {
  // Transitions that do not produce a token "continue" the loop; every path
  // that produces a token leaves the function.
  for (;;) {
    switch (CommentState) {
    case LCS_BeforeComment: {
      if (BufferPtr == BufferEnd) {
        formTokenWithChars(T, BufferPtr, tok::eof);
        return;
      }

      assert(*BufferPtr == '/');
      ++BufferPtr; // Skip first slash.

      const char Second = *BufferPtr;
      if (Second == '/') { // BCPL comment.
        ++BufferPtr; // Skip second slash.

        // Skip Doxygen magic marker, if it is present.
        // It might be missing because of a typo //< or /*<, or because we
        // merged this non-Doxygen comment into a bunch of Doxygen comments
        // around it: /** ... */ /* ... */ /** ... */
        if (BufferPtr != BufferEnd &&
            (*BufferPtr == '/' || *BufferPtr == '!'))
          ++BufferPtr;

        // Skip less-than symbol that marks trailing comments.
        // Skip it even if the comment is not a Doxygen one, because //< and
        // /*< are frequent typos.
        if (BufferPtr != BufferEnd && *BufferPtr == '<')
          ++BufferPtr;

        CommentState = LCS_InsideBCPLComment;
        // A verbatim block keeps going across consecutive BCPL comments.
        if (State != LS_VerbatimBlockBody &&
            State != LS_VerbatimBlockFirstLine)
          State = LS_Normal;
        CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
        continue;
      }

      if (Second == '*') { // C comment.
        ++BufferPtr; // Skip star.

        // Skip Doxygen magic marker.  A '*' directly followed by '/' is the
        // end of the comment, not a marker.
        const char Marker = *BufferPtr;
        if ((Marker == '*' && BufferPtr[1] != '/') || Marker == '!')
          ++BufferPtr;

        // Skip less-than symbol that marks trailing comments.
        if (BufferPtr != BufferEnd && *BufferPtr == '<')
          ++BufferPtr;

        CommentState = LCS_InsideCComment;
        State = LS_Normal;
        CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
        continue;
      }

      llvm_unreachable("second character of comment should be '/' or '*'");
    }

    case LCS_BetweenComments: {
      // Consecutive comments are extracted only if there is only whitespace
      // between them.  So we can search for the start of the next comment.
      const char *NextComment = BufferPtr;
      while (NextComment != BufferEnd && *NextComment != '/')
        ++NextComment;

      // Turn any whitespace between comments (and there is only whitespace
      // between them -- guaranteed by comment extraction) into a newline.  We
      // have two newlines between C comments in total (first one was
      // synthesized after a comment).
      formTokenWithChars(T, NextComment, tok::newline);

      CommentState = LCS_BeforeComment;
      return;
    }

    case LCS_InsideBCPLComment:
    case LCS_InsideCComment:
      if (BufferPtr != CommentEnd) {
        lexCommentText(T);
        return;
      }

      if (CommentState != LCS_InsideCComment) {
        // Don't synthesize a newline after a BCPL comment.
        CommentState = LCS_BetweenComments;
        continue;
      }

      // Skip C comment closing sequence.
      assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
      BufferPtr += 2;
      assert(BufferPtr <= BufferEnd);

      // Synthesize a newline just after the C comment, regardless of whether
      // there is actually a newline.
      formTokenWithChars(T, BufferPtr, tok::newline);

      CommentState = LCS_BetweenComments;
      return;
    }

    // No case applied: nothing to lex.
    return;
  }
}
846
847
/// Return the characters of \p Tok as they appear in the source buffer.
///
/// The token's location is decomposed through \p SourceMgr into a file and an
/// offset, and getLength() characters are taken from that file's buffer
/// starting at the offset.
///
/// \returns the spelling, or an empty StringRef if the buffer data is
/// reported as invalid.
StringRef Lexer::getSpelling(const Token &Tok,
                             const SourceManager &SourceMgr) const {
  const std::pair<FileID, unsigned> Decomposed =
      SourceMgr.getDecomposedLoc(Tok.getLocation());
  const FileID File = Decomposed.first;
  const unsigned Offset = Decomposed.second;

  bool Invalid = false;
  const StringRef Buffer = SourceMgr.getBufferData(File, &Invalid);
  if (Invalid)
    return StringRef();

  return StringRef(Buffer.data() + Offset, Tok.getLength());
}
860
861
} // end namespace comments
862
} // end namespace clang