/Users/buildslave/jenkins/workspace/coverage/llvm-project/clang/lib/AST/CommentLexer.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | //===--- CommentLexer.cpp -------------------------------------------------===// |
2 | | // |
3 | | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | | // See https://llvm.org/LICENSE.txt for license information. |
5 | | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | | // |
7 | | //===----------------------------------------------------------------------===// |
8 | | |
9 | | #include "clang/AST/CommentLexer.h" |
10 | | #include "clang/AST/CommentCommandTraits.h" |
11 | | #include "clang/AST/CommentDiagnostic.h" |
12 | | #include "clang/Basic/CharInfo.h" |
13 | | #include "llvm/ADT/StringExtras.h" |
14 | | #include "llvm/ADT/StringSwitch.h" |
15 | | #include "llvm/Support/ConvertUTF.h" |
16 | | #include "llvm/Support/ErrorHandling.h" |
17 | | |
18 | | namespace clang { |
19 | | namespace comments { |
20 | | |
21 | 0 | void Token::dump(const Lexer &L, const SourceManager &SM) const { |
22 | 0 | llvm::errs() << "comments::Token Kind=" << Kind << " "; |
23 | 0 | Loc.print(llvm::errs(), SM); |
24 | 0 | llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n"; |
25 | 0 | } |
26 | | |
27 | 341 | static inline bool isHTMLNamedCharacterReferenceCharacter(char C) { |
28 | 341 | return isLetter(C); |
29 | 341 | } |
30 | | |
31 | 165 | static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) { |
32 | 165 | return isDigit(C); |
33 | 165 | } |
34 | | |
35 | 101 | static inline bool isHTMLHexCharacterReferenceCharacter(char C) { |
36 | 101 | return isHexDigit(C); |
37 | 101 | } |
38 | | |
39 | | static inline StringRef convertCodePointToUTF8( |
40 | | llvm::BumpPtrAllocator &Allocator, |
41 | 36 | unsigned CodePoint) { |
42 | 36 | char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT); |
43 | 36 | char *ResolvedPtr = Resolved; |
44 | 36 | if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr)) |
45 | 36 | return StringRef(Resolved, ResolvedPtr - Resolved); |
46 | 0 | else |
47 | 0 | return StringRef(); |
48 | 36 | } |
49 | | |
50 | | namespace { |
51 | | |
52 | | #include "clang/AST/CommentHTMLTags.inc" |
53 | | #include "clang/AST/CommentHTMLNamedCharacterReferences.inc" |
54 | | |
55 | | } // end anonymous namespace |
56 | | |
57 | 52 | StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const { |
58 | | // Fast path, first check a few most widely used named character references. |
59 | 52 | return llvm::StringSwitch<StringRef>(Name) |
60 | 52 | .Case("amp", "&") |
61 | 52 | .Case("lt", "<") |
62 | 52 | .Case("gt", ">") |
63 | 52 | .Case("quot", "\"") |
64 | 52 | .Case("apos", "\'") |
65 | | // Slow path. |
66 | 52 | .Default(translateHTMLNamedCharacterReferenceToUTF8(Name)); |
67 | 52 | } |
68 | | |
69 | 17 | StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const { |
70 | 17 | unsigned CodePoint = 0; |
71 | 67 | for (unsigned i = 0, e = Name.size(); i != e; ++i50 ) { |
72 | 50 | assert(isHTMLDecimalCharacterReferenceCharacter(Name[i])); |
73 | 50 | CodePoint *= 10; |
74 | 50 | CodePoint += Name[i] - '0'; |
75 | 50 | } |
76 | 17 | return convertCodePointToUTF8(Allocator, CodePoint); |
77 | 17 | } |
78 | | |
79 | 19 | StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const { |
80 | 19 | unsigned CodePoint = 0; |
81 | 57 | for (unsigned i = 0, e = Name.size(); i != e; ++i38 ) { |
82 | 38 | CodePoint *= 16; |
83 | 38 | const char C = Name[i]; |
84 | 38 | assert(isHTMLHexCharacterReferenceCharacter(C)); |
85 | 38 | CodePoint += llvm::hexDigitValue(C); |
86 | 38 | } |
87 | 19 | return convertCodePointToUTF8(Allocator, CodePoint); |
88 | 19 | } |
89 | | |
90 | 1.73k | void Lexer::skipLineStartingDecorations() { |
91 | | // This function should be called only for C comments |
92 | 1.73k | assert(CommentState == LCS_InsideCComment); |
93 | | |
94 | 1.73k | if (BufferPtr == CommentEnd) |
95 | 381 | return; |
96 | | |
97 | 1.35k | switch (*BufferPtr) { |
98 | 993 | case ' ': |
99 | 995 | case '\t': |
100 | 995 | case '\f': |
101 | 995 | case '\v': { |
102 | 995 | const char *NewBufferPtr = BufferPtr; |
103 | 995 | NewBufferPtr++; |
104 | 995 | if (NewBufferPtr == CommentEnd) |
105 | 122 | return; |
106 | | |
107 | 873 | char C = *NewBufferPtr; |
108 | 1.21k | while (isHorizontalWhitespace(C)) { |
109 | 364 | NewBufferPtr++; |
110 | 364 | if (NewBufferPtr == CommentEnd) |
111 | 18 | return; |
112 | 346 | C = *NewBufferPtr; |
113 | 346 | } |
114 | 855 | if (C == '*') |
115 | 716 | BufferPtr = NewBufferPtr + 1; |
116 | 855 | break; |
117 | 873 | } |
118 | 8 | case '*': |
119 | 8 | BufferPtr++; |
120 | 8 | break; |
121 | 1.35k | } |
122 | 1.35k | } |
123 | | |
124 | | namespace { |
125 | | /// Returns pointer to the first newline character in the string. |
126 | 286 | const char *findNewline(const char *BufferPtr, const char *BufferEnd) { |
127 | 4.93k | for ( ; BufferPtr != BufferEnd; ++BufferPtr4.64k ) { |
128 | 4.77k | if (isVerticalWhitespace(*BufferPtr)) |
129 | 126 | return BufferPtr; |
130 | 4.77k | } |
131 | 160 | return BufferEnd; |
132 | 286 | } |
133 | | |
/// Steps over one line terminator at \p BufferPtr: "\n", "\r", or "\r\n".
/// An empty range is returned unchanged. The caller must ensure the first
/// character is '\n' or '\r'.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  const char First = *BufferPtr;
  ++BufferPtr;
  if (First == '\n')
    return BufferPtr;

  assert(First == '\r');
  // A carriage return may be followed by a line feed belonging to the same
  // terminator.
  if (BufferPtr != BufferEnd && *BufferPtr == '\n')
    ++BufferPtr;
  return BufferPtr;
}
148 | | |
149 | | const char *skipNamedCharacterReference(const char *BufferPtr, |
150 | 54 | const char *BufferEnd) { |
151 | 243 | for ( ; BufferPtr != BufferEnd; ++BufferPtr189 ) { |
152 | 242 | if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr)) |
153 | 53 | return BufferPtr; |
154 | 242 | } |
155 | 1 | return BufferEnd; |
156 | 54 | } |
157 | | |
158 | | const char *skipDecimalCharacterReference(const char *BufferPtr, |
159 | 19 | const char *BufferEnd) { |
160 | 73 | for ( ; BufferPtr != BufferEnd; ++BufferPtr54 ) { |
161 | 72 | if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr)) |
162 | 18 | return BufferPtr; |
163 | 72 | } |
164 | 1 | return BufferEnd; |
165 | 19 | } |
166 | | |
167 | | const char *skipHexCharacterReference(const char *BufferPtr, |
168 | 23 | const char *BufferEnd) { |
169 | 65 | for ( ; BufferPtr != BufferEnd; ++BufferPtr42 ) { |
170 | 63 | if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr)) |
171 | 21 | return BufferPtr; |
172 | 63 | } |
173 | 2 | return BufferEnd; |
174 | 23 | } |
175 | | |
176 | 794 | bool isHTMLIdentifierStartingCharacter(char C) { |
177 | 794 | return isLetter(C); |
178 | 794 | } |
179 | | |
180 | 1.41k | bool isHTMLIdentifierCharacter(char C) { |
181 | 1.41k | return isAlphanumeric(C); |
182 | 1.41k | } |
183 | | |
184 | 388 | const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) { |
185 | 1.07k | for ( ; BufferPtr != BufferEnd; ++BufferPtr683 ) { |
186 | 1.06k | if (!isHTMLIdentifierCharacter(*BufferPtr)) |
187 | 377 | return BufferPtr; |
188 | 1.06k | } |
189 | 11 | return BufferEnd; |
190 | 388 | } |
191 | | |
/// Skip an HTML string quoted in single or double quotes. A quote character
/// directly preceded by a backslash does not terminate the string.
///
/// \p BufferPtr must point at the opening quote.
///
/// Returns pointer to closing quote, or \p BufferEnd if the string is not
/// terminated inside the range.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Delimiter = *BufferPtr;
  assert(Delimiter == '\"' || Delimiter == '\'');

  const char *Cursor = BufferPtr + 1;
  while (Cursor != BufferEnd) {
    if (*Cursor == Delimiter && Cursor[-1] != '\\')
      return Cursor;
    ++Cursor;
  }
  return BufferEnd;
}
209 | | |
210 | 635 | const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) { |
211 | 802 | for ( ; BufferPtr != BufferEnd; ++BufferPtr167 ) { |
212 | 754 | if (!isWhitespace(*BufferPtr)) |
213 | 587 | return BufferPtr; |
214 | 754 | } |
215 | 48 | return BufferEnd; |
216 | 635 | } |
217 | | |
218 | 39 | bool isWhitespace(const char *BufferPtr, const char *BufferEnd) { |
219 | 39 | return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd; |
220 | 39 | } |
221 | | |
222 | 3.06k | bool isCommandNameStartCharacter(char C) { |
223 | 3.06k | return isLetter(C); |
224 | 3.06k | } |
225 | | |
226 | 18.9k | bool isCommandNameCharacter(char C) { |
227 | 18.9k | return isAlphanumeric(C); |
228 | 18.9k | } |
229 | | |
230 | 3.05k | const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) { |
231 | 19.1k | for ( ; BufferPtr != BufferEnd; ++BufferPtr16.0k ) { |
232 | 18.9k | if (!isCommandNameCharacter(*BufferPtr)) |
233 | 2.83k | return BufferPtr; |
234 | 18.9k | } |
235 | 222 | return BufferEnd; |
236 | 3.05k | } |
237 | | |
238 | | /// Return the one past end pointer for BCPL comments. |
239 | | /// Handles newlines escaped with backslash or trigraph for backslahs. |
240 | 3.19k | const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) { |
241 | 3.19k | const char *CurPtr = BufferPtr; |
242 | 3.20k | while (CurPtr != BufferEnd) { |
243 | 54.0k | while (!isVerticalWhitespace(*CurPtr)) { |
244 | 52.7k | CurPtr++; |
245 | 52.7k | if (CurPtr == BufferEnd) |
246 | 1.91k | return BufferEnd; |
247 | 52.7k | } |
248 | | // We found a newline, check if it is escaped. |
249 | 1.26k | const char *EscapePtr = CurPtr - 1; |
250 | 1.27k | while(isHorizontalWhitespace(*EscapePtr)) |
251 | 12 | EscapePtr--; |
252 | | |
253 | 1.26k | if (*EscapePtr == '\\' || |
254 | 1.25k | (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/'1.09k && |
255 | 9 | EscapePtr[-1] == '?'3 && EscapePtr[-2] == '?'3 )) { |
256 | | // We found an escaped newline. |
257 | 9 | CurPtr = skipNewline(CurPtr, BufferEnd); |
258 | 9 | } else |
259 | 1.25k | return CurPtr; // Not an escaped newline. |
260 | 1.26k | } |
261 | 25 | return BufferEnd; |
262 | 3.19k | } |
263 | | |
264 | | /// Return the one past end pointer for C comments. |
265 | | /// Very dumb, does not handle escaped newlines or trigraphs. |
266 | 704 | const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) { |
267 | 30.2k | for ( ; BufferPtr != BufferEnd; ++BufferPtr29.5k ) { |
268 | 30.2k | if (*BufferPtr == '*') { |
269 | 1.44k | assert(BufferPtr + 1 != BufferEnd); |
270 | 1.44k | if (*(BufferPtr + 1) == '/') |
271 | 704 | return BufferPtr; |
272 | 1.44k | } |
273 | 30.2k | } |
274 | 704 | llvm_unreachable0 ("buffer end hit before '*/' was seen"); |
275 | 704 | } |
276 | | |
277 | | } // end anonymous namespace |
278 | | |
279 | | void Lexer::formTokenWithChars(Token &Result, const char *TokEnd, |
280 | 20.6k | tok::TokenKind Kind) { |
281 | 20.6k | const unsigned TokLen = TokEnd - BufferPtr; |
282 | 20.6k | Result.setLocation(getSourceLocation(BufferPtr)); |
283 | 20.6k | Result.setKind(Kind); |
284 | 20.6k | Result.setLength(TokLen); |
285 | 20.6k | #ifndef NDEBUG |
286 | 20.6k | Result.TextPtr = "<UNSET>"; |
287 | 20.6k | Result.IntVal = 7; |
288 | 20.6k | #endif |
289 | 20.6k | BufferPtr = TokEnd; |
290 | 20.6k | } |
291 | | |
/// Lexes one token from the body of the current comment into \p T.
///
/// Must be called while inside a BCPL or C comment with BufferPtr strictly
/// before CommentEnd.  When command parsing is disabled only text and newline
/// tokens are produced.  Otherwise the call is forwarded to the handler of
/// the current lexer state, or, in the normal state, dispatched on the first
/// character: '\\' and '@' start commands and escape sequences, '&' starts an
/// HTML character reference, '<' starts an HTML tag, and everything else is
/// text or a newline.
void Lexer::lexCommentText(Token &T) {
  assert(CommentState == LCS_InsideBCPLComment ||
         CommentState == LCS_InsideCComment);

  // Handles lexing non-command text, i.e. text and newline.
  auto HandleNonCommandToken = [&]() -> void {
    assert(State == LS_Normal);

    const char *TokenPtr = BufferPtr;
    assert(TokenPtr < CommentEnd);
    switch (*TokenPtr) {
      case '\n':
      case '\r':
          TokenPtr = skipNewline(TokenPtr, CommentEnd);
          formTokenWithChars(T, TokenPtr, tok::newline);

          // In a C comment the next line may begin with a " * " decoration.
          if (CommentState == LCS_InsideCComment)
            skipLineStartingDecorations();
          return;

      default: {
          // The text token extends up to the next character that could start
          // a different kind of token, or to the end of the comment.
          StringRef TokStartSymbols = ParseCommands ? "\n\r\\@&<" : "\n\r";
          size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr)
                           .find_first_of(TokStartSymbols);
          if (End != StringRef::npos)
            TokenPtr += End;
          else
            TokenPtr = CommentEnd;
          formTextToken(T, TokenPtr);
          return;
      }
    }
  };

  if (!ParseCommands)
    return HandleNonCommandToken();

  // Every state other than LS_Normal has a dedicated lexing routine.
  switch (State) {
  case LS_Normal:
    break;
  case LS_VerbatimBlockFirstLine:
    lexVerbatimBlockFirstLine(T);
    return;
  case LS_VerbatimBlockBody:
    lexVerbatimBlockBody(T);
    return;
  case LS_VerbatimLineText:
    lexVerbatimLineText(T);
    return;
  case LS_HTMLStartTag:
    lexHTMLStartTag(T);
    return;
  case LS_HTMLEndTag:
    lexHTMLEndTag(T);
    return;
  }

  assert(State == LS_Normal);
  const char *TokenPtr = BufferPtr;
  assert(TokenPtr < CommentEnd);
  switch(*TokenPtr) {
    case '\\':
    case '@': {
      // Commands that start with a backslash and commands that start with
      // 'at' have equivalent semantics.  But we keep information about the
      // exact syntax in AST for comments.
      tok::TokenKind CommandKind =
          (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
      TokenPtr++;
      // A marker that is the last character of the comment is plain text.
      if (TokenPtr == CommentEnd) {
        formTextToken(T, TokenPtr);
        return;
      }
      char C = *TokenPtr;
      switch (C) {
      default:
        break;

      case '\\': case '@': case '&': case '$':
      case '#':  case '<': case '>': case '%':
      case '\"': case '.': case ':':
        // This is one of \\ \@ \& \$ etc escape sequences.
        TokenPtr++;
        if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
          // This is the \:: escape sequence.
          TokenPtr++;
        }
        // The token spans the marker as well, but its text omits it.
        StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
        formTokenWithChars(T, TokenPtr, tok::text);
        T.setText(UnescapedText);
        return;
      }

      // Don't make zero-length commands.
      if (!isCommandNameStartCharacter(*TokenPtr)) {
        formTextToken(T, TokenPtr);
        return;
      }

      TokenPtr = skipCommandName(TokenPtr, CommentEnd);
      // Length of the command name, not counting the leading marker.
      unsigned Length = TokenPtr - (BufferPtr + 1);

      // Hardcoded support for lexing LaTeX formula commands
      // \f$ \f[ \f] \f{ \f} as a single command.
      if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
        C = *TokenPtr;
        if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
          TokenPtr++;
          Length++;
        }
      }

      StringRef CommandName(BufferPtr + 1, Length);

      const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
      if (!Info) {
        // Unknown name: if a typo correction exists, warn with a fix-it and
        // carry on with the corrected command; otherwise emit an
        // unknown-command token.
        if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
          StringRef CorrectedName = Info->Name;
          SourceLocation Loc = getSourceLocation(BufferPtr);
          SourceLocation EndLoc = getSourceLocation(TokenPtr);
          SourceRange FullRange = SourceRange(Loc, EndLoc);
          // The replacement range starts after the marker character.
          SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc);
          Diag(Loc, diag::warn_correct_comment_command_name)
            << FullRange << CommandName << CorrectedName
            << FixItHint::CreateReplacement(CommandRange, CorrectedName);
        } else {
          formTokenWithChars(T, TokenPtr, tok::unknown_command);
          T.setUnknownCommandName(CommandName);
          Diag(T.getLocation(), diag::warn_unknown_comment_command_name)
              << SourceRange(T.getLocation(), T.getEndLocation());
          return;
        }
      }
      if (Info->IsVerbatimBlockCommand) {
        // *BufferPtr is still the marker character ('\\' or '@') here.
        setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
        return;
      }
      if (Info->IsVerbatimLineCommand) {
        setupAndLexVerbatimLine(T, TokenPtr, Info);
        return;
      }
      formTokenWithChars(T, TokenPtr, CommandKind);
      T.setCommandID(Info->getID());
      return;
    }

    case '&':
      lexHTMLCharacterReference(T);
      return;

    case '<': {
      TokenPtr++;
      // A '<' that is the last character of the comment is plain text.
      if (TokenPtr == CommentEnd) {
        formTextToken(T, TokenPtr);
        return;
      }
      const char C = *TokenPtr;
      if (isHTMLIdentifierStartingCharacter(C))
        setupAndLexHTMLStartTag(T);
      else if (C == '/')
        setupAndLexHTMLEndTag(T);
      else
        formTextToken(T, TokenPtr);
      return;
    }

    default:
      return HandleNonCommandToken();
  }
}
462 | | |
463 | | void Lexer::setupAndLexVerbatimBlock(Token &T, |
464 | | const char *TextBegin, |
465 | 59 | char Marker, const CommandInfo *Info) { |
466 | 59 | assert(Info->IsVerbatimBlockCommand); |
467 | | |
468 | 59 | VerbatimBlockEndCommandName.clear(); |
469 | 53 | VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@"6 ); |
470 | 59 | VerbatimBlockEndCommandName.append(Info->EndCommandName); |
471 | | |
472 | 59 | formTokenWithChars(T, TextBegin, tok::verbatim_block_begin); |
473 | 59 | T.setVerbatimBlockID(Info->getID()); |
474 | | |
475 | | // If there is a newline following the verbatim opening command, skip the |
476 | | // newline so that we don't create an tok::verbatim_block_line with empty |
477 | | // text content. |
478 | 59 | if (BufferPtr != CommentEnd && |
479 | 41 | isVerticalWhitespace(*BufferPtr)) { |
480 | 10 | BufferPtr = skipNewline(BufferPtr, CommentEnd); |
481 | 10 | State = LS_VerbatimBlockBody; |
482 | 10 | return; |
483 | 10 | } |
484 | | |
485 | 49 | State = LS_VerbatimBlockFirstLine; |
486 | 49 | } |
487 | | |
488 | 120 | void Lexer::lexVerbatimBlockFirstLine(Token &T) { |
489 | 139 | again: |
490 | 139 | assert(BufferPtr < CommentEnd); |
491 | | |
492 | | // FIXME: It would be better to scan the text once, finding either the block |
493 | | // end command or newline. |
494 | | // |
495 | | // Extract current line. |
496 | 139 | const char *Newline = findNewline(BufferPtr, CommentEnd); |
497 | 139 | StringRef Line(BufferPtr, Newline - BufferPtr); |
498 | | |
499 | | // Look for end command in current line. |
500 | 139 | size_t Pos = Line.find(VerbatimBlockEndCommandName); |
501 | 139 | const char *TextEnd; |
502 | 139 | const char *NextLine; |
503 | 139 | if (Pos == StringRef::npos) { |
504 | | // Current line is completely verbatim. |
505 | 54 | TextEnd = Newline; |
506 | 54 | NextLine = skipNewline(Newline, CommentEnd); |
507 | 85 | } else if (Pos == 0) { |
508 | | // Current line contains just an end command. |
509 | 46 | const char *End = BufferPtr + VerbatimBlockEndCommandName.size(); |
510 | 46 | StringRef Name(BufferPtr + 1, End - (BufferPtr + 1)); |
511 | 46 | formTokenWithChars(T, End, tok::verbatim_block_end); |
512 | 46 | T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID()); |
513 | 46 | State = LS_Normal; |
514 | 46 | return; |
515 | 39 | } else { |
516 | | // There is some text, followed by end command. Extract text first. |
517 | 39 | TextEnd = BufferPtr + Pos; |
518 | 39 | NextLine = TextEnd; |
519 | | // If there is only whitespace before end command, skip whitespace. |
520 | 39 | if (isWhitespace(BufferPtr, TextEnd)) { |
521 | 19 | BufferPtr = TextEnd; |
522 | 19 | goto again; |
523 | 19 | } |
524 | 74 | } |
525 | | |
526 | 74 | StringRef Text(BufferPtr, TextEnd - BufferPtr); |
527 | 74 | formTokenWithChars(T, NextLine, tok::verbatim_block_line); |
528 | 74 | T.setVerbatimBlockText(Text); |
529 | | |
530 | 74 | State = LS_VerbatimBlockBody; |
531 | 74 | } |
532 | | |
533 | 77 | void Lexer::lexVerbatimBlockBody(Token &T) { |
534 | 77 | assert(State == LS_VerbatimBlockBody); |
535 | | |
536 | 77 | if (CommentState == LCS_InsideCComment) |
537 | 50 | skipLineStartingDecorations(); |
538 | | |
539 | 77 | if (BufferPtr == CommentEnd) { |
540 | 4 | formTokenWithChars(T, BufferPtr, tok::verbatim_block_line); |
541 | 4 | T.setVerbatimBlockText(""); |
542 | 4 | return; |
543 | 4 | } |
544 | | |
545 | 73 | lexVerbatimBlockFirstLine(T); |
546 | 73 | } |
547 | | |
548 | | void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin, |
549 | 151 | const CommandInfo *Info) { |
550 | 151 | assert(Info->IsVerbatimLineCommand); |
551 | 151 | formTokenWithChars(T, TextBegin, tok::verbatim_line_name); |
552 | 151 | T.setVerbatimLineID(Info->getID()); |
553 | | |
554 | 151 | State = LS_VerbatimLineText; |
555 | 151 | } |
556 | | |
557 | 147 | void Lexer::lexVerbatimLineText(Token &T) { |
558 | 147 | assert(State == LS_VerbatimLineText); |
559 | | |
560 | | // Extract current line. |
561 | 147 | const char *Newline = findNewline(BufferPtr, CommentEnd); |
562 | 147 | StringRef Text(BufferPtr, Newline - BufferPtr); |
563 | 147 | formTokenWithChars(T, Newline, tok::verbatim_line_text); |
564 | 147 | T.setVerbatimLineText(Text); |
565 | | |
566 | 147 | State = LS_Normal; |
567 | 147 | } |
568 | | |
569 | 100 | void Lexer::lexHTMLCharacterReference(Token &T) { |
570 | 100 | const char *TokenPtr = BufferPtr; |
571 | 100 | assert(*TokenPtr == '&'); |
572 | 100 | TokenPtr++; |
573 | 100 | if (TokenPtr == CommentEnd) { |
574 | 1 | formTextToken(T, TokenPtr); |
575 | 1 | return; |
576 | 1 | } |
577 | 99 | const char *NamePtr; |
578 | 99 | bool isNamed = false; |
579 | 99 | bool isDecimal = false; |
580 | 99 | char C = *TokenPtr; |
581 | 99 | if (isHTMLNamedCharacterReferenceCharacter(C)) { |
582 | 54 | NamePtr = TokenPtr; |
583 | 54 | TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd); |
584 | 54 | isNamed = true; |
585 | 45 | } else if (C == '#') { |
586 | 44 | TokenPtr++; |
587 | 44 | if (TokenPtr == CommentEnd) { |
588 | 1 | formTextToken(T, TokenPtr); |
589 | 1 | return; |
590 | 1 | } |
591 | 43 | C = *TokenPtr; |
592 | 43 | if (isHTMLDecimalCharacterReferenceCharacter(C)) { |
593 | 19 | NamePtr = TokenPtr; |
594 | 19 | TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd); |
595 | 19 | isDecimal = true; |
596 | 24 | } else if (C == 'x' || C == 'X'7 ) { |
597 | 23 | TokenPtr++; |
598 | 23 | NamePtr = TokenPtr; |
599 | 23 | TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd); |
600 | 1 | } else { |
601 | 1 | formTextToken(T, TokenPtr); |
602 | 1 | return; |
603 | 1 | } |
604 | 1 | } else { |
605 | 1 | formTextToken(T, TokenPtr); |
606 | 1 | return; |
607 | 1 | } |
608 | 96 | if (NamePtr == TokenPtr || TokenPtr == CommentEnd94 || |
609 | 91 | *TokenPtr != ';') { |
610 | 8 | formTextToken(T, TokenPtr); |
611 | 8 | return; |
612 | 8 | } |
613 | 88 | StringRef Name(NamePtr, TokenPtr - NamePtr); |
614 | 88 | TokenPtr++; // Skip semicolon. |
615 | 88 | StringRef Resolved; |
616 | 88 | if (isNamed) |
617 | 52 | Resolved = resolveHTMLNamedCharacterReference(Name); |
618 | 36 | else if (isDecimal) |
619 | 17 | Resolved = resolveHTMLDecimalCharacterReference(Name); |
620 | 19 | else |
621 | 19 | Resolved = resolveHTMLHexCharacterReference(Name); |
622 | | |
623 | 88 | if (Resolved.empty()) { |
624 | 0 | formTextToken(T, TokenPtr); |
625 | 0 | return; |
626 | 0 | } |
627 | 88 | formTokenWithChars(T, TokenPtr, tok::text); |
628 | 88 | T.setText(Resolved); |
629 | 88 | } |
630 | | |
631 | 203 | void Lexer::setupAndLexHTMLStartTag(Token &T) { |
632 | 203 | assert(BufferPtr[0] == '<' && |
633 | 203 | isHTMLIdentifierStartingCharacter(BufferPtr[1])); |
634 | 203 | const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd); |
635 | 203 | StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1)); |
636 | 203 | if (!isHTMLTagName(Name)) { |
637 | 8 | formTextToken(T, TagNameEnd); |
638 | 8 | return; |
639 | 8 | } |
640 | | |
641 | 195 | formTokenWithChars(T, TagNameEnd, tok::html_start_tag); |
642 | 195 | T.setHTMLTagStartName(Name); |
643 | | |
644 | 195 | BufferPtr = skipWhitespace(BufferPtr, CommentEnd); |
645 | | |
646 | 195 | const char C = *BufferPtr; |
647 | 195 | if (BufferPtr != CommentEnd && |
648 | 192 | (C == '>' || C == '/'101 || isHTMLIdentifierStartingCharacter(C)84 )) |
649 | 182 | State = LS_HTMLStartTag; |
650 | 195 | } |
651 | | |
652 | 356 | void Lexer::lexHTMLStartTag(Token &T) { |
653 | 356 | assert(State == LS_HTMLStartTag); |
654 | | |
655 | 356 | const char *TokenPtr = BufferPtr; |
656 | 356 | char C = *TokenPtr; |
657 | 356 | if (isHTMLIdentifierCharacter(C)) { |
658 | 86 | TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd); |
659 | 86 | StringRef Ident(BufferPtr, TokenPtr - BufferPtr); |
660 | 86 | formTokenWithChars(T, TokenPtr, tok::html_ident); |
661 | 86 | T.setHTMLIdent(Ident); |
662 | 270 | } else { |
663 | 270 | switch (C) { |
664 | 72 | case '=': |
665 | 72 | TokenPtr++; |
666 | 72 | formTokenWithChars(T, TokenPtr, tok::html_equals); |
667 | 72 | break; |
668 | 43 | case '\"': |
669 | 48 | case '\'': { |
670 | 48 | const char *OpenQuote = TokenPtr; |
671 | 48 | TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd); |
672 | 48 | const char *ClosingQuote = TokenPtr; |
673 | 48 | if (TokenPtr != CommentEnd) // Skip closing quote. |
674 | 42 | TokenPtr++; |
675 | 48 | formTokenWithChars(T, TokenPtr, tok::html_quoted_string); |
676 | 48 | T.setHTMLQuotedString(StringRef(OpenQuote + 1, |
677 | 48 | ClosingQuote - (OpenQuote + 1))); |
678 | 48 | break; |
679 | 43 | } |
680 | 133 | case '>': |
681 | 133 | TokenPtr++; |
682 | 133 | formTokenWithChars(T, TokenPtr, tok::html_greater); |
683 | 133 | State = LS_Normal; |
684 | 133 | return; |
685 | 17 | case '/': |
686 | 17 | TokenPtr++; |
687 | 17 | if (TokenPtr != CommentEnd && *TokenPtr == '>') { |
688 | 15 | TokenPtr++; |
689 | 15 | formTokenWithChars(T, TokenPtr, tok::html_slash_greater); |
690 | 15 | } else |
691 | 2 | formTextToken(T, TokenPtr); |
692 | | |
693 | 17 | State = LS_Normal; |
694 | 17 | return; |
695 | 206 | } |
696 | 206 | } |
697 | | |
698 | | // Now look ahead and return to normal state if we don't see any HTML tokens |
699 | | // ahead. |
700 | 206 | BufferPtr = skipWhitespace(BufferPtr, CommentEnd); |
701 | 206 | if (BufferPtr == CommentEnd) { |
702 | 23 | State = LS_Normal; |
703 | 23 | return; |
704 | 23 | } |
705 | | |
706 | 183 | C = *BufferPtr; |
707 | 183 | if (!isHTMLIdentifierStartingCharacter(C) && |
708 | 171 | C != '=' && C != '\"'99 && C != '\''56 && C != '>'51 ) { |
709 | 9 | State = LS_Normal; |
710 | 9 | return; |
711 | 9 | } |
712 | 183 | } |
713 | | |
714 | 99 | void Lexer::setupAndLexHTMLEndTag(Token &T) { |
715 | 99 | assert(BufferPtr[0] == '<' && BufferPtr[1] == '/'); |
716 | | |
717 | 99 | const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd); |
718 | 99 | const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd); |
719 | 99 | StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin); |
720 | 99 | if (!isHTMLTagName(Name)) { |
721 | 3 | formTextToken(T, TagNameEnd); |
722 | 3 | return; |
723 | 3 | } |
724 | | |
725 | 96 | const char *End = skipWhitespace(TagNameEnd, CommentEnd); |
726 | | |
727 | 96 | formTokenWithChars(T, End, tok::html_end_tag); |
728 | 96 | T.setHTMLTagEndName(Name); |
729 | | |
730 | 96 | if (BufferPtr != CommentEnd && *BufferPtr == '>'94 ) |
731 | 94 | State = LS_HTMLEndTag; |
732 | 96 | } |
733 | | |
734 | 94 | void Lexer::lexHTMLEndTag(Token &T) { |
735 | 94 | assert(BufferPtr != CommentEnd && *BufferPtr == '>'); |
736 | | |
737 | 94 | formTokenWithChars(T, BufferPtr + 1, tok::html_greater); |
738 | 94 | State = LS_Normal; |
739 | 94 | } |
740 | | |
/// Creates a lexer over the characters in [\p BufferStart, \p BufferEnd).
///
/// Lexing starts at \p BufferStart, in the "before comment" state with the
/// normal lexing mode.  \p ParseCommands selects whether commands, HTML tags
/// and character references are recognised, or only text and newlines.
Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
             const CommandTraits &Traits, SourceLocation FileLoc,
             const char *BufferStart, const char *BufferEnd, bool ParseCommands)
    : Allocator(Allocator), Diags(Diags), Traits(Traits),
      BufferStart(BufferStart), BufferEnd(BufferEnd), BufferPtr(BufferStart),
      FileLoc(FileLoc), ParseCommands(ParseCommands),
      CommentState(LCS_BeforeComment), State(LS_Normal) {}
748 | | |
/// Produces the next token into \p T.
///
/// Drives the comment-level state machine: skips the comment opening
/// sequence and Doxygen markers, delegates the comment body to
/// lexCommentText, turns the gap between consecutive comments into a newline
/// token, and emits tok::eof once the buffer is exhausted.
void Lexer::lex(Token &T) {
again:
  switch (CommentState) {
  case LCS_BeforeComment:
    if (BufferPtr == BufferEnd) {
      formTokenWithChars(T, BufferPtr, tok::eof);
      return;
    }

    assert(*BufferPtr == '/');
    BufferPtr++; // Skip first slash.
    switch(*BufferPtr) {
    case '/': { // BCPL comment.
      BufferPtr++; // Skip second slash.

      if (BufferPtr != BufferEnd) {
        // Skip Doxygen magic marker, if it is present.
        // It might be missing because of a typo //< or /*<, or because we
        // merged this non-Doxygen comment into a bunch of Doxygen comments
        // around it: /** ... */ /* ... */ /** ... */
        const char C = *BufferPtr;
        if (C == '/' || C == '!')
          BufferPtr++;
      }

      // Skip less-than symbol that marks trailing comments.
      // Skip it even if the comment is not a Doxygen one, because //< and /*<
      // are frequent typos.
      if (BufferPtr != BufferEnd && *BufferPtr == '<')
        BufferPtr++;

      CommentState = LCS_InsideBCPLComment;
      // A verbatim block that is still open carries over into this comment;
      // every other state is reset.
      if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
        State = LS_Normal;
      CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
      goto again;
    }
    case '*': { // C comment.
      BufferPtr++; // Skip star.

      // Skip Doxygen magic marker.  A '*' directly followed by '/' is the
      // closing sequence of an empty comment, not a marker.
      const char C = *BufferPtr;
      if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
        BufferPtr++;

      // Skip less-than symbol that marks trailing comments.
      if (BufferPtr != BufferEnd && *BufferPtr == '<')
        BufferPtr++;

      CommentState = LCS_InsideCComment;
      State = LS_Normal;
      CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
      goto again;
    }
    default:
      llvm_unreachable("second character of comment should be '/' or '*'");
    }

  case LCS_BetweenComments: {
    // Consecutive comments are extracted only if there is only whitespace
    // between them.  So we can search for the start of the next comment.
    const char *EndWhitespace = BufferPtr;
    while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
      EndWhitespace++;

    // Turn any whitespace between comments (and there is only whitespace
    // between them -- guaranteed by comment extraction) into a newline.  We
    // have two newlines between C comments in total (first one was synthesized
    // after a comment).
    formTokenWithChars(T, EndWhitespace, tok::newline);

    CommentState = LCS_BeforeComment;
    break;
  }

  case LCS_InsideBCPLComment:
  case LCS_InsideCComment:
    if (BufferPtr != CommentEnd) {
      lexCommentText(T);
      break;
    } else {
      // Skip C comment closing sequence.
      if (CommentState == LCS_InsideCComment) {
        assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
        BufferPtr += 2;
        assert(BufferPtr <= BufferEnd);

        // Synthesize a newline just after the C comment, regardless of
        // whether there is actually a newline.
        formTokenWithChars(T, BufferPtr, tok::newline);

        CommentState = LCS_BetweenComments;
        break;
      } else {
        // Don't synthesize a newline after a BCPL comment.
        CommentState = LCS_BetweenComments;
        goto again;
      }
    }
  }
}
850 | | |
851 | | StringRef Lexer::getSpelling(const Token &Tok, |
852 | 28 | const SourceManager &SourceMgr) const { |
853 | 28 | SourceLocation Loc = Tok.getLocation(); |
854 | 28 | std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc); |
855 | | |
856 | 28 | bool InvalidTemp = false; |
857 | 28 | StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp); |
858 | 28 | if (InvalidTemp) |
859 | 0 | return StringRef(); |
860 | | |
861 | 28 | const char *Begin = File.data() + LocInfo.second; |
862 | 28 | return StringRef(Begin, Tok.getLength()); |
863 | 28 | } |
864 | | |
865 | | } // end namespace comments |
866 | | } // end namespace clang |