/Users/buildslave/jenkins/workspace/coverage/llvm-project/clang/include/clang/AST/CommentLexer.h
Line | Count | Source (jump to first uncovered line) |
1 | | //===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===// |
2 | | // |
3 | | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | | // See https://llvm.org/LICENSE.txt for license information. |
5 | | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | | // |
7 | | //===----------------------------------------------------------------------===// |
8 | | // |
9 | | // This file defines the lexer for structured comments and its supporting token class. |
10 | | // |
11 | | //===----------------------------------------------------------------------===// |
12 | | |
13 | | #ifndef LLVM_CLANG_AST_COMMENTLEXER_H |
14 | | #define LLVM_CLANG_AST_COMMENTLEXER_H |
15 | | |
16 | | #include "clang/Basic/Diagnostic.h" |
17 | | #include "clang/Basic/SourceManager.h" |
18 | | #include "llvm/ADT/SmallString.h" |
19 | | #include "llvm/ADT/StringRef.h" |
20 | | #include "llvm/Support/Allocator.h" |
21 | | #include "llvm/Support/raw_ostream.h" |
22 | | |
23 | | namespace clang { |
24 | | namespace comments { |
25 | | |
26 | | class Lexer; |
27 | | class TextTokenRetokenizer; |
28 | | struct CommandInfo; |
29 | | class CommandTraits; |
30 | | |
31 | | namespace tok { |
32 | | enum TokenKind { |
33 | | eof, |
34 | | newline, |
35 | | text, |
36 | | unknown_command, // Command that does not have an ID. |
37 | | backslash_command, // Command with an ID, that used backslash marker. |
38 | | at_command, // Command with an ID, that used 'at' marker. |
39 | | verbatim_block_begin, |
40 | | verbatim_block_line, |
41 | | verbatim_block_end, |
42 | | verbatim_line_name, |
43 | | verbatim_line_text, |
44 | | html_start_tag, // <tag |
45 | | html_ident, // attr |
46 | | html_equals, // = |
47 | | html_quoted_string, // "blah\"blah" or 'blah\'blah' |
48 | | html_greater, // > |
49 | | html_slash_greater, // /> |
50 | | html_end_tag // </tag |
51 | | }; |
52 | | } // end namespace tok |
53 | | |
54 | | /// Comment token. |
55 | | class Token { |
56 | | friend class Lexer; |
57 | | friend class TextTokenRetokenizer; |
58 | | |
59 | | /// The location of the token. |
60 | | SourceLocation Loc; |
61 | | |
62 | | /// The actual kind of the token. |
63 | | tok::TokenKind Kind; |
64 | | |
65 | | /// Integer value associated with a token. |
66 | | /// |
67 | | /// If the token is a known command, contains command ID and TextPtr is |
68 | | /// unused (command spelling can be found with CommandTraits). Otherwise, |
69 | | /// contains the length of the string that starts at TextPtr. |
70 | | unsigned IntVal; |
71 | | |
72 | | /// Length of the token spelling in comment. Can be 0 for synthesized |
73 | | /// tokens. |
74 | | unsigned Length; |
75 | | |
76 | | /// Contains text value associated with a token. |
77 | | const char *TextPtr; |
78 | | |
79 | | public: |
80 | 64.9k | SourceLocation getLocation() const LLVM_READONLY { return Loc; } |
81 | 95.4k | void setLocation(SourceLocation SL) { Loc = SL; } |
82 | | |
83 | 48.2k | SourceLocation getEndLocation() const LLVM_READONLY { |
84 | 48.2k | if (Length == 0 || Length == 1) |
85 | 9.70k | return Loc; |
86 | 38.5k | return Loc.getLocWithOffset(Length - 1); |
87 | 48.2k | } |
88 | | |
89 | 85.3k | tok::TokenKind getKind() const LLVM_READONLY { return Kind; } |
90 | 95.4k | void setKind(tok::TokenKind K) { Kind = K; } |
91 | | |
92 | 310k | bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; } |
93 | 36.9k | bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; } |
94 | | |
95 | 42 | unsigned getLength() const LLVM_READONLY { return Length; } |
96 | 95.4k | void setLength(unsigned L) { Length = L; } |
97 | | |
98 | 66.0k | StringRef getText() const LLVM_READONLY { |
99 | 66.0k | assert(is(tok::text)); |
100 | 0 | return StringRef(TextPtr, IntVal); |
101 | 66.0k | } |
102 | | |
103 | 45.7k | void setText(StringRef Text) { |
104 | 45.7k | assert(is(tok::text)); |
105 | 0 | TextPtr = Text.data(); |
106 | 45.7k | IntVal = Text.size(); |
107 | 45.7k | } |
108 | | |
109 | 154 | StringRef getUnknownCommandName() const LLVM_READONLY { |
110 | 154 | assert(is(tok::unknown_command)); |
111 | 0 | return StringRef(TextPtr, IntVal); |
112 | 154 | } |
113 | | |
114 | 291 | void setUnknownCommandName(StringRef Name) { |
115 | 291 | assert(is(tok::unknown_command)); |
116 | 0 | TextPtr = Name.data(); |
117 | 291 | IntVal = Name.size(); |
118 | 291 | } |
119 | | |
120 | 35.2k | unsigned getCommandID() const LLVM_READONLY { |
121 | 35.2k | assert(is(tok::backslash_command) || is(tok::at_command)); |
122 | 0 | return IntVal; |
123 | 35.2k | } |
124 | | |
125 | 9.72k | void setCommandID(unsigned ID) { |
126 | 9.72k | assert(is(tok::backslash_command) || is(tok::at_command)); |
127 | 0 | IntVal = ID; |
128 | 9.72k | } |
129 | | |
130 | 227 | unsigned getVerbatimBlockID() const LLVM_READONLY { |
131 | 227 | assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end)); |
132 | 0 | return IntVal; |
133 | 227 | } |
134 | | |
135 | 239 | void setVerbatimBlockID(unsigned ID) { |
136 | 239 | assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end)); |
137 | 0 | IntVal = ID; |
138 | 239 | } |
139 | | |
140 | 907 | StringRef getVerbatimBlockText() const LLVM_READONLY { |
141 | 907 | assert(is(tok::verbatim_block_line)); |
142 | 0 | return StringRef(TextPtr, IntVal); |
143 | 907 | } |
144 | | |
145 | 918 | void setVerbatimBlockText(StringRef Text) { |
146 | 918 | assert(is(tok::verbatim_block_line)); |
147 | 0 | TextPtr = Text.data(); |
148 | 918 | IntVal = Text.size(); |
149 | 918 | } |
150 | | |
151 | 116 | unsigned getVerbatimLineID() const LLVM_READONLY { |
152 | 116 | assert(is(tok::verbatim_line_name)); |
153 | 0 | return IntVal; |
154 | 116 | } |
155 | | |
156 | 161 | void setVerbatimLineID(unsigned ID) { |
157 | 161 | assert(is(tok::verbatim_line_name)); |
158 | 0 | IntVal = ID; |
159 | 161 | } |
160 | | |
161 | 112 | StringRef getVerbatimLineText() const LLVM_READONLY { |
162 | 112 | assert(is(tok::verbatim_line_text)); |
163 | 0 | return StringRef(TextPtr, IntVal); |
164 | 112 | } |
165 | | |
166 | 157 | void setVerbatimLineText(StringRef Text) { |
167 | 157 | assert(is(tok::verbatim_line_text)); |
168 | 0 | TextPtr = Text.data(); |
169 | 157 | IntVal = Text.size(); |
170 | 157 | } |
171 | | |
172 | 172 | StringRef getHTMLTagStartName() const LLVM_READONLY { |
173 | 172 | assert(is(tok::html_start_tag)); |
174 | 0 | return StringRef(TextPtr, IntVal); |
175 | 172 | } |
176 | | |
177 | 219 | void setHTMLTagStartName(StringRef Name) { |
178 | 219 | assert(is(tok::html_start_tag)); |
179 | 0 | TextPtr = Name.data(); |
180 | 219 | IntVal = Name.size(); |
181 | 219 | } |
182 | | |
183 | 65 | StringRef getHTMLIdent() const LLVM_READONLY { |
184 | 65 | assert(is(tok::html_ident)); |
185 | 0 | return StringRef(TextPtr, IntVal); |
186 | 65 | } |
187 | | |
188 | 86 | void setHTMLIdent(StringRef Name) { |
189 | 86 | assert(is(tok::html_ident)); |
190 | 0 | TextPtr = Name.data(); |
191 | 86 | IntVal = Name.size(); |
192 | 86 | } |
193 | | |
194 | 31 | StringRef getHTMLQuotedString() const LLVM_READONLY { |
195 | 31 | assert(is(tok::html_quoted_string)); |
196 | 0 | return StringRef(TextPtr, IntVal); |
197 | 31 | } |
198 | | |
199 | 48 | void setHTMLQuotedString(StringRef Str) { |
200 | 48 | assert(is(tok::html_quoted_string)); |
201 | 0 | TextPtr = Str.data(); |
202 | 48 | IntVal = Str.size(); |
203 | 48 | } |
204 | | |
205 | 89 | StringRef getHTMLTagEndName() const LLVM_READONLY { |
206 | 89 | assert(is(tok::html_end_tag)); |
207 | 0 | return StringRef(TextPtr, IntVal); |
208 | 89 | } |
209 | | |
210 | 120 | void setHTMLTagEndName(StringRef Name) { |
211 | 120 | assert(is(tok::html_end_tag)); |
212 | 0 | TextPtr = Name.data(); |
213 | 120 | IntVal = Name.size(); |
214 | 120 | } |
215 | | |
216 | | void dump(const Lexer &L, const SourceManager &SM) const; |
217 | | }; |
218 | | |
219 | | /// Comment lexer. |
220 | | class Lexer { |
221 | | private: |
222 | | Lexer(const Lexer &) = delete; |
223 | | void operator=(const Lexer &) = delete; |
224 | | |
225 | | /// Allocator for strings that are semantic values of tokens and have to be |
226 | | /// computed (for example, resolved decimal character references). |
227 | | llvm::BumpPtrAllocator &Allocator; |
228 | | |
229 | | DiagnosticsEngine &Diags; |
230 | | |
231 | | const CommandTraits &Traits; |
232 | | |
233 | | const char *const BufferStart; |
234 | | const char *const BufferEnd; |
235 | | |
236 | | const char *BufferPtr; |
237 | | |
238 | | /// One-past-the-end pointer for the current comment. For BCPL comments it |
239 | | /// points to the newline or BufferEnd; for C comments, to the star in '*/'. |
240 | | const char *CommentEnd; |
241 | | |
242 | | SourceLocation FileLoc; |
243 | | |
244 | | /// If true, commands, HTML tags, etc. will be parsed and reported as |
245 | | /// separate tokens inside the comment body. If false, the comment text will |
246 | | /// be parsed into text and newline tokens. |
247 | | bool ParseCommands; |
248 | | |
249 | | enum LexerCommentState : uint8_t { |
250 | | LCS_BeforeComment, |
251 | | LCS_InsideBCPLComment, |
252 | | LCS_InsideCComment, |
253 | | LCS_BetweenComments |
254 | | }; |
255 | | |
256 | | /// Low-level lexer state; tracks whether we are inside or outside a comment. |
257 | | LexerCommentState CommentState; |
258 | | |
259 | | enum LexerState : uint8_t { |
260 | | /// Lexing normal comment text |
261 | | LS_Normal, |
262 | | |
263 | | /// Finished lexing verbatim block beginning command, will lex first body |
264 | | /// line. |
265 | | LS_VerbatimBlockFirstLine, |
266 | | |
267 | | /// Lexing verbatim block body line-by-line, skipping line-starting |
268 | | /// decorations. |
269 | | LS_VerbatimBlockBody, |
270 | | |
271 | | /// Finished lexing verbatim line beginning command, will lex text (one |
272 | | /// line). |
273 | | LS_VerbatimLineText, |
274 | | |
275 | | /// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes. |
276 | | LS_HTMLStartTag, |
277 | | |
278 | | /// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'. |
279 | | LS_HTMLEndTag |
280 | | }; |
281 | | |
282 | | /// Current lexing mode. |
283 | | LexerState State; |
284 | | |
285 | | /// If State is LS_VerbatimBlockFirstLine or LS_VerbatimBlockBody, contains |
286 | | /// the name of the verbatim end command, including the command marker. |
287 | | SmallString<16> VerbatimBlockEndCommandName; |
288 | | |
289 | | /// Given a character reference name (e.g., "lt"), return the character that |
290 | | /// it stands for (e.g., "<"). |
291 | | StringRef resolveHTMLNamedCharacterReference(StringRef Name) const; |
292 | | |
293 | | /// Given a Unicode codepoint as base-10 integer, return the character. |
294 | | StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const; |
295 | | |
296 | | /// Given a Unicode codepoint as base-16 integer, return the character. |
297 | | StringRef resolveHTMLHexCharacterReference(StringRef Name) const; |
298 | | |
299 | | void formTokenWithChars(Token &Result, const char *TokEnd, |
300 | | tok::TokenKind Kind); |
301 | | |
302 | 39.9k | void formTextToken(Token &Result, const char *TokEnd) { |
303 | 39.9k | StringRef Text(BufferPtr, TokEnd - BufferPtr); |
304 | 39.9k | formTokenWithChars(Result, TokEnd, tok::text); |
305 | 39.9k | Result.setText(Text); |
306 | 39.9k | } |
307 | | |
308 | 89.9k | SourceLocation getSourceLocation(const char *Loc) const { |
309 | 89.9k | assert(Loc >= BufferStart && Loc <= BufferEnd && |
310 | 89.9k | "Location out of range for this buffer!"); |
311 | | |
312 | 0 | const unsigned CharNo = Loc - BufferStart; |
313 | 89.9k | return FileLoc.getLocWithOffset(CharNo); |
314 | 89.9k | } |
315 | | |
316 | 314 | DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID) { |
317 | 314 | return Diags.Report(Loc, DiagID); |
318 | 314 | } |
319 | | |
320 | | /// Eat string matching regexp \code \s*\* \endcode. |
321 | | void skipLineStartingDecorations(); |
322 | | |
323 | | /// Skip over pure text. |
324 | | const char *skipTextToken(); |
325 | | |
326 | | /// Lex comment text, including commands if ParseCommands is set to true. |
327 | | void lexCommentText(Token &T); |
328 | | |
329 | | void setupAndLexVerbatimBlock(Token &T, const char *TextBegin, char Marker, |
330 | | const CommandInfo *Info); |
331 | | |
332 | | void lexVerbatimBlockFirstLine(Token &T); |
333 | | |
334 | | void lexVerbatimBlockBody(Token &T); |
335 | | |
336 | | void setupAndLexVerbatimLine(Token &T, const char *TextBegin, |
337 | | const CommandInfo *Info); |
338 | | |
339 | | void lexVerbatimLineText(Token &T); |
340 | | |
341 | | void lexHTMLCharacterReference(Token &T); |
342 | | |
343 | | void setupAndLexHTMLStartTag(Token &T); |
344 | | |
345 | | void lexHTMLStartTag(Token &T); |
346 | | |
347 | | void setupAndLexHTMLEndTag(Token &T); |
348 | | |
349 | | void lexHTMLEndTag(Token &T); |
350 | | |
351 | | public: |
352 | | Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags, |
353 | | const CommandTraits &Traits, SourceLocation FileLoc, |
354 | | const char *BufferStart, const char *BufferEnd, |
355 | | bool ParseCommands = true); |
356 | | |
357 | | void lex(Token &T); |
358 | | |
359 | | StringRef getSpelling(const Token &Tok, const SourceManager &SourceMgr) const; |
360 | | }; |
361 | | |
362 | | } // end namespace comments |
363 | | } // end namespace clang |
364 | | |
365 | | #endif |
366 | | |