/Users/buildslave/jenkins/workspace/coverage/llvm-project/clang/lib/Lex/TokenConcatenation.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | //===--- TokenConcatenation.cpp - Token Concatenation Avoidance -----------===// |
2 | | // |
3 | | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | | // See https://llvm.org/LICENSE.txt for license information. |
5 | | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | | // |
7 | | //===----------------------------------------------------------------------===// |
8 | | // |
9 | | // This file implements the TokenConcatenation class. |
10 | | // |
11 | | //===----------------------------------------------------------------------===// |
12 | | |
13 | | #include "clang/Lex/TokenConcatenation.h" |
14 | | #include "clang/Basic/CharInfo.h" |
15 | | #include "clang/Lex/Preprocessor.h" |
16 | | #include "llvm/Support/ErrorHandling.h" |
17 | | using namespace clang; |
18 | | |
19 | | |
/// IsStringPrefix - Return true if \p Str is, in its entirety, a string or
/// character literal encoding prefix.
///
/// Recognized spellings:
///   - "L" in every language mode;
///   - "u", "U", "R", "u8" and the raw flavors "LR", "uR", "UR", "u8R" only
///     when \p CPlusPlus11 is set.
///
/// \param Str         The candidate spelling. An empty spelling is never a
///                    prefix.
/// \param CPlusPlus11 Whether the C++11 prefixes are enabled.
/// \returns true if \p Str is exactly one of the spellings listed above.
static bool IsStringPrefix(StringRef Str, bool CPlusPlus11) {
  // Guard the indexing below: an empty spelling cannot be a prefix.
  if (Str.empty())
    return false;

  const char First = Str[0];

  // 'L' is always a candidate; 'u', 'U' and 'R' only become prefixes in C++11.
  const bool IsCandidate =
      First == 'L' ||
      (CPlusPlus11 && (First == 'u' || First == 'U' || First == 'R'));
  if (!IsCandidate)
    return false;

  switch (Str.size()) {
  case 1:
    return true; // "L", "u", "U", and "R"

  case 2:
    // Raw flavors "LR", "uR", "UR". The first character must not already be
    // 'R', and "LR" still needs the explicit C++11 check because 'L' passes
    // the candidate test in every mode.
    if (Str[1] == 'R')
      return First != 'R' && CPlusPlus11;
    return First == 'u' && Str[1] == '8'; // "u8"

  case 3:
    return First == 'u' && Str[1] == '8' && Str[2] == 'R'; // "u8R"

  default:
    return false;
  }
}
44 | | |
45 | | /// IsIdentifierStringPrefix - Return true if the spelling of the token |
46 | | /// is literally 'L', 'u', 'U', or 'u8'. Including raw versions. |
47 | 2 | bool TokenConcatenation::IsIdentifierStringPrefix(const Token &Tok) const { |
48 | 2 | const LangOptions &LangOpts = PP.getLangOpts(); |
49 | | |
50 | 2 | if (!Tok.needsCleaning()) { |
51 | 2 | if (Tok.getLength() < 1 || Tok.getLength() > 3) |
52 | 0 | return false; |
53 | 2 | SourceManager &SM = PP.getSourceManager(); |
54 | 2 | const char *Ptr = SM.getCharacterData(SM.getSpellingLoc(Tok.getLocation())); |
55 | 2 | return IsStringPrefix(StringRef(Ptr, Tok.getLength()), |
56 | 2 | LangOpts.CPlusPlus11); |
57 | 2 | } |
58 | | |
59 | 0 | if (Tok.getLength() < 256) { |
60 | 0 | char Buffer[256]; |
61 | 0 | const char *TokPtr = Buffer; |
62 | 0 | unsigned length = PP.getSpelling(Tok, TokPtr); |
63 | 0 | return IsStringPrefix(StringRef(TokPtr, length), LangOpts.CPlusPlus11); |
64 | 0 | } |
65 | | |
66 | 0 | return IsStringPrefix(StringRef(PP.getSpelling(Tok)), LangOpts.CPlusPlus11); |
67 | 0 | } |
68 | | |
69 | 731 | TokenConcatenation::TokenConcatenation(const Preprocessor &pp) : PP(pp) { |
70 | 731 | memset(TokenInfo, 0, sizeof(TokenInfo)); |
71 | | |
72 | | // These tokens have custom code in AvoidConcat. |
73 | 731 | TokenInfo[tok::identifier ] |= aci_custom; |
74 | 731 | TokenInfo[tok::numeric_constant] |= aci_custom_firstchar; |
75 | 731 | TokenInfo[tok::period ] |= aci_custom_firstchar; |
76 | 731 | TokenInfo[tok::amp ] |= aci_custom_firstchar; |
77 | 731 | TokenInfo[tok::plus ] |= aci_custom_firstchar; |
78 | 731 | TokenInfo[tok::minus ] |= aci_custom_firstchar; |
79 | 731 | TokenInfo[tok::slash ] |= aci_custom_firstchar; |
80 | 731 | TokenInfo[tok::less ] |= aci_custom_firstchar; |
81 | 731 | TokenInfo[tok::greater ] |= aci_custom_firstchar; |
82 | 731 | TokenInfo[tok::pipe ] |= aci_custom_firstchar; |
83 | 731 | TokenInfo[tok::percent ] |= aci_custom_firstchar; |
84 | 731 | TokenInfo[tok::colon ] |= aci_custom_firstchar; |
85 | 731 | TokenInfo[tok::hash ] |= aci_custom_firstchar; |
86 | 731 | TokenInfo[tok::arrow ] |= aci_custom_firstchar; |
87 | | |
88 | | // These tokens have custom code in C++11 mode. |
89 | 731 | if (PP.getLangOpts().CPlusPlus11) { |
90 | 219 | TokenInfo[tok::string_literal ] |= aci_custom; |
91 | 219 | TokenInfo[tok::wide_string_literal ] |= aci_custom; |
92 | 219 | TokenInfo[tok::utf8_string_literal ] |= aci_custom; |
93 | 219 | TokenInfo[tok::utf16_string_literal] |= aci_custom; |
94 | 219 | TokenInfo[tok::utf32_string_literal] |= aci_custom; |
95 | 219 | TokenInfo[tok::char_constant ] |= aci_custom; |
96 | 219 | TokenInfo[tok::wide_char_constant ] |= aci_custom; |
97 | 219 | TokenInfo[tok::utf16_char_constant ] |= aci_custom; |
98 | 219 | TokenInfo[tok::utf32_char_constant ] |= aci_custom; |
99 | 219 | } |
100 | | |
101 | | // These tokens have custom code in C++17 mode. |
102 | 731 | if (PP.getLangOpts().CPlusPlus17) |
103 | 153 | TokenInfo[tok::utf8_char_constant] |= aci_custom; |
104 | | |
105 | | // These tokens have custom code in C++2a mode. |
106 | 731 | if (PP.getLangOpts().CPlusPlus20) |
107 | 11 | TokenInfo[tok::lessequal ] |= aci_custom_firstchar; |
108 | | |
109 | | // These tokens change behavior if followed by an '='. |
110 | 731 | TokenInfo[tok::amp ] |= aci_avoid_equal; // &= |
111 | 731 | TokenInfo[tok::plus ] |= aci_avoid_equal; // += |
112 | 731 | TokenInfo[tok::minus ] |= aci_avoid_equal; // -= |
113 | 731 | TokenInfo[tok::slash ] |= aci_avoid_equal; // /= |
114 | 731 | TokenInfo[tok::less ] |= aci_avoid_equal; // <= |
115 | 731 | TokenInfo[tok::greater ] |= aci_avoid_equal; // >= |
116 | 731 | TokenInfo[tok::pipe ] |= aci_avoid_equal; // |= |
117 | 731 | TokenInfo[tok::percent ] |= aci_avoid_equal; // %= |
118 | 731 | TokenInfo[tok::star ] |= aci_avoid_equal; // *= |
119 | 731 | TokenInfo[tok::exclaim ] |= aci_avoid_equal; // != |
120 | 731 | TokenInfo[tok::lessless ] |= aci_avoid_equal; // <<= |
121 | 731 | TokenInfo[tok::greatergreater] |= aci_avoid_equal; // >>= |
122 | 731 | TokenInfo[tok::caret ] |= aci_avoid_equal; // ^= |
123 | 731 | TokenInfo[tok::equal ] |= aci_avoid_equal; // == |
124 | 731 | } |
125 | | |
126 | | /// GetFirstChar - Get the first character of the token \arg Tok, |
127 | | /// avoiding calls to getSpelling where possible. |
128 | 1.50k | static char GetFirstChar(const Preprocessor &PP, const Token &Tok) { |
129 | 1.50k | if (IdentifierInfo *II = Tok.getIdentifierInfo()) { |
130 | | // Avoid spelling identifiers, the most common form of token. |
131 | 26 | return II->getNameStart()[0]; |
132 | 1.48k | } else if (!Tok.needsCleaning()) { |
133 | 1.48k | if (Tok.isLiteral() && Tok.getLiteralData()323 ) { |
134 | 323 | return *Tok.getLiteralData(); |
135 | 1.15k | } else { |
136 | 1.15k | SourceManager &SM = PP.getSourceManager(); |
137 | 1.15k | return *SM.getCharacterData(SM.getSpellingLoc(Tok.getLocation())); |
138 | 1.15k | } |
139 | 1.48k | } else if (0 Tok.getLength() < 2560 ) { |
140 | 0 | char Buffer[256]; |
141 | 0 | const char *TokPtr = Buffer; |
142 | 0 | PP.getSpelling(Tok, TokPtr); |
143 | 0 | return TokPtr[0]; |
144 | 0 | } else { |
145 | 0 | return PP.getSpelling(Tok)[0]; |
146 | 0 | } |
147 | 1.50k | } |
148 | | |
149 | | /// AvoidConcat - If printing PrevTok immediately followed by Tok would cause |
150 | | /// the two individual tokens to be lexed as a single token, return true |
151 | | /// (which causes a space to be printed between them). This allows the output |
152 | | /// of -E mode to be lexed to the same token stream as lexing the input |
153 | | /// directly would. |
154 | | /// |
155 | | /// This code must conservatively return true if it doesn't want to be 100% |
156 | | /// accurate. This will cause the output to include extra space characters, |
157 | | /// but the resulting output won't have incorrect concatenations going on. |
158 | | /// Examples include "..", which we print with a space between, because we |
159 | | /// don't want to track enough to tell "x.." from "...". |
160 | | bool TokenConcatenation::AvoidConcat(const Token &PrevPrevTok, |
161 | | const Token &PrevTok, |
162 | 183k | const Token &Tok) const { |
163 | | // Conservatively assume that every annotation token that has a printable |
164 | | // form requires whitespace. |
165 | 183k | if (PrevTok.isAnnotation()) |
166 | 2 | return true; |
167 | | |
168 | | // First, check to see if the tokens were directly adjacent in the original |
169 | | // source. If they were, it must be okay to stick them together: if there |
170 | | // were an issue, the tokens would have been lexed differently. |
171 | 183k | SourceManager &SM = PP.getSourceManager(); |
172 | 183k | SourceLocation PrevSpellLoc = SM.getSpellingLoc(PrevTok.getLocation()); |
173 | 183k | SourceLocation SpellLoc = SM.getSpellingLoc(Tok.getLocation()); |
174 | 183k | if (PrevSpellLoc.getLocWithOffset(PrevTok.getLength()) == SpellLoc) |
175 | 179k | return false; |
176 | | |
177 | 3.97k | tok::TokenKind PrevKind = PrevTok.getKind(); |
178 | 3.97k | if (!PrevTok.isAnnotation() && PrevTok.getIdentifierInfo()) |
179 | 636 | PrevKind = tok::identifier; // Language keyword or named operator. |
180 | | |
181 | | // Look up information on when we should avoid concatenation with prevtok. |
182 | 3.97k | unsigned ConcatInfo = TokenInfo[PrevKind]; |
183 | | |
184 | | // If prevtok never causes a problem for anything after it, return quickly. |
185 | 3.97k | if (ConcatInfo == 0) return false1.79k ; |
186 | | |
187 | 2.18k | if (ConcatInfo & aci_avoid_equal) { |
188 | | // If the next token is '=' or '==', avoid concatenation. |
189 | 521 | if (Tok.isOneOf(tok::equal, tok::equalequal)) |
190 | 7 | return true; |
191 | 514 | ConcatInfo &= ~aci_avoid_equal; |
192 | 514 | } |
193 | 2.17k | if (Tok.isAnnotation()) { |
194 | | // Modules annotation can show up when generated automatically for includes. |
195 | 0 | assert(Tok.isOneOf(tok::annot_module_include, tok::annot_module_begin, |
196 | 0 | tok::annot_module_end) && |
197 | 0 | "unexpected annotation in AvoidConcat"); |
198 | 0 | ConcatInfo = 0; |
199 | 0 | } |
200 | | |
201 | 2.17k | if (ConcatInfo == 0) |
202 | 26 | return false; |
203 | | |
204 | | // Basic algorithm: we look at the first character of the second token, and |
205 | | // determine whether it, if appended to the first token, would form (or |
206 | | // would contribute) to a larger token if concatenated. |
207 | 2.14k | char FirstChar = 0; |
208 | 2.14k | if (ConcatInfo & aci_custom) { |
209 | | // If the token does not need to know the first character, don't get it. |
210 | 1.49k | } else { |
211 | 1.49k | FirstChar = GetFirstChar(PP, Tok); |
212 | 1.49k | } |
213 | | |
214 | 2.14k | switch (PrevKind) { |
215 | 0 | default: |
216 | 0 | llvm_unreachable("InitAvoidConcatTokenInfo built wrong"); |
217 | |
|
218 | 0 | case tok::raw_identifier: |
219 | 0 | llvm_unreachable("tok::raw_identifier in non-raw lexing mode!"); |
220 | |
|
221 | 9 | case tok::string_literal: |
222 | 10 | case tok::wide_string_literal: |
223 | 11 | case tok::utf8_string_literal: |
224 | 12 | case tok::utf16_string_literal: |
225 | 13 | case tok::utf32_string_literal: |
226 | 16 | case tok::char_constant: |
227 | 17 | case tok::wide_char_constant: |
228 | 17 | case tok::utf8_char_constant: |
229 | 18 | case tok::utf16_char_constant: |
230 | 19 | case tok::utf32_char_constant: |
231 | 19 | if (!PP.getLangOpts().CPlusPlus11) |
232 | 0 | return false; |
233 | | |
234 | | // In C++11, a string or character literal followed by an identifier is a |
235 | | // single token. |
236 | 19 | if (Tok.getIdentifierInfo()) |
237 | 11 | return true; |
238 | | |
239 | | // A ud-suffix is an identifier. If the previous token ends with one, treat |
240 | | // it as an identifier. |
241 | 8 | if (!PrevTok.hasUDSuffix()) |
242 | 7 | return false; |
243 | 8 | [[fallthrough]];1 |
244 | 637 | case tok::identifier: // id+id or id+number or id+L"foo". |
245 | | // id+'.'... will not append. |
246 | 637 | if (Tok.is(tok::numeric_constant)) |
247 | 13 | return GetFirstChar(PP, Tok) != '.'; |
248 | | |
249 | 624 | if (Tok.getIdentifierInfo() || |
250 | 624 | Tok.isOneOf(tok::wide_string_literal, tok::utf8_string_literal, |
251 | 575 | tok::utf16_string_literal, tok::utf32_string_literal, |
252 | 575 | tok::wide_char_constant, tok::utf8_char_constant, |
253 | 575 | tok::utf16_char_constant, tok::utf32_char_constant)) |
254 | 49 | return true; |
255 | | |
256 | | // If this isn't identifier + string, we're done. |
257 | 575 | if (Tok.isNot(tok::char_constant) && Tok.isNot(tok::string_literal)) |
258 | 573 | return false; |
259 | | |
260 | | // Otherwise, this is a narrow character or string. If the *identifier* |
261 | | // is a literal 'L', 'u8', 'u' or 'U', avoid pasting L "foo" -> L"foo". |
262 | 2 | return IsIdentifierStringPrefix(PrevTok); |
263 | | |
264 | 985 | case tok::numeric_constant: |
265 | 985 | return isPreprocessingNumberBody(FirstChar) || |
266 | 985 | FirstChar == '+'980 || FirstChar == '-'975 ; |
267 | 4 | case tok::period: // ..., .*, .1234 |
268 | 4 | return (FirstChar == '.' && PrevPrevTok.is(tok::period)2 ) || |
269 | 4 | isDigit(FirstChar)2 || |
270 | 4 | (2 PP.getLangOpts().CPlusPlus2 && FirstChar == '*'0 ); |
271 | 0 | case tok::amp: // && |
272 | 0 | return FirstChar == '&'; |
273 | 4 | case tok::plus: // ++ |
274 | 4 | return FirstChar == '+'; |
275 | 308 | case tok::minus: // --, ->, ->* |
276 | 308 | return FirstChar == '-' || FirstChar == '>'307 ; |
277 | 0 | case tok::slash: //, /*, // |
278 | 0 | return FirstChar == '*' || FirstChar == '/'; |
279 | 100 | case tok::less: // <<, <<=, <:, <% |
280 | 100 | return FirstChar == '<' || FirstChar == ':' || FirstChar == '%'; |
281 | 73 | case tok::greater: // >>, >>= |
282 | 73 | return FirstChar == '>'; |
283 | 3 | case tok::pipe: // || |
284 | 3 | return FirstChar == '|'; |
285 | 0 | case tok::percent: // %>, %: |
286 | 0 | return FirstChar == '>' || FirstChar == ':'; |
287 | 8 | case tok::colon: // ::, :> |
288 | 8 | return FirstChar == '>' || |
289 | 8 | (PP.getLangOpts().CPlusPlus && FirstChar == ':'0 ); |
290 | 5 | case tok::hash: // ##, #@, %:%: |
291 | 5 | return FirstChar == '#' || FirstChar == '@' || FirstChar == '%'; |
292 | 0 | case tok::arrow: // ->* |
293 | 0 | return PP.getLangOpts().CPlusPlus && FirstChar == '*'; |
294 | 4 | case tok::lessequal: // <=> (C++2a) |
295 | 4 | return PP.getLangOpts().CPlusPlus20 && FirstChar == '>'; |
296 | 2.14k | } |
297 | 2.14k | } |