/Users/buildslave/jenkins/workspace/clang-stage2-coverage-R/llvm/tools/clang/lib/Lex/LiteralSupport.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | //===--- LiteralSupport.cpp - Code to parse and process literals ----------===// |
2 | | // |
3 | | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | | // See https://llvm.org/LICENSE.txt for license information. |
5 | | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | | // |
7 | | //===----------------------------------------------------------------------===// |
8 | | // |
9 | | // This file implements the NumericLiteralParser, CharLiteralParser, and |
10 | | // StringLiteralParser interfaces. |
11 | | // |
12 | | //===----------------------------------------------------------------------===// |
13 | | |
14 | | #include "clang/Lex/LiteralSupport.h" |
15 | | #include "clang/Basic/CharInfo.h" |
16 | | #include "clang/Basic/LangOptions.h" |
17 | | #include "clang/Basic/SourceLocation.h" |
18 | | #include "clang/Basic/TargetInfo.h" |
19 | | #include "clang/Lex/LexDiagnostic.h" |
20 | | #include "clang/Lex/Lexer.h" |
21 | | #include "clang/Lex/Preprocessor.h" |
22 | | #include "clang/Lex/Token.h" |
23 | | #include "llvm/ADT/APInt.h" |
24 | | #include "llvm/ADT/SmallVector.h" |
25 | | #include "llvm/ADT/StringExtras.h" |
26 | | #include "llvm/ADT/StringSwitch.h" |
27 | | #include "llvm/Support/ConvertUTF.h" |
28 | | #include "llvm/Support/ErrorHandling.h" |
29 | | #include <algorithm> |
30 | | #include <cassert> |
31 | | #include <cstddef> |
32 | | #include <cstdint> |
33 | | #include <cstring> |
34 | | #include <string> |
35 | | |
36 | | using namespace clang; |
37 | | |
38 | 3.28M | static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) { |
39 | 3.28M | switch (kind) { |
40 | 3.28M | default: 0 llvm_unreachable0 ("Unknown token type!"); |
41 | 3.28M | case tok::char_constant: |
42 | 3.28M | case tok::string_literal: |
43 | 3.28M | case tok::utf8_char_constant: |
44 | 3.28M | case tok::utf8_string_literal: |
45 | 3.28M | return Target.getCharWidth(); |
46 | 3.28M | case tok::wide_char_constant: |
47 | 1.46k | case tok::wide_string_literal: |
48 | 1.46k | return Target.getWCharWidth(); |
49 | 1.46k | case tok::utf16_char_constant: |
50 | 113 | case tok::utf16_string_literal: |
51 | 113 | return Target.getChar16Width(); |
52 | 113 | case tok::utf32_char_constant: |
53 | 106 | case tok::utf32_string_literal: |
54 | 106 | return Target.getChar32Width(); |
55 | 3.28M | } |
56 | 3.28M | } |
57 | | |
58 | | static CharSourceRange MakeCharSourceRange(const LangOptions &Features, |
59 | | FullSourceLoc TokLoc, |
60 | | const char *TokBegin, |
61 | | const char *TokRangeBegin, |
62 | 294 | const char *TokRangeEnd) { |
63 | 294 | SourceLocation Begin = |
64 | 294 | Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin, |
65 | 294 | TokLoc.getManager(), Features); |
66 | 294 | SourceLocation End = |
67 | 294 | Lexer::AdvanceToTokenCharacter(Begin, TokRangeEnd - TokRangeBegin, |
68 | 294 | TokLoc.getManager(), Features); |
69 | 294 | return CharSourceRange::getCharRange(Begin, End); |
70 | 294 | } |
71 | | |
72 | | /// Produce a diagnostic highlighting some portion of a literal. |
73 | | /// |
74 | | /// Emits the diagnostic \p DiagID, highlighting the range of characters from |
75 | | /// \p TokRangeBegin (inclusive) to \p TokRangeEnd (exclusive), which must be |
76 | | /// a substring of a spelling buffer for the token beginning at \p TokBegin. |
77 | | static DiagnosticBuilder Diag(DiagnosticsEngine *Diags, |
78 | | const LangOptions &Features, FullSourceLoc TokLoc, |
79 | | const char *TokBegin, const char *TokRangeBegin, |
80 | 209 | const char *TokRangeEnd, unsigned DiagID) { |
81 | 209 | SourceLocation Begin = |
82 | 209 | Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin, |
83 | 209 | TokLoc.getManager(), Features); |
84 | 209 | return Diags->Report(Begin, DiagID) << |
85 | 209 | MakeCharSourceRange(Features, TokLoc, TokBegin, TokRangeBegin, TokRangeEnd); |
86 | 209 | } |
87 | | |
88 | | /// ProcessCharEscape - Parse a standard C escape sequence, which can occur in |
89 | | /// either a character or a string literal. |
90 | | static unsigned ProcessCharEscape(const char *ThisTokBegin, |
91 | | const char *&ThisTokBuf, |
92 | | const char *ThisTokEnd, bool &HadError, |
93 | | FullSourceLoc Loc, unsigned CharWidth, |
94 | | DiagnosticsEngine *Diags, |
95 | 138k | const LangOptions &Features) { |
96 | 138k | const char *EscapeBegin = ThisTokBuf; |
97 | 138k | |
98 | 138k | // Skip the '\' char. |
99 | 138k | ++ThisTokBuf; |
100 | 138k | |
101 | 138k | // We know that this character can't be off the end of the buffer, because |
102 | 138k | // that would have been \", which would not have been the end of string. |
103 | 138k | unsigned ResultChar = *ThisTokBuf++; |
104 | 138k | switch (ResultChar) { |
105 | 138k | // These map to themselves. |
106 | 138k | case '\\': 38.1k case '\'': 38.1k case '"': 38.1k case '?': break38.1k ; |
107 | 38.1k | |
108 | 38.1k | // These have fixed mappings. |
109 | 38.1k | case 'a': |
110 | 24 | // TODO: K&R: the meaning of '\\a' is different in traditional C |
111 | 24 | ResultChar = 7; |
112 | 24 | break; |
113 | 38.1k | case 'b': |
114 | 107 | ResultChar = 8; |
115 | 107 | break; |
116 | 38.1k | case 'e': |
117 | 15 | if (Diags) |
118 | 15 | Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, |
119 | 15 | diag::ext_nonstandard_escape) << "e"; |
120 | 15 | ResultChar = 27; |
121 | 15 | break; |
122 | 38.1k | case 'E': |
123 | 1 | if (Diags) |
124 | 1 | Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, |
125 | 1 | diag::ext_nonstandard_escape) << "E"; |
126 | 1 | ResultChar = 27; |
127 | 1 | break; |
128 | 38.1k | case 'f': |
129 | 1.34k | ResultChar = 12; |
130 | 1.34k | break; |
131 | 78.8k | case 'n': |
132 | 78.8k | ResultChar = 10; |
133 | 78.8k | break; |
134 | 38.1k | case 'r': |
135 | 1.73k | ResultChar = 13; |
136 | 1.73k | break; |
137 | 38.1k | case 't': |
138 | 5.60k | ResultChar = 9; |
139 | 5.60k | break; |
140 | 38.1k | case 'v': |
141 | 1.45k | ResultChar = 11; |
142 | 1.45k | break; |
143 | 38.1k | case 'x': { // Hex escape. |
144 | 2.50k | ResultChar = 0; |
145 | 2.50k | if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)2.50k ) { |
146 | 6 | if (Diags) |
147 | 6 | Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, |
148 | 6 | diag::err_hex_escape_no_digits) << "x"; |
149 | 6 | HadError = true; |
150 | 6 | break; |
151 | 6 | } |
152 | 2.50k | |
153 | 2.50k | // Hex escapes are a maximal series of hex digits. |
154 | 2.50k | bool Overflow = false; |
155 | 7.71k | for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf5.21k ) { |
156 | 6.80k | int CharVal = llvm::hexDigitValue(ThisTokBuf[0]); |
157 | 6.80k | if (CharVal == -1) break1.59k ; |
158 | 5.21k | // About to shift out a digit? |
159 | 5.21k | if (ResultChar & 0xF0000000) |
160 | 0 | Overflow = true; |
161 | 5.21k | ResultChar <<= 4; |
162 | 5.21k | ResultChar |= CharVal; |
163 | 5.21k | } |
164 | 2.50k | |
165 | 2.50k | // See if any bits will be truncated when evaluated as a character. |
166 | 2.50k | if (CharWidth != 32 && (ResultChar >> CharWidth) != 02.38k ) { |
167 | 0 | Overflow = true; |
168 | 0 | ResultChar &= ~0U >> (32-CharWidth); |
169 | 0 | } |
170 | 2.50k | |
171 | 2.50k | // Check for overflow. |
172 | 2.50k | if (Overflow && Diags0 ) // Too many digits to fit in |
173 | 0 | Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, |
174 | 0 | diag::err_escape_too_large) << 0; |
175 | 2.50k | break; |
176 | 2.50k | } |
177 | 8.69k | case '0': case '1': case '2': case '3': |
178 | 8.69k | case '4': case '5': case '6': case '7': { |
179 | 8.69k | // Octal escapes. |
180 | 8.69k | --ThisTokBuf; |
181 | 8.69k | ResultChar = 0; |
182 | 8.69k | |
183 | 8.69k | // Octal escapes are a series of octal digits with maximum length 3. |
184 | 8.69k | // "\0123" is a two digit sequence equal to "\012" "3". |
185 | 8.69k | unsigned NumDigits = 0; |
186 | 12.8k | do { |
187 | 12.8k | ResultChar <<= 3; |
188 | 12.8k | ResultChar |= *ThisTokBuf++ - '0'; |
189 | 12.8k | ++NumDigits; |
190 | 12.8k | } while (ThisTokBuf != ThisTokEnd && NumDigits < 36.59k && |
191 | 12.8k | ThisTokBuf[0] >= '0'4.78k && ThisTokBuf[0] <= '7'4.76k ); |
192 | 8.69k | |
193 | 8.69k | // Check for overflow. Reject '\777', but not L'\777'. |
194 | 8.69k | if (CharWidth != 32 && (ResultChar >> CharWidth) != 08.51k ) { |
195 | 1 | if (Diags) |
196 | 1 | Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, |
197 | 1 | diag::err_escape_too_large) << 1; |
198 | 1 | ResultChar &= ~0U >> (32-CharWidth); |
199 | 1 | } |
200 | 8.69k | break; |
201 | 8.69k | } |
202 | 8.69k | |
203 | 8.69k | // Otherwise, these are not valid escapes. |
204 | 8.69k | case '(': 28 case '{': 28 case '[': 28 case '%': |
205 | 28 | // GCC accepts these as extensions. We warn about them as such though. |
206 | 28 | if (Diags) |
207 | 22 | Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, |
208 | 22 | diag::ext_nonstandard_escape) |
209 | 22 | << std::string(1, ResultChar); |
210 | 28 | break; |
211 | 28 | default: |
212 | 10 | if (!Diags) |
213 | 0 | break; |
214 | 10 | |
215 | 10 | if (isPrintable(ResultChar)) |
216 | 8 | Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, |
217 | 8 | diag::ext_unknown_escape) |
218 | 8 | << std::string(1, ResultChar); |
219 | 2 | else |
220 | 2 | Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, |
221 | 2 | diag::ext_unknown_escape) |
222 | 2 | << "x" + llvm::utohexstr(ResultChar); |
223 | 10 | break; |
224 | 138k | } |
225 | 138k | |
226 | 138k | return ResultChar; |
227 | 138k | } |
228 | | |
229 | | static void appendCodePoint(unsigned Codepoint, |
230 | 147 | llvm::SmallVectorImpl<char> &Str) { |
231 | 147 | char ResultBuf[4]; |
232 | 147 | char *ResultPtr = ResultBuf; |
233 | 147 | bool Res = llvm::ConvertCodePointToUTF8(Codepoint, ResultPtr); |
234 | 147 | (void)Res; |
235 | 147 | assert(Res && "Unexpected conversion failure"); |
236 | 147 | Str.append(ResultBuf, ResultPtr); |
237 | 147 | } |
238 | | |
239 | 760 | void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) { |
240 | 2.79k | for (StringRef::iterator I = Input.begin(), E = Input.end(); I != E; ++I2.03k ) { |
241 | 2.03k | if (*I != '\\') { |
242 | 1.88k | Buf.push_back(*I); |
243 | 1.88k | continue; |
244 | 1.88k | } |
245 | 147 | |
246 | 147 | ++I; |
247 | 147 | assert(*I == 'u' || *I == 'U'); |
248 | 147 | |
249 | 147 | unsigned NumHexDigits; |
250 | 147 | if (*I == 'u') |
251 | 126 | NumHexDigits = 4; |
252 | 21 | else |
253 | 21 | NumHexDigits = 8; |
254 | 147 | |
255 | 147 | assert(I + NumHexDigits <= E); |
256 | 147 | |
257 | 147 | uint32_t CodePoint = 0; |
258 | 819 | for (++I; NumHexDigits != 0; ++I, --NumHexDigits672 ) { |
259 | 672 | unsigned Value = llvm::hexDigitValue(*I); |
260 | 672 | assert(Value != -1U); |
261 | 672 | |
262 | 672 | CodePoint <<= 4; |
263 | 672 | CodePoint += Value; |
264 | 672 | } |
265 | 147 | |
266 | 147 | appendCodePoint(CodePoint, Buf); |
267 | 147 | --I; |
268 | 147 | } |
269 | 760 | } |
270 | | |
271 | | /// ProcessUCNEscape - Read the Universal Character Name, check constraints and |
272 | | /// return the UTF32. |
273 | | static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf, |
274 | | const char *ThisTokEnd, |
275 | | uint32_t &UcnVal, unsigned short &UcnLen, |
276 | | FullSourceLoc Loc, DiagnosticsEngine *Diags, |
277 | | const LangOptions &Features, |
278 | 376 | bool in_char_string_literal = false) { |
279 | 376 | const char *UcnBegin = ThisTokBuf; |
280 | 376 | |
281 | 376 | // Skip the '\u' char's. |
282 | 376 | ThisTokBuf += 2; |
283 | 376 | |
284 | 376 | if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)373 ) { |
285 | 4 | if (Diags) |
286 | 4 | Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, |
287 | 4 | diag::err_hex_escape_no_digits) << StringRef(&ThisTokBuf[-1], 1); |
288 | 4 | return false; |
289 | 4 | } |
290 | 372 | UcnLen = (ThisTokBuf[-1] == 'u' ? 4284 : 888 ); |
291 | 372 | unsigned short UcnLenSave = UcnLen; |
292 | 2.19k | for (; ThisTokBuf != ThisTokEnd && UcnLenSave1.94k ; ++ThisTokBuf, UcnLenSave--1.82k ) { |
293 | 1.82k | int CharVal = llvm::hexDigitValue(ThisTokBuf[0]); |
294 | 1.82k | if (CharVal == -1) break4 ; |
295 | 1.82k | UcnVal <<= 4; |
296 | 1.82k | UcnVal |= CharVal; |
297 | 1.82k | } |
298 | 372 | // If we didn't consume the proper number of digits, there is a problem. |
299 | 372 | if (UcnLenSave) { |
300 | 6 | if (Diags) |
301 | 6 | Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, |
302 | 6 | diag::err_ucn_escape_incomplete); |
303 | 6 | return false; |
304 | 6 | } |
305 | 366 | |
306 | 366 | // Check UCN constraints (C99 6.4.3p2) [C++11 lex.charset p2] |
307 | 366 | if ((0xD800 <= UcnVal && UcnVal <= 0xDFFF118 ) || // surrogate codepoints |
308 | 366 | UcnVal > 0x10FFFF350 ) { // maximum legal UTF32 value |
309 | 19 | if (Diags) |
310 | 19 | Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, |
311 | 19 | diag::err_ucn_escape_invalid); |
312 | 19 | return false; |
313 | 19 | } |
314 | 347 | |
315 | 347 | // C++11 allows UCNs that refer to control characters and basic source |
316 | 347 | // characters inside character and string literals |
317 | 347 | if (UcnVal < 0xa0 && |
318 | 347 | (111 UcnVal != 0x24111 && UcnVal != 0x40107 && UcnVal != 0x60103 )) { // $, @, ` |
319 | 99 | bool IsError = (!Features.CPlusPlus11 || !in_char_string_literal66 ); |
320 | 99 | if (Diags) { |
321 | 99 | char BasicSCSChar = UcnVal; |
322 | 99 | if (UcnVal >= 0x20 && UcnVal < 0x7f76 ) |
323 | 46 | Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, |
324 | 46 | IsError ? diag::err_ucn_escape_basic_scs18 : |
325 | 46 | diag::warn_cxx98_compat_literal_ucn_escape_basic_scs28 ) |
326 | 46 | << StringRef(&BasicSCSChar, 1); |
327 | 53 | else |
328 | 53 | Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, |
329 | 53 | IsError ? diag::err_ucn_control_character15 : |
330 | 53 | diag::warn_cxx98_compat_literal_ucn_control_character38 ); |
331 | 99 | } |
332 | 99 | if (IsError) |
333 | 33 | return false; |
334 | 314 | } |
335 | 314 | |
336 | 314 | if (!Features.CPlusPlus && !Features.C99108 && Diags2 ) |
337 | 2 | Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, |
338 | 2 | diag::warn_ucn_not_valid_in_c89_literal); |
339 | 314 | |
340 | 314 | return true; |
341 | 314 | } |
342 | | |
343 | | /// MeasureUCNEscape - Determine the number of bytes within the resulting string |
344 | | /// which this UCN will occupy. |
345 | | static int MeasureUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf, |
346 | | const char *ThisTokEnd, unsigned CharByteWidth, |
347 | 6 | const LangOptions &Features, bool &HadError) { |
348 | 6 | // UTF-32: 4 bytes per escape. |
349 | 6 | if (CharByteWidth == 4) |
350 | 0 | return 4; |
351 | 6 | |
352 | 6 | uint32_t UcnVal = 0; |
353 | 6 | unsigned short UcnLen = 0; |
354 | 6 | FullSourceLoc Loc; |
355 | 6 | |
356 | 6 | if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal, |
357 | 6 | UcnLen, Loc, nullptr, Features, true)) { |
358 | 0 | HadError = true; |
359 | 0 | return 0; |
360 | 0 | } |
361 | 6 | |
362 | 6 | // UTF-16: 2 bytes for BMP, 4 bytes otherwise. |
363 | 6 | if (CharByteWidth == 2) |
364 | 0 | return UcnVal <= 0xFFFF ? 2 : 4; |
365 | 6 | |
366 | 6 | // UTF-8. |
367 | 6 | if (UcnVal < 0x80) |
368 | 0 | return 1; |
369 | 6 | if (UcnVal < 0x800) |
370 | 0 | return 2; |
371 | 6 | if (UcnVal < 0x10000) |
372 | 3 | return 3; |
373 | 3 | return 4; |
374 | 3 | } |
375 | | |
376 | | /// EncodeUCNEscape - Read the Universal Character Name, check constraints and |
377 | | /// convert the UTF32 to UTF8 or UTF16. This is a subroutine of |
378 | | /// StringLiteralParser. When we decide to implement UCN's for identifiers, |
379 | | /// we will likely rework our support for UCN's. |
380 | | static void EncodeUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf, |
381 | | const char *ThisTokEnd, |
382 | | char *&ResultBuf, bool &HadError, |
383 | | FullSourceLoc Loc, unsigned CharByteWidth, |
384 | | DiagnosticsEngine *Diags, |
385 | 265 | const LangOptions &Features) { |
386 | 265 | typedef uint32_t UTF32; |
387 | 265 | UTF32 UcnVal = 0; |
388 | 265 | unsigned short UcnLen = 0; |
389 | 265 | if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal, UcnLen, |
390 | 265 | Loc, Diags, Features, true)) { |
391 | 33 | HadError = true; |
392 | 33 | return; |
393 | 33 | } |
394 | 232 | |
395 | 232 | assert((CharByteWidth == 1 || CharByteWidth == 2 || CharByteWidth == 4) && |
396 | 232 | "only character widths of 1, 2, or 4 bytes supported"); |
397 | 232 | |
398 | 232 | (void)UcnLen; |
399 | 232 | assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported"); |
400 | 232 | |
401 | 232 | if (CharByteWidth == 4) { |
402 | 65 | // FIXME: Make the type of the result buffer correct instead of |
403 | 65 | // using reinterpret_cast. |
404 | 65 | llvm::UTF32 *ResultPtr = reinterpret_cast<llvm::UTF32*>(ResultBuf); |
405 | 65 | *ResultPtr = UcnVal; |
406 | 65 | ResultBuf += 4; |
407 | 65 | return; |
408 | 65 | } |
409 | 167 | |
410 | 167 | if (CharByteWidth == 2) { |
411 | 53 | // FIXME: Make the type of the result buffer correct instead of |
412 | 53 | // using reinterpret_cast. |
413 | 53 | llvm::UTF16 *ResultPtr = reinterpret_cast<llvm::UTF16*>(ResultBuf); |
414 | 53 | |
415 | 53 | if (UcnVal <= (UTF32)0xFFFF) { |
416 | 39 | *ResultPtr = UcnVal; |
417 | 39 | ResultBuf += 2; |
418 | 39 | return; |
419 | 39 | } |
420 | 14 | |
421 | 14 | // Convert to UTF16. |
422 | 14 | UcnVal -= 0x10000; |
423 | 14 | *ResultPtr = 0xD800 + (UcnVal >> 10); |
424 | 14 | *(ResultPtr+1) = 0xDC00 + (UcnVal & 0x3FF); |
425 | 14 | ResultBuf += 4; |
426 | 14 | return; |
427 | 14 | } |
428 | 114 | |
429 | 114 | assert(CharByteWidth == 1 && "UTF-8 encoding is only for 1 byte characters"); |
430 | 114 | |
431 | 114 | // Now that we've parsed/checked the UCN, we convert from UTF32->UTF8. |
432 | 114 | // The conversion below was inspired by: |
433 | 114 | // http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c |
434 | 114 | // First, we determine how many bytes the result will require. |
435 | 114 | typedef uint8_t UTF8; |
436 | 114 | |
437 | 114 | unsigned short bytesToWrite = 0; |
438 | 114 | if (UcnVal < (UTF32)0x80) |
439 | 27 | bytesToWrite = 1; |
440 | 87 | else if (UcnVal < (UTF32)0x800) |
441 | 15 | bytesToWrite = 2; |
442 | 72 | else if (UcnVal < (UTF32)0x10000) |
443 | 48 | bytesToWrite = 3; |
444 | 24 | else |
445 | 24 | bytesToWrite = 4; |
446 | 114 | |
447 | 114 | const unsigned byteMask = 0xBF; |
448 | 114 | const unsigned byteMark = 0x80; |
449 | 114 | |
450 | 114 | // Once the bits are split out into bytes of UTF8, this is a mask OR-ed |
451 | 114 | // into the first byte, depending on how many bytes follow. |
452 | 114 | static const UTF8 firstByteMark[5] = { |
453 | 114 | 0x00, 0x00, 0xC0, 0xE0, 0xF0 |
454 | 114 | }; |
455 | 114 | // Finally, we write the bytes into ResultBuf. |
456 | 114 | ResultBuf += bytesToWrite; |
457 | 114 | switch (bytesToWrite) { // note: everything falls through. |
458 | 114 | case 4: |
459 | 24 | *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6; |
460 | 24 | LLVM_FALLTHROUGH; |
461 | 72 | case 3: |
462 | 72 | *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6; |
463 | 72 | LLVM_FALLTHROUGH; |
464 | 87 | case 2: |
465 | 87 | *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6; |
466 | 87 | LLVM_FALLTHROUGH; |
467 | 114 | case 1: |
468 | 114 | *--ResultBuf = (UTF8) (UcnVal | firstByteMark[bytesToWrite]); |
469 | 114 | } |
470 | 114 | // Update the buffer. |
471 | 114 | ResultBuf += bytesToWrite; |
472 | 114 | } |
473 | | |
474 | | /// integer-constant: [C99 6.4.4.1] |
475 | | /// decimal-constant integer-suffix |
476 | | /// octal-constant integer-suffix |
477 | | /// hexadecimal-constant integer-suffix |
478 | | /// binary-literal integer-suffix [GNU, C++1y] |
479 | | /// user-defined-integer-literal: [C++11 lex.ext] |
480 | | /// decimal-literal ud-suffix |
481 | | /// octal-literal ud-suffix |
482 | | /// hexadecimal-literal ud-suffix |
483 | | /// binary-literal ud-suffix [GNU, C++1y] |
484 | | /// decimal-constant: |
485 | | /// nonzero-digit |
486 | | /// decimal-constant digit |
487 | | /// octal-constant: |
488 | | /// 0 |
489 | | /// octal-constant octal-digit |
490 | | /// hexadecimal-constant: |
491 | | /// hexadecimal-prefix hexadecimal-digit |
492 | | /// hexadecimal-constant hexadecimal-digit |
493 | | /// hexadecimal-prefix: one of |
494 | | /// 0x 0X |
495 | | /// binary-literal: |
496 | | /// 0b binary-digit |
497 | | /// 0B binary-digit |
498 | | /// binary-literal binary-digit |
499 | | /// integer-suffix: |
500 | | /// unsigned-suffix [long-suffix] |
501 | | /// unsigned-suffix [long-long-suffix] |
502 | | /// long-suffix [unsigned-suffix] |
503 | | /// long-long-suffix [unsigned-sufix] |
504 | | /// nonzero-digit: |
505 | | /// 1 2 3 4 5 6 7 8 9 |
506 | | /// octal-digit: |
507 | | /// 0 1 2 3 4 5 6 7 |
508 | | /// hexadecimal-digit: |
509 | | /// 0 1 2 3 4 5 6 7 8 9 |
510 | | /// a b c d e f |
511 | | /// A B C D E F |
512 | | /// binary-digit: |
513 | | /// 0 |
514 | | /// 1 |
515 | | /// unsigned-suffix: one of |
516 | | /// u U |
517 | | /// long-suffix: one of |
518 | | /// l L |
519 | | /// long-long-suffix: one of |
520 | | /// ll LL |
521 | | /// |
522 | | /// floating-constant: [C99 6.4.4.2] |
523 | | /// TODO: add rules... |
524 | | /// |
525 | | NumericLiteralParser::NumericLiteralParser(StringRef TokSpelling, |
526 | | SourceLocation TokLoc, |
527 | | Preprocessor &PP) |
528 | 6.65M | : PP(PP), ThisTokBegin(TokSpelling.begin()), ThisTokEnd(TokSpelling.end()) { |
529 | 6.65M | |
530 | 6.65M | // This routine assumes that the range begin/end matches the regex for integer |
531 | 6.65M | // and FP constants (specifically, the 'pp-number' regex), and assumes that |
532 | 6.65M | // the byte at "*end" is both valid and not part of the regex. Because of |
533 | 6.65M | // this, it doesn't have to check for 'overscan' in various places. |
534 | 6.65M | assert(!isPreprocessingNumberBody(*ThisTokEnd) && "didn't maximally munch?"); |
535 | 6.65M | |
536 | 6.65M | s = DigitsBegin = ThisTokBegin; |
537 | 6.65M | saw_exponent = false; |
538 | 6.65M | saw_period = false; |
539 | 6.65M | saw_ud_suffix = false; |
540 | 6.65M | saw_fixed_point_suffix = false; |
541 | 6.65M | isLong = false; |
542 | 6.65M | isUnsigned = false; |
543 | 6.65M | isLongLong = false; |
544 | 6.65M | isHalf = false; |
545 | 6.65M | isFloat = false; |
546 | 6.65M | isImaginary = false; |
547 | 6.65M | isFloat16 = false; |
548 | 6.65M | isFloat128 = false; |
549 | 6.65M | MicrosoftInteger = 0; |
550 | 6.65M | isFract = false; |
551 | 6.65M | isAccum = false; |
552 | 6.65M | hadError = false; |
553 | 6.65M | |
554 | 6.65M | if (*s == '0') { // parse radix |
555 | 902k | ParseNumberStartingWithZero(TokLoc); |
556 | 902k | if (hadError) |
557 | 29 | return; |
558 | 5.75M | } else { // the first digit is non-zero |
559 | 5.75M | radix = 10; |
560 | 5.75M | s = SkipDigits(s); |
561 | 5.75M | if (s == ThisTokEnd) { |
562 | 5.19M | // Done. |
563 | 5.19M | } else { |
564 | 563k | ParseDecimalOrOctalCommon(TokLoc); |
565 | 563k | if (hadError) |
566 | 7 | return; |
567 | 6.65M | } |
568 | 5.75M | } |
569 | 6.65M | |
570 | 6.65M | SuffixBegin = s; |
571 | 6.65M | checkSeparator(TokLoc, s, CSK_AfterDigits); |
572 | 6.65M | |
573 | 6.65M | // Initial scan to lookahead for fixed point suffix. |
574 | 6.65M | if (PP.getLangOpts().FixedPoint) { |
575 | 946 | for (const char *c = s; c != ThisTokEnd; ++c474 ) { |
576 | 925 | if (*c == 'r' || *c == 'k'807 || *c == 'R'474 || *c == 'K'474 ) { |
577 | 451 | saw_fixed_point_suffix = true; |
578 | 451 | break; |
579 | 451 | } |
580 | 925 | } |
581 | 472 | } |
582 | 6.65M | |
583 | 6.65M | // Parse the suffix. At this point we can classify whether we have an FP or |
584 | 6.65M | // integer constant. |
585 | 6.65M | bool isFPConstant = isFloatingLiteral(); |
586 | 6.65M | |
587 | 6.65M | // Loop over all of the characters of the suffix. If we see something bad, |
588 | 6.65M | // we break out of the loop. |
589 | 7.44M | for (; s != ThisTokEnd; ++s785k ) { |
590 | 785k | switch (*s) { |
591 | 785k | case 'R': |
592 | 124 | case 'r': |
593 | 124 | if (!PP.getLangOpts().FixedPoint) break6 ; |
594 | 118 | if (isFract || isAccum117 ) break1 ; |
595 | 117 | if (!(saw_period || saw_exponent26 )) break20 ; |
596 | 97 | isFract = true; |
597 | 97 | continue; |
598 | 338 | case 'K': |
599 | 338 | case 'k': |
600 | 338 | if (!PP.getLangOpts().FixedPoint) break6 ; |
601 | 332 | if (isFract || isAccum331 ) break2 ; |
602 | 330 | if (!(saw_period || saw_exponent43 )) break23 ; |
603 | 307 | isAccum = true; |
604 | 307 | continue; |
605 | 307 | case 'h': // FP Suffix for "half". |
606 | 248 | case 'H': |
607 | 248 | // OpenCL Extension v1.2 s9.5 - h or H suffix for half type. |
608 | 248 | if (!(PP.getLangOpts().Half || PP.getLangOpts().FixedPoint233 )) break3 ; |
609 | 245 | if (isIntegerLiteral()) break0 ; // Error for integer constant. |
610 | 245 | if (isHalf || isFloat244 || isLong243 ) break3 ; // HH, FH, LH invalid. |
611 | 242 | isHalf = true; |
612 | 242 | continue; // Success. |
613 | 164k | case 'f': // FP Suffix for "float" |
614 | 164k | case 'F': |
615 | 164k | if (!isFPConstant) break4 ; // Error for integer constant. |
616 | 164k | if (isHalf || isFloat164k || isLong164k || isFloat128164k ) |
617 | 1 | break; // HF, FF, LF, QF invalid. |
618 | 164k | |
619 | 164k | // CUDA host and device may have different _Float16 support, therefore |
620 | 164k | // allows f16 literals to avoid false alarm. |
621 | 164k | // ToDo: more precise check for CUDA. |
622 | 164k | if ((PP.getTargetInfo().hasFloat16Type() || PP.getLangOpts().CUDA2.78k ) && |
623 | 164k | s + 2 < ThisTokEnd161k && s[1] == '1'122 && s[2] == '6'122 ) { |
624 | 122 | s += 2; // success, eat up 2 characters. |
625 | 122 | isFloat16 = true; |
626 | 122 | continue; |
627 | 122 | } |
628 | 164k | |
629 | 164k | isFloat = true; |
630 | 164k | continue; // Success. |
631 | 164k | case 'q': // FP Suffix for "__float128" |
632 | 171 | case 'Q': |
633 | 171 | if (!isFPConstant) break1 ; // Error for integer constant. |
634 | 170 | if (isHalf || isFloat || isLong || isFloat128) |
635 | 0 | break; // HQ, FQ, LQ, QQ invalid. |
636 | 170 | isFloat128 = true; |
637 | 170 | continue; // Success. |
638 | 108k | case 'u': |
639 | 108k | case 'U': |
640 | 108k | if (isFPConstant) break0 ; // Error for floating constant. |
641 | 108k | if (isUnsigned) break0 ; // Cannot be repeated. |
642 | 108k | isUnsigned = true; |
643 | 108k | continue; // Success. |
644 | 510k | case 'l': |
645 | 510k | case 'L': |
646 | 510k | if (isLong || isLongLong) break0 ; // Cannot be repeated. |
647 | 510k | if (isHalf || isFloat || isFloat128) break0 ; // LH, LF, LQ invalid. |
648 | 510k | |
649 | 510k | // Check for long long. The L's need to be adjacent and the same case. |
650 | 510k | if (s[1] == s[0]) { |
651 | 62.5k | assert(s + 1 < ThisTokEnd && "didn't maximally munch?"); |
652 | 62.5k | if (isFPConstant) break0 ; // long long invalid for floats. |
653 | 62.5k | isLongLong = true; |
654 | 62.5k | ++s; // Eat both of them. |
655 | 448k | } else { |
656 | 448k | isLong = true; |
657 | 448k | } |
658 | 510k | continue; // Success. |
659 | 510k | case 'i': |
660 | 309 | case 'I': |
661 | 309 | if (PP.getLangOpts().MicrosoftExt) { |
662 | 68 | if (isLong || isLongLong65 || MicrosoftInteger62 ) |
663 | 6 | break; |
664 | 62 | |
665 | 62 | if (!isFPConstant) { |
666 | 54 | // Allow i8, i16, i32, and i64. |
667 | 54 | switch (s[1]) { |
668 | 54 | case '8': |
669 | 10 | s += 2; // i8 suffix |
670 | 10 | MicrosoftInteger = 8; |
671 | 10 | break; |
672 | 54 | case '1': |
673 | 9 | if (s[2] == '6') { |
674 | 9 | s += 3; // i16 suffix |
675 | 9 | MicrosoftInteger = 16; |
676 | 9 | } |
677 | 9 | break; |
678 | 54 | case '3': |
679 | 9 | if (s[2] == '2') { |
680 | 9 | s += 3; // i32 suffix |
681 | 9 | MicrosoftInteger = 32; |
682 | 9 | } |
683 | 9 | break; |
684 | 54 | case '6': |
685 | 23 | if (s[2] == '4') { |
686 | 23 | s += 3; // i64 suffix |
687 | 23 | MicrosoftInteger = 64; |
688 | 23 | } |
689 | 23 | break; |
690 | 54 | default: |
691 | 3 | break; |
692 | 62 | } |
693 | 62 | } |
694 | 62 | if (MicrosoftInteger) { |
695 | 51 | assert(s <= ThisTokEnd && "didn't maximally munch?"); |
696 | 51 | break; |
697 | 51 | } |
698 | 252 | } |
699 | 252 | LLVM_FALLTHROUGH; |
700 | 380 | case 'j': |
701 | 380 | case 'J': |
702 | 380 | if (isImaginary) break0 ; // Cannot be repeated. |
703 | 380 | isImaginary = true; |
704 | 380 | continue; // Success. |
705 | 248 | } |
706 | 248 | // If we reached here, there was an error or a ud-suffix. |
707 | 248 | break; |
708 | 248 | } |
709 | 6.65M | |
710 | 6.65M | // "i", "if", and "il" are user-defined suffixes in C++1y. |
711 | 6.65M | if (s != ThisTokEnd || isImaginary6.65M ) { |
712 | 577 | // FIXME: Don't bother expanding UCNs if !tok.hasUCN(). |
713 | 577 | expandUCNs(UDSuffixBuf, StringRef(SuffixBegin, ThisTokEnd - SuffixBegin)); |
714 | 577 | if (isValidUDSuffix(PP.getLangOpts(), UDSuffixBuf)) { |
715 | 139 | if (!isImaginary) { |
716 | 101 | // Any suffix pieces we might have parsed are actually part of the |
717 | 101 | // ud-suffix. |
718 | 101 | isLong = false; |
719 | 101 | isUnsigned = false; |
720 | 101 | isLongLong = false; |
721 | 101 | isFloat = false; |
722 | 101 | isFloat16 = false; |
723 | 101 | isHalf = false; |
724 | 101 | isImaginary = false; |
725 | 101 | MicrosoftInteger = 0; |
726 | 101 | saw_fixed_point_suffix = false; |
727 | 101 | isFract = false; |
728 | 101 | isAccum = false; |
729 | 101 | } |
730 | 139 | |
731 | 139 | saw_ud_suffix = true; |
732 | 139 | return; |
733 | 139 | } |
734 | 438 | |
735 | 438 | if (s != ThisTokEnd) { |
736 | 96 | // Report an error if there are any. |
737 | 96 | PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, SuffixBegin - ThisTokBegin), |
738 | 96 | diag::err_invalid_suffix_constant) |
739 | 96 | << StringRef(SuffixBegin, ThisTokEnd - SuffixBegin) << isFPConstant; |
740 | 96 | hadError = true; |
741 | 96 | } |
742 | 438 | } |
743 | 6.65M | |
744 | 6.65M | if (6.65M !hadError6.65M && saw_fixed_point_suffix6.65M ) { |
745 | 401 | assert(isFract || isAccum); |
746 | 401 | } |
747 | 6.65M | } |
748 | | |
749 | | /// ParseDecimalOrOctalCommon - This method is called for decimal or octal |
750 | | /// numbers. It issues an error for illegal digits, and handles floating point |
751 | | /// parsing. If it detects a floating point number, the radix is set to 10. |
752 | 711k | void NumericLiteralParser::ParseDecimalOrOctalCommon(SourceLocation TokLoc){ |
753 | 711k | assert((radix == 8 || radix == 10) && "Unexpected radix"); |
754 | 711k | |
755 | 711k | // If we have a hex digit other than 'e' (which denotes a FP exponent) then |
756 | 711k | // the code is using an incorrect base. |
757 | 711k | if (isHexDigit(*s) && *s != 'e'1.11k && *s != 'E'42 && |
758 | 711k | !isValidUDSuffix(PP.getLangOpts(), StringRef(s, ThisTokEnd - s))10 ) { |
759 | 8 | PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin), |
760 | 8 | diag::err_invalid_digit) << StringRef(s, 1) << (radix == 8 ? 17 : 01 ); |
761 | 8 | hadError = true; |
762 | 8 | return; |
763 | 8 | } |
764 | 711k | |
765 | 711k | if (*s == '.') { |
766 | 241k | checkSeparator(TokLoc, s, CSK_AfterDigits); |
767 | 241k | s++; |
768 | 241k | radix = 10; |
769 | 241k | saw_period = true; |
770 | 241k | checkSeparator(TokLoc, s, CSK_BeforeDigits); |
771 | 241k | s = SkipDigits(s); // Skip suffix. |
772 | 241k | } |
773 | 711k | if (*s == 'e' || *s == 'E'692k ) { // exponent |
774 | 19.5k | checkSeparator(TokLoc, s, CSK_AfterDigits); |
775 | 19.5k | const char *Exponent = s; |
776 | 19.5k | s++; |
777 | 19.5k | radix = 10; |
778 | 19.5k | saw_exponent = true; |
779 | 19.5k | if (s != ThisTokEnd && (19.5k *s == '+'19.5k || *s == '-'16.8k )) s++18.4k ; // sign |
780 | 19.5k | const char *first_non_digit = SkipDigits(s); |
781 | 19.5k | if (containsDigits(s, first_non_digit)) { |
782 | 19.5k | checkSeparator(TokLoc, s, CSK_BeforeDigits); |
783 | 19.5k | s = first_non_digit; |
784 | 19.5k | } else { |
785 | 4 | if (!hadError) { |
786 | 3 | PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin), |
787 | 3 | diag::err_exponent_has_no_digits); |
788 | 3 | hadError = true; |
789 | 3 | } |
790 | 4 | return; |
791 | 4 | } |
792 | 19.5k | } |
793 | 711k | } |
794 | | |
795 | | /// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved |
796 | | /// suffixes as ud-suffixes, because the diagnostic experience is better if we |
797 | | /// treat it as an invalid suffix. |
798 | | bool NumericLiteralParser::isValidUDSuffix(const LangOptions &LangOpts, |
799 | 1.99k | StringRef Suffix) { |
800 | 1.99k | if (!LangOpts.CPlusPlus11 || Suffix.empty()1.71k ) |
801 | 275 | return false; |
802 | 1.71k | |
803 | 1.71k | // By C++11 [lex.ext]p10, ud-suffixes starting with an '_' are always valid. |
804 | 1.71k | if (Suffix[0] == '_') |
805 | 87 | return true; |
806 | 1.63k | |
807 | 1.63k | // In C++11, there are no library suffixes. |
808 | 1.63k | if (!LangOpts.CPlusPlus14) |
809 | 22 | return false; |
810 | 1.60k | |
811 | 1.60k | // In C++14, "s", "h", "min", "ms", "us", and "ns" are used in the library. |
812 | 1.60k | // Per tweaked N3660, "il", "i", and "if" are also used in the library. |
813 | 1.60k | // In C++2a "d" and "y" are used in the library. |
814 | 1.60k | return llvm::StringSwitch<bool>(Suffix) |
815 | 1.60k | .Cases("h", "min", "s", true) |
816 | 1.60k | .Cases("ms", "us", "ns", true) |
817 | 1.60k | .Cases("il", "i", "if", true) |
818 | 1.60k | .Cases("d", "y", LangOpts.CPlusPlus2a) |
819 | 1.60k | .Default(false); |
820 | 1.60k | } |
821 | | |
822 | | void NumericLiteralParser::checkSeparator(SourceLocation TokLoc, |
823 | | const char *Pos, |
824 | 7.17M | CheckSeparatorKind IsAfterDigits) { |
825 | 7.17M | if (IsAfterDigits == CSK_AfterDigits) { |
826 | 6.91M | if (Pos == ThisTokBegin) |
827 | 1.09k | return; |
828 | 6.91M | --Pos; |
829 | 6.91M | } else if (261k Pos == ThisTokEnd261k ) |
830 | 7.50k | return; |
831 | 7.17M | |
832 | 7.17M | if (isDigitSeparator(*Pos)) { |
833 | 20 | PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Pos - ThisTokBegin), |
834 | 20 | diag::err_digit_separator_not_between_digits) |
835 | 20 | << IsAfterDigits; |
836 | 20 | hadError = true; |
837 | 20 | } |
838 | 7.17M | } |
839 | | |
840 | | /// ParseNumberStartingWithZero - This method is called when the first character |
841 | | /// of the number is found to be a zero. This means it is either an octal |
842 | | /// number (like '04') or a hex number ('0x123a') a binary number ('0b1010') or |
843 | | /// a floating point number (01239.123e4). Eat the prefix, determining the |
844 | | /// radix etc. |
845 | 902k | void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) { |
846 | 902k | assert(s[0] == '0' && "Invalid method call"); |
847 | 902k | s++; |
848 | 902k | |
849 | 902k | int c1 = s[0]; |
850 | 902k | |
851 | 902k | // Handle a hex number like 0x1234. |
852 | 902k | if ((c1 == 'x' || c1 == 'X'532k ) && (370k isHexDigit(s[1])370k || s[1] == '.'14 )) { |
853 | 370k | s++; |
854 | 370k | assert(s < ThisTokEnd && "didn't maximally munch?"); |
855 | 370k | radix = 16; |
856 | 370k | DigitsBegin = s; |
857 | 370k | s = SkipHexDigits(s); |
858 | 370k | bool HasSignificandDigits = containsDigits(DigitsBegin, s); |
859 | 370k | if (s == ThisTokEnd) { |
860 | 288k | // Done. |
861 | 288k | } else if (81.7k *s == '.'81.7k ) { |
862 | 58 | s++; |
863 | 58 | saw_period = true; |
864 | 58 | const char *floatDigitsBegin = s; |
865 | 58 | s = SkipHexDigits(s); |
866 | 58 | if (containsDigits(floatDigitsBegin, s)) |
867 | 49 | HasSignificandDigits = true; |
868 | 58 | if (HasSignificandDigits) |
869 | 53 | checkSeparator(TokLoc, floatDigitsBegin, CSK_BeforeDigits); |
870 | 58 | } |
871 | 370k | |
872 | 370k | if (!HasSignificandDigits) { |
873 | 5 | PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin), |
874 | 5 | diag::err_hex_constant_requires) |
875 | 5 | << PP.getLangOpts().CPlusPlus << 1; |
876 | 5 | hadError = true; |
877 | 5 | return; |
878 | 5 | } |
879 | 370k | |
880 | 370k | // A binary exponent can appear with or with a '.'. If dotted, the |
881 | 370k | // binary exponent is required. |
882 | 370k | if (*s == 'p' || *s == 'P'370k ) { |
883 | 82 | checkSeparator(TokLoc, s, CSK_AfterDigits); |
884 | 82 | const char *Exponent = s; |
885 | 82 | s++; |
886 | 82 | saw_exponent = true; |
887 | 82 | if (s != ThisTokEnd && (81 *s == '+'81 || *s == '-'61 )) s++40 ; // sign |
888 | 82 | const char *first_non_digit = SkipDigits(s); |
889 | 82 | if (!containsDigits(s, first_non_digit)) { |
890 | 2 | if (!hadError) { |
891 | 1 | PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin), |
892 | 1 | diag::err_exponent_has_no_digits); |
893 | 1 | hadError = true; |
894 | 1 | } |
895 | 2 | return; |
896 | 2 | } |
897 | 80 | checkSeparator(TokLoc, s, CSK_BeforeDigits); |
898 | 80 | s = first_non_digit; |
899 | 80 | |
900 | 80 | if (!PP.getLangOpts().HexFloats) |
901 | 21 | PP.Diag(TokLoc, PP.getLangOpts().CPlusPlus |
902 | 21 | ? diag::ext_hex_literal_invalid18 |
903 | 21 | : diag::ext_hex_constant_invalid3 ); |
904 | 59 | else if (PP.getLangOpts().CPlusPlus17) |
905 | 4 | PP.Diag(TokLoc, diag::warn_cxx17_hex_literal); |
906 | 370k | } else if (saw_period) { |
907 | 2 | PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin), |
908 | 2 | diag::err_hex_constant_requires) |
909 | 2 | << PP.getLangOpts().CPlusPlus << 0; |
910 | 2 | hadError = true; |
911 | 2 | } |
912 | 370k | return370k ; |
913 | 532k | } |
914 | 532k | |
915 | 532k | // Handle simple binary numbers 0b01010 |
916 | 532k | if ((c1 == 'b' || c1 == 'B'532k ) && (32 s[1] == '0'32 || s[1] == '1'23 )) { |
917 | 28 | // 0b101010 is a C++1y / GCC extension. |
918 | 28 | PP.Diag(TokLoc, |
919 | 28 | PP.getLangOpts().CPlusPlus14 |
920 | 28 | ? diag::warn_cxx11_compat_binary_literal15 |
921 | 28 | : PP.getLangOpts().CPlusPlus |
922 | 13 | ? diag::ext_binary_literal_cxx146 |
923 | 13 | : diag::ext_binary_literal7 ); |
924 | 28 | ++s; |
925 | 28 | assert(s < ThisTokEnd && "didn't maximally munch?"); |
926 | 28 | radix = 2; |
927 | 28 | DigitsBegin = s; |
928 | 28 | s = SkipBinaryDigits(s); |
929 | 28 | if (s == ThisTokEnd) { |
930 | 21 | // Done. |
931 | 21 | } else if (7 isHexDigit(*s)7 && |
932 | 7 | !isValidUDSuffix(PP.getLangOpts(), |
933 | 3 | StringRef(s, ThisTokEnd - s))) { |
934 | 2 | PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin), |
935 | 2 | diag::err_invalid_digit) << StringRef(s, 1) << 2; |
936 | 2 | hadError = true; |
937 | 2 | } |
938 | 28 | // Other suffixes will be diagnosed by the caller. |
939 | 28 | return; |
940 | 28 | } |
941 | 532k | |
942 | 532k | // For now, the radix is set to 8. If we discover that we have a |
943 | 532k | // floating point constant, the radix will change to 10. Octal floating |
944 | 532k | // point constants are not permitted (only decimal and hexadecimal). |
945 | 532k | radix = 8; |
946 | 532k | DigitsBegin = s; |
947 | 532k | s = SkipOctalDigits(s); |
948 | 532k | if (s == ThisTokEnd) |
949 | 384k | return; // Done, simple octal number like 01234 |
950 | 148k | |
951 | 148k | // If we have some other non-octal digit that *is* a decimal digit, see if |
952 | 148k | // this is part of a floating point number like 094.123 or 09e1. |
953 | 148k | if (isDigit(*s)) { |
954 | 3 | const char *EndDecimal = SkipDigits(s); |
955 | 3 | if (EndDecimal[0] == '.' || EndDecimal[0] == 'e'2 || EndDecimal[0] == 'E'2 ) { |
956 | 1 | s = EndDecimal; |
957 | 1 | radix = 10; |
958 | 1 | } |
959 | 3 | } |
960 | 148k | |
961 | 148k | ParseDecimalOrOctalCommon(TokLoc); |
962 | 148k | } |
963 | | |
964 | 6.41M | static bool alwaysFitsInto64Bits(unsigned Radix, unsigned NumDigits) { |
965 | 6.41M | switch (Radix) { |
966 | 6.41M | case 2: |
967 | 25 | return NumDigits <= 64; |
968 | 6.41M | case 8: |
969 | 400k | return NumDigits <= 64 / 3; // Digits are groups of 3 bits. |
970 | 6.41M | case 10: |
971 | 5.64M | return NumDigits <= 19; // floor(log10(2^64)) |
972 | 6.41M | case 16: |
973 | 370k | return NumDigits <= 64 / 4; // Digits are groups of 4 bits. |
974 | 6.41M | default: |
975 | 0 | llvm_unreachable("impossible Radix"); |
976 | 6.41M | } |
977 | 6.41M | } |
978 | | |
979 | | /// GetIntegerValue - Convert this numeric literal value to an APInt that |
980 | | /// matches Val's input width. If there is an overflow, set Val to the low bits |
981 | | /// of the result and return true. Otherwise, return false. |
982 | 6.41M | bool NumericLiteralParser::GetIntegerValue(llvm::APInt &Val) { |
983 | 6.41M | // Fast path: Compute a conservative bound on the maximum number of |
984 | 6.41M | // bits per digit in this radix. If we can't possibly overflow a |
985 | 6.41M | // uint64 based on that bound then do the simple conversion to |
986 | 6.41M | // integer. This avoids the expensive overflow checking below, and |
987 | 6.41M | // handles the common cases that matter (small decimal integers and |
988 | 6.41M | // hex/octal values which don't overflow). |
989 | 6.41M | const unsigned NumDigits = SuffixBegin - DigitsBegin; |
990 | 6.41M | if (alwaysFitsInto64Bits(radix, NumDigits)) { |
991 | 6.41M | uint64_t N = 0; |
992 | 21.2M | for (const char *Ptr = DigitsBegin; Ptr != SuffixBegin; ++Ptr14.8M ) |
993 | 14.8M | if (!isDigitSeparator(*Ptr)) |
994 | 14.8M | N = N * radix + llvm::hexDigitValue(*Ptr); |
995 | 6.41M | |
996 | 6.41M | // This will truncate the value to Val's input width. Simply check |
997 | 6.41M | // for overflow by comparing. |
998 | 6.41M | Val = N; |
999 | 6.41M | return Val.getZExtValue() != N; |
1000 | 6.41M | } |
1001 | 245 | |
1002 | 245 | Val = 0; |
1003 | 245 | const char *Ptr = DigitsBegin; |
1004 | 245 | |
1005 | 245 | llvm::APInt RadixVal(Val.getBitWidth(), radix); |
1006 | 245 | llvm::APInt CharVal(Val.getBitWidth(), 0); |
1007 | 245 | llvm::APInt OldVal = Val; |
1008 | 245 | |
1009 | 245 | bool OverflowOccurred = false; |
1010 | 5.18k | while (Ptr < SuffixBegin) { |
1011 | 4.93k | if (isDigitSeparator(*Ptr)) { |
1012 | 15 | ++Ptr; |
1013 | 15 | continue; |
1014 | 15 | } |
1015 | 4.92k | |
1016 | 4.92k | unsigned C = llvm::hexDigitValue(*Ptr++); |
1017 | 4.92k | |
1018 | 4.92k | // If this letter is out of bound for this radix, reject it. |
1019 | 4.92k | assert(C < radix && "NumericLiteralParser ctor should have rejected this"); |
1020 | 4.92k | |
1021 | 4.92k | CharVal = C; |
1022 | 4.92k | |
1023 | 4.92k | // Add the digit to the value in the appropriate radix. If adding in digits |
1024 | 4.92k | // made the value smaller, then this overflowed. |
1025 | 4.92k | OldVal = Val; |
1026 | 4.92k | |
1027 | 4.92k | // Multiply by radix, did overflow occur on the multiply? |
1028 | 4.92k | Val *= RadixVal; |
1029 | 4.92k | OverflowOccurred |= Val.udiv(RadixVal) != OldVal; |
1030 | 4.92k | |
1031 | 4.92k | // Add value, did overflow occur on the value? |
1032 | 4.92k | // (a + b) ult b <=> overflow |
1033 | 4.92k | Val += CharVal; |
1034 | 4.92k | OverflowOccurred |= Val.ult(CharVal); |
1035 | 4.92k | } |
1036 | 245 | return OverflowOccurred; |
1037 | 245 | } |
1038 | | |
1039 | | llvm::APFloat::opStatus |
1040 | 242k | NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) { |
1041 | 242k | using llvm::APFloat; |
1042 | 242k | |
1043 | 242k | unsigned n = std::min(SuffixBegin - ThisTokBegin, ThisTokEnd - ThisTokBegin); |
1044 | 242k | |
1045 | 242k | llvm::SmallString<16> Buffer; |
1046 | 242k | StringRef Str(ThisTokBegin, n); |
1047 | 242k | if (Str.find('\'') != StringRef::npos) { |
1048 | 4 | Buffer.reserve(n); |
1049 | 4 | std::remove_copy_if(Str.begin(), Str.end(), std::back_inserter(Buffer), |
1050 | 4 | &isDigitSeparator); |
1051 | 4 | Str = Buffer; |
1052 | 4 | } |
1053 | 242k | |
1054 | 242k | return Result.convertFromString(Str, APFloat::rmNearestTiesToEven); |
1055 | 242k | } |
1056 | | |
1057 | 211 | static inline bool IsExponentPart(char c) { |
1058 | 211 | return c == 'p' || c == 'P'177 || c == 'e'175 || c == 'E'158 ; |
1059 | 211 | } |
1060 | | |
1061 | 401 | bool NumericLiteralParser::GetFixedPointValue(llvm::APInt &StoreVal, unsigned Scale) { |
1062 | 401 | assert(radix == 16 || radix == 10); |
1063 | 401 | |
1064 | 401 | // Find how many digits are needed to store the whole literal. |
1065 | 401 | unsigned NumDigits = SuffixBegin - DigitsBegin; |
1066 | 401 | if (saw_period) --NumDigits375 ; |
1067 | 401 | |
1068 | 401 | // Initial scan of the exponent if it exists |
1069 | 401 | bool ExpOverflowOccurred = false; |
1070 | 401 | bool NegativeExponent = false; |
1071 | 401 | const char *ExponentBegin; |
1072 | 401 | uint64_t Exponent = 0; |
1073 | 401 | int64_t BaseShift = 0; |
1074 | 401 | if (saw_exponent) { |
1075 | 55 | const char *Ptr = DigitsBegin; |
1076 | 55 | |
1077 | 211 | while (!IsExponentPart(*Ptr)) ++Ptr156 ; |
1078 | 55 | ExponentBegin = Ptr; |
1079 | 55 | ++Ptr; |
1080 | 55 | NegativeExponent = *Ptr == '-'; |
1081 | 55 | if (NegativeExponent) ++Ptr19 ; |
1082 | 55 | |
1083 | 55 | unsigned NumExpDigits = SuffixBegin - Ptr; |
1084 | 55 | if (alwaysFitsInto64Bits(radix, NumExpDigits)) { |
1085 | 54 | llvm::StringRef ExpStr(Ptr, NumExpDigits); |
1086 | 54 | llvm::APInt ExpInt(/*numBits=*/64, ExpStr, /*radix=*/10); |
1087 | 54 | Exponent = ExpInt.getZExtValue(); |
1088 | 54 | } else { |
1089 | 1 | ExpOverflowOccurred = true; |
1090 | 1 | } |
1091 | 55 | |
1092 | 55 | if (NegativeExponent) BaseShift -= Exponent19 ; |
1093 | 36 | else BaseShift += Exponent; |
1094 | 55 | } |
1095 | 401 | |
1096 | 401 | // Number of bits needed for decimal literal is |
1097 | 401 | // ceil(NumDigits * log2(10)) Integral part |
1098 | 401 | // + Scale Fractional part |
1099 | 401 | // + ceil(Exponent * log2(10)) Exponent |
1100 | 401 | // -------------------------------------------------- |
1101 | 401 | // ceil((NumDigits + Exponent) * log2(10)) + Scale |
1102 | 401 | // |
1103 | 401 | // But for simplicity in handling integers, we can round up log2(10) to 4, |
1104 | 401 | // making: |
1105 | 401 | // 4 * (NumDigits + Exponent) + Scale |
1106 | 401 | // |
1107 | 401 | // Number of digits needed for hexadecimal literal is |
1108 | 401 | // 4 * NumDigits Integral part |
1109 | 401 | // + Scale Fractional part |
1110 | 401 | // + Exponent Exponent |
1111 | 401 | // -------------------------------------------------- |
1112 | 401 | // (4 * NumDigits) + Scale + Exponent |
1113 | 401 | uint64_t NumBitsNeeded; |
1114 | 401 | if (radix == 10) |
1115 | 365 | NumBitsNeeded = 4 * (NumDigits + Exponent) + Scale; |
1116 | 36 | else |
1117 | 36 | NumBitsNeeded = 4 * NumDigits + Exponent + Scale; |
1118 | 401 | |
1119 | 401 | if (NumBitsNeeded > std::numeric_limits<unsigned>::max()) |
1120 | 0 | ExpOverflowOccurred = true; |
1121 | 401 | llvm::APInt Val(static_cast<unsigned>(NumBitsNeeded), 0, /*isSigned=*/false); |
1122 | 401 | |
1123 | 401 | bool FoundDecimal = false; |
1124 | 401 | |
1125 | 401 | int64_t FractBaseShift = 0; |
1126 | 401 | const char *End = saw_exponent ? ExponentBegin55 : SuffixBegin346 ; |
1127 | 2.00k | for (const char *Ptr = DigitsBegin; Ptr < End; ++Ptr1.60k ) { |
1128 | 1.60k | if (*Ptr == '.') { |
1129 | 375 | FoundDecimal = true; |
1130 | 375 | continue; |
1131 | 375 | } |
1132 | 1.23k | |
1133 | 1.23k | // Normal reading of an integer |
1134 | 1.23k | unsigned C = llvm::hexDigitValue(*Ptr); |
1135 | 1.23k | assert(C < radix && "NumericLiteralParser ctor should have rejected this"); |
1136 | 1.23k | |
1137 | 1.23k | Val *= radix; |
1138 | 1.23k | Val += C; |
1139 | 1.23k | |
1140 | 1.23k | if (FoundDecimal) |
1141 | 784 | // Keep track of how much we will need to adjust this value by from the |
1142 | 784 | // number of digits past the radix point. |
1143 | 784 | --FractBaseShift; |
1144 | 1.23k | } |
1145 | 401 | |
1146 | 401 | // For a radix of 16, we will be multiplying by 2 instead of 16. |
1147 | 401 | if (radix == 16) FractBaseShift *= 436 ; |
1148 | 401 | BaseShift += FractBaseShift; |
1149 | 401 | |
1150 | 401 | Val <<= Scale; |
1151 | 401 | |
1152 | 401 | uint64_t Base = (radix == 16) ? 236 : 10365 ; |
1153 | 401 | if (BaseShift > 0) { |
1154 | 132 | for (int64_t i = 0; i < BaseShift; ++i124 ) { |
1155 | 124 | Val *= Base; |
1156 | 124 | } |
1157 | 393 | } else if (BaseShift < 0) { |
1158 | 1.45k | for (int64_t i = BaseShift; i < 0 && !Val.isNullValue()1.09k ; ++i1.07k ) |
1159 | 1.07k | Val = Val.udiv(Base); |
1160 | 378 | } |
1161 | 401 | |
1162 | 401 | bool IntOverflowOccurred = false; |
1163 | 401 | auto MaxVal = llvm::APInt::getMaxValue(StoreVal.getBitWidth()); |
1164 | 401 | if (Val.getBitWidth() > StoreVal.getBitWidth()) { |
1165 | 185 | IntOverflowOccurred |= Val.ugt(MaxVal.zext(Val.getBitWidth())); |
1166 | 185 | StoreVal = Val.trunc(StoreVal.getBitWidth()); |
1167 | 216 | } else if (Val.getBitWidth() < StoreVal.getBitWidth()) { |
1168 | 199 | IntOverflowOccurred |= Val.zext(MaxVal.getBitWidth()).ugt(MaxVal); |
1169 | 199 | StoreVal = Val.zext(StoreVal.getBitWidth()); |
1170 | 199 | } else { |
1171 | 17 | StoreVal = Val; |
1172 | 17 | } |
1173 | 401 | |
1174 | 401 | return IntOverflowOccurred || ExpOverflowOccurred374 ; |
1175 | 401 | } |
1176 | | |
1177 | | /// \verbatim |
1178 | | /// user-defined-character-literal: [C++11 lex.ext] |
1179 | | /// character-literal ud-suffix |
1180 | | /// ud-suffix: |
1181 | | /// identifier |
1182 | | /// character-literal: [C++11 lex.ccon] |
1183 | | /// ' c-char-sequence ' |
1184 | | /// u' c-char-sequence ' |
1185 | | /// U' c-char-sequence ' |
1186 | | /// L' c-char-sequence ' |
1187 | | /// u8' c-char-sequence ' [C++1z lex.ccon] |
1188 | | /// c-char-sequence: |
1189 | | /// c-char |
1190 | | /// c-char-sequence c-char |
1191 | | /// c-char: |
1192 | | /// any member of the source character set except the single-quote ', |
1193 | | /// backslash \, or new-line character |
1194 | | /// escape-sequence |
1195 | | /// universal-character-name |
1196 | | /// escape-sequence: |
1197 | | /// simple-escape-sequence |
1198 | | /// octal-escape-sequence |
1199 | | /// hexadecimal-escape-sequence |
1200 | | /// simple-escape-sequence: |
1201 | | /// one of \' \" \? \\ \a \b \f \n \r \t \v |
1202 | | /// octal-escape-sequence: |
1203 | | /// \ octal-digit |
1204 | | /// \ octal-digit octal-digit |
1205 | | /// \ octal-digit octal-digit octal-digit |
1206 | | /// hexadecimal-escape-sequence: |
1207 | | /// \x hexadecimal-digit |
1208 | | /// hexadecimal-escape-sequence hexadecimal-digit |
1209 | | /// universal-character-name: [C++11 lex.charset] |
1210 | | /// \u hex-quad |
1211 | | /// \U hex-quad hex-quad |
1212 | | /// hex-quad: |
1213 | | /// hex-digit hex-digit hex-digit hex-digit |
1214 | | /// \endverbatim |
1215 | | /// |
1216 | | CharLiteralParser::CharLiteralParser(const char *begin, const char *end, |
1217 | | SourceLocation Loc, Preprocessor &PP, |
1218 | 78.1k | tok::TokenKind kind) { |
1219 | 78.1k | // At this point we know that the character matches the regex "(L|u|U)?'.*'". |
1220 | 78.1k | HadError = false; |
1221 | 78.1k | |
1222 | 78.1k | Kind = kind; |
1223 | 78.1k | |
1224 | 78.1k | const char *TokBegin = begin; |
1225 | 78.1k | |
1226 | 78.1k | // Skip over wide character determinant. |
1227 | 78.1k | if (Kind != tok::char_constant) |
1228 | 1.31k | ++begin; |
1229 | 78.1k | if (Kind == tok::utf8_char_constant) |
1230 | 24 | ++begin; |
1231 | 78.1k | |
1232 | 78.1k | // Skip over the entry quote. |
1233 | 78.1k | assert(begin[0] == '\'' && "Invalid token lexed"); |
1234 | 78.1k | ++begin; |
1235 | 78.1k | |
1236 | 78.1k | // Remove an optional ud-suffix. |
1237 | 78.1k | if (end[-1] != '\'') { |
1238 | 44 | const char *UDSuffixEnd = end; |
1239 | 172 | do { |
1240 | 172 | --end; |
1241 | 172 | } while (end[-1] != '\''); |
1242 | 44 | // FIXME: Don't bother with this if !tok.hasUCN(). |
1243 | 44 | expandUCNs(UDSuffixBuf, StringRef(end, UDSuffixEnd - end)); |
1244 | 44 | UDSuffixOffset = end - TokBegin; |
1245 | 44 | } |
1246 | 78.1k | |
1247 | 78.1k | // Trim the ending quote. |
1248 | 78.1k | assert(end != begin && "Invalid token lexed"); |
1249 | 78.1k | --end; |
1250 | 78.1k | |
1251 | 78.1k | // FIXME: The "Value" is an uint64_t so we can handle char literals of |
1252 | 78.1k | // up to 64-bits. |
1253 | 78.1k | // FIXME: This extensively assumes that 'char' is 8-bits. |
1254 | 78.1k | assert(PP.getTargetInfo().getCharWidth() == 8 && |
1255 | 78.1k | "Assumes char is 8 bits"); |
1256 | 78.1k | assert(PP.getTargetInfo().getIntWidth() <= 64 && |
1257 | 78.1k | (PP.getTargetInfo().getIntWidth() & 7) == 0 && |
1258 | 78.1k | "Assumes sizeof(int) on target is <= 64 and a multiple of char"); |
1259 | 78.1k | assert(PP.getTargetInfo().getWCharWidth() <= 64 && |
1260 | 78.1k | "Assumes sizeof(wchar) on target is <= 64"); |
1261 | 78.1k | |
1262 | 78.1k | SmallVector<uint32_t, 4> codepoint_buffer; |
1263 | 78.1k | codepoint_buffer.resize(end - begin); |
1264 | 78.1k | uint32_t *buffer_begin = &codepoint_buffer.front(); |
1265 | 78.1k | uint32_t *buffer_end = buffer_begin + codepoint_buffer.size(); |
1266 | 78.1k | |
1267 | 78.1k | // Unicode escapes representing characters that cannot be correctly |
1268 | 78.1k | // represented in a single code unit are disallowed in character literals |
1269 | 78.1k | // by this implementation. |
1270 | 78.1k | uint32_t largest_character_for_kind; |
1271 | 78.1k | if (tok::wide_char_constant == Kind) { |
1272 | 1.14k | largest_character_for_kind = |
1273 | 1.14k | 0xFFFFFFFFu >> (32-PP.getTargetInfo().getWCharWidth()); |
1274 | 76.9k | } else if (tok::utf8_char_constant == Kind) { |
1275 | 24 | largest_character_for_kind = 0x7F; |
1276 | 76.9k | } else if (tok::utf16_char_constant == Kind) { |
1277 | 75 | largest_character_for_kind = 0xFFFF; |
1278 | 76.8k | } else if (tok::utf32_char_constant == Kind) { |
1279 | 66 | largest_character_for_kind = 0x10FFFF; |
1280 | 76.8k | } else { |
1281 | 76.8k | largest_character_for_kind = 0x7Fu; |
1282 | 76.8k | } |
1283 | 78.1k | |
1284 | 156k | while (begin != end) { |
1285 | 78.1k | // Is this a span of non-escape characters? |
1286 | 78.1k | if (begin[0] != '\\') { |
1287 | 59.1k | char const *start = begin; |
1288 | 59.6k | do { |
1289 | 59.6k | ++begin; |
1290 | 59.6k | } while (begin != end && *begin != '\\'561 ); |
1291 | 59.1k | |
1292 | 59.1k | char const *tmp_in_start = start; |
1293 | 59.1k | uint32_t *tmp_out_start = buffer_begin; |
1294 | 59.1k | llvm::ConversionResult res = |
1295 | 59.1k | llvm::ConvertUTF8toUTF32(reinterpret_cast<llvm::UTF8 const **>(&start), |
1296 | 59.1k | reinterpret_cast<llvm::UTF8 const *>(begin), |
1297 | 59.1k | &buffer_begin, buffer_end, llvm::strictConversion); |
1298 | 59.1k | if (res != llvm::conversionOK) { |
1299 | 7 | // If we see bad encoding for unprefixed character literals, warn and |
1300 | 7 | // simply copy the byte values, for compatibility with gcc and |
1301 | 7 | // older versions of clang. |
1302 | 7 | bool NoErrorOnBadEncoding = isAscii(); |
1303 | 7 | unsigned Msg = diag::err_bad_character_encoding; |
1304 | 7 | if (NoErrorOnBadEncoding) |
1305 | 4 | Msg = diag::warn_bad_character_encoding; |
1306 | 7 | PP.Diag(Loc, Msg); |
1307 | 7 | if (NoErrorOnBadEncoding) { |
1308 | 4 | start = tmp_in_start; |
1309 | 4 | buffer_begin = tmp_out_start; |
1310 | 9 | for (; start != begin; ++start, ++buffer_begin5 ) |
1311 | 5 | *buffer_begin = static_cast<uint8_t>(*start); |
1312 | 4 | } else { |
1313 | 3 | HadError = true; |
1314 | 3 | } |
1315 | 59.1k | } else { |
1316 | 118k | for (; tmp_out_start < buffer_begin; ++tmp_out_start59.6k ) { |
1317 | 59.6k | if (*tmp_out_start > largest_character_for_kind) { |
1318 | 13 | HadError = true; |
1319 | 13 | PP.Diag(Loc, diag::err_character_too_large); |
1320 | 13 | } |
1321 | 59.6k | } |
1322 | 59.1k | } |
1323 | 59.1k | |
1324 | 59.1k | continue; |
1325 | 59.1k | } |
1326 | 19.0k | // Is this a Universal Character Name escape? |
1327 | 19.0k | if (begin[1] == 'u' || begin[1] == 'U'18.9k ) { |
1328 | 105 | unsigned short UcnLen = 0; |
1329 | 105 | if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen, |
1330 | 105 | FullSourceLoc(Loc, PP.getSourceManager()), |
1331 | 105 | &PP.getDiagnostics(), PP.getLangOpts(), true)) { |
1332 | 29 | HadError = true; |
1333 | 76 | } else if (*buffer_begin > largest_character_for_kind) { |
1334 | 8 | HadError = true; |
1335 | 8 | PP.Diag(Loc, diag::err_character_too_large); |
1336 | 8 | } |
1337 | 105 | |
1338 | 105 | ++buffer_begin; |
1339 | 105 | continue; |
1340 | 105 | } |
1341 | 18.9k | unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo()); |
1342 | 18.9k | uint64_t result = |
1343 | 18.9k | ProcessCharEscape(TokBegin, begin, end, HadError, |
1344 | 18.9k | FullSourceLoc(Loc,PP.getSourceManager()), |
1345 | 18.9k | CharWidth, &PP.getDiagnostics(), PP.getLangOpts()); |
1346 | 18.9k | *buffer_begin++ = result; |
1347 | 18.9k | } |
1348 | 78.1k | |
1349 | 78.1k | unsigned NumCharsSoFar = buffer_begin - &codepoint_buffer.front(); |
1350 | 78.1k | |
1351 | 78.1k | if (NumCharsSoFar > 1) { |
1352 | 175 | if (isWide()) |
1353 | 17 | PP.Diag(Loc, diag::warn_extraneous_char_constant); |
1354 | 158 | else if (isAscii() && NumCharsSoFar == 4153 ) |
1355 | 127 | PP.Diag(Loc, diag::ext_four_char_character_literal); |
1356 | 31 | else if (isAscii()) |
1357 | 26 | PP.Diag(Loc, diag::ext_multichar_character_literal); |
1358 | 5 | else |
1359 | 5 | PP.Diag(Loc, diag::err_multichar_utf_character_literal); |
1360 | 175 | IsMultiChar = true; |
1361 | 77.9k | } else { |
1362 | 77.9k | IsMultiChar = false; |
1363 | 77.9k | } |
1364 | 78.1k | |
1365 | 78.1k | llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), 0); |
1366 | 78.1k | |
1367 | 78.1k | // Narrow character literals act as though their value is concatenated |
1368 | 78.1k | // in this implementation, but warn on overflow. |
1369 | 78.1k | bool multi_char_too_long = false; |
1370 | 78.1k | if (isAscii() && isMultiChar()76.8k ) { |
1371 | 153 | LitVal = 0; |
1372 | 797 | for (size_t i = 0; i < NumCharsSoFar; ++i644 ) { |
1373 | 644 | // check for enough leading zeros to shift into |
1374 | 644 | multi_char_too_long |= (LitVal.countLeadingZeros() < 8); |
1375 | 644 | LitVal <<= 8; |
1376 | 644 | LitVal = LitVal + (codepoint_buffer[i] & 0xFF); |
1377 | 644 | } |
1378 | 77.9k | } else if (NumCharsSoFar > 0) { |
1379 | 77.9k | // otherwise just take the last character |
1380 | 77.9k | LitVal = buffer_begin[-1]; |
1381 | 77.9k | } |
1382 | 78.1k | |
1383 | 78.1k | if (!HadError && multi_char_too_long78.0k ) { |
1384 | 5 | PP.Diag(Loc, diag::warn_char_constant_too_large); |
1385 | 5 | } |
1386 | 78.1k | |
1387 | 78.1k | // Transfer the value from APInt to uint64_t |
1388 | 78.1k | Value = LitVal.getZExtValue(); |
1389 | 78.1k | |
1390 | 78.1k | // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1") |
1391 | 78.1k | // if 'char' is signed for this target (C99 6.4.4.4p10). Note that multiple |
1392 | 78.1k | // character constants are not sign extended in the this implementation: |
1393 | 78.1k | // '\xFF\xFF' = 65536 and '\x0\xFF' = 255, which matches GCC. |
1394 | 78.1k | if (isAscii() && NumCharsSoFar == 176.8k && (Value & 128)76.6k && |
1395 | 78.1k | PP.getLangOpts().CharIsSigned165 ) |
1396 | 143 | Value = (signed char)Value; |
1397 | 78.1k | } |
1398 | | |
1399 | | /// \verbatim |
1400 | | /// string-literal: [C++0x lex.string] |
1401 | | /// encoding-prefix " [s-char-sequence] " |
1402 | | /// encoding-prefix R raw-string |
1403 | | /// encoding-prefix: |
1404 | | /// u8 |
1405 | | /// u |
1406 | | /// U |
1407 | | /// L |
1408 | | /// s-char-sequence: |
1409 | | /// s-char |
1410 | | /// s-char-sequence s-char |
1411 | | /// s-char: |
1412 | | /// any member of the source character set except the double-quote ", |
1413 | | /// backslash \, or new-line character |
1414 | | /// escape-sequence |
1415 | | /// universal-character-name |
1416 | | /// raw-string: |
1417 | | /// " d-char-sequence ( r-char-sequence ) d-char-sequence " |
1418 | | /// r-char-sequence: |
1419 | | /// r-char |
1420 | | /// r-char-sequence r-char |
1421 | | /// r-char: |
1422 | | /// any member of the source character set, except a right parenthesis ) |
1423 | | /// followed by the initial d-char-sequence (which may be empty) |
1424 | | /// followed by a double quote ". |
1425 | | /// d-char-sequence: |
1426 | | /// d-char |
1427 | | /// d-char-sequence d-char |
1428 | | /// d-char: |
1429 | | /// any member of the basic source character set except: |
1430 | | /// space, the left parenthesis (, the right parenthesis ), |
1431 | | /// the backslash \, and the control characters representing horizontal |
1432 | | /// tab, vertical tab, form feed, and newline. |
1433 | | /// escape-sequence: [C++0x lex.ccon] |
1434 | | /// simple-escape-sequence |
1435 | | /// octal-escape-sequence |
1436 | | /// hexadecimal-escape-sequence |
1437 | | /// simple-escape-sequence: |
1438 | | /// one of \' \" \? \\ \a \b \f \n \r \t \v |
1439 | | /// octal-escape-sequence: |
1440 | | /// \ octal-digit |
1441 | | /// \ octal-digit octal-digit |
1442 | | /// \ octal-digit octal-digit octal-digit |
1443 | | /// hexadecimal-escape-sequence: |
1444 | | /// \x hexadecimal-digit |
1445 | | /// hexadecimal-escape-sequence hexadecimal-digit |
1446 | | /// universal-character-name: |
1447 | | /// \u hex-quad |
1448 | | /// \U hex-quad hex-quad |
1449 | | /// hex-quad: |
1450 | | /// hex-digit hex-digit hex-digit hex-digit |
1451 | | /// \endverbatim |
1452 | | /// |
1453 | | StringLiteralParser:: |
1454 | | StringLiteralParser(ArrayRef<Token> StringToks, |
1455 | | Preprocessor &PP, bool Complain) |
1456 | | : SM(PP.getSourceManager()), Features(PP.getLangOpts()), |
1457 | | Target(PP.getTargetInfo()), Diags(Complain ? &PP.getDiagnostics() :nullptr), |
1458 | | MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown), |
1459 | 3.11M | ResultPtr(ResultBuf.data()), hadError(false), Pascal(false) { |
1460 | 3.11M | init(StringToks); |
1461 | 3.11M | } |
1462 | | |
1463 | 3.26M | void StringLiteralParser::init(ArrayRef<Token> StringToks){ // Concatenates the pieces in StringToks into ResultBuf; failures are recorded in hadError.
1464 | 3.26M | // The literal token may have come from an invalid source location (e.g. due
1465 | 3.26M | // to a PCH error), in which case the token length will be 0.
1466 | 3.26M | if (StringToks.empty() || StringToks[0].getLength() < 2)
1467 | 0 | return DiagnoseLexingError(SourceLocation());
1468 | 3.26M | 
1469 | 3.26M | // Scan all of the string portions, remember the max individual token length,
1470 | 3.26M | // computing a bound on the concatenated string length, and see whether any
1471 | 3.26M | // piece is a wide-string. If any of the string portions is a wide-string
1472 | 3.26M | // literal, the result is a wide-string literal [C99 6.4.5p4].
1473 | 3.26M | assert(!StringToks.empty() && "expected at least one token"); // Already guaranteed by the early return above.
1474 | 3.26M | MaxTokenLength = StringToks[0].getLength();
1475 | 3.26M | assert(StringToks[0].getLength() >= 2 && "literal token is invalid!"); // Already guaranteed by the early return above.
1476 | 3.26M | SizeBound = StringToks[0].getLength()-2; // -2 for "".
1477 | 3.26M | Kind = StringToks[0].getKind();
1478 | 3.26M | 
1479 | 3.26M | hadError = false;
1480 | 3.26M | 
1481 | 3.26M | // Implement Translation Phase #6: concatenation of string literals
1482 | 3.26M | // (C99 5.1.1.2p1). The common case is only one string fragment.
1483 | 4.10M | for (unsigned i = 1; i != StringToks.size(); ++i839k ) {
1484 | 839k | if (StringToks[i].getLength() < 2)
1485 | 0 | return DiagnoseLexingError(StringToks[i].getLocation());
1486 | 839k | 
1487 | 839k | // The string could be shorter than this if it needs cleaning, but this is a
1488 | 839k | // reasonable bound, which is all we need.
1489 | 839k | assert(StringToks[i].getLength() >= 2 && "literal token is invalid!");
1490 | 839k | SizeBound += StringToks[i].getLength()-2; // -2 for "".
1491 | 839k | 
1492 | 839k | // Remember maximum string piece length.
1493 | 839k | if (StringToks[i].getLength() > MaxTokenLength)
1494 | 258k | MaxTokenLength = StringToks[i].getLength();
1495 | 839k | 
1496 | 839k | // Remember if we see any wide or utf-8/16/32 strings.
1497 | 839k | // Also check for illegal concatenations.
1498 | 839k | if (StringToks[i].isNot(Kind) && StringToks[i].isNot(tok::string_literal)79 ) {
1499 | 53 | if (isAscii()) {
1500 | 17 | Kind = StringToks[i].getKind(); // Adopt this piece's kind for the whole concatenated literal.
1501 | 36 | } else {
1502 | 36 | if (Diags)
1503 | 36 | Diags->Report(StringToks[i].getLocation(),
1504 | 36 | diag::err_unsupported_string_concat);
1505 | 36 | hadError = true; // Recorded even when Diags is null; scanning continues.
1506 | 36 | }
1507 | 53 | }
1508 | 839k | }
1509 | 3.26M | 
1510 | 3.26M | // Include space for the null terminator.
1511 | 3.26M | ++SizeBound;
1512 | 3.26M | 
1513 | 3.26M | // TODO: K&R warning: "traditional C rejects string constant concatenation"
1514 | 3.26M | 
1515 | 3.26M | // Get the width in bytes of char/wchar_t/char16_t/char32_t
1516 | 3.26M | CharByteWidth = getCharWidth(Kind, Target);
1517 | 3.26M | assert((CharByteWidth & 7) == 0 && "Assumes character size is byte multiple");
1518 | 3.26M | CharByteWidth /= 8; // Convert the width from bits to bytes.
1519 | 3.26M | 
1520 | 3.26M | // The output buffer size needs to be large enough to hold wide characters.
1521 | 3.26M | // This is a worst-case assumption which basically corresponds to L"" "long".
1522 | 3.26M | SizeBound *= CharByteWidth;
1523 | 3.26M | 
1524 | 3.26M | // Size the temporary buffer to hold the result string data.
1525 | 3.26M | ResultBuf.resize(SizeBound);
1526 | 3.26M | 
1527 | 3.26M | // Likewise, but for each string piece.
1528 | 3.26M | SmallString<512> TokenBuf;
1529 | 3.26M | TokenBuf.resize(MaxTokenLength);
1530 | 3.26M | 
1531 | 3.26M | // Loop over all the strings, getting their spelling, and expanding them to
1532 | 3.26M | // wide strings as appropriate.
1533 | 3.26M | ResultPtr = &ResultBuf[0]; // Next byte to fill in.
1534 | 3.26M | 
1535 | 3.26M | Pascal = false;
1536 | 3.26M | 
1537 | 3.26M | SourceLocation UDSuffixTokLoc; // Location of the first token that carried a ud-suffix.
1538 | 3.26M | 
1539 | 7.36M | for (unsigned i = 0, e = StringToks.size(); i != e; ++i4.10M ) {
1540 | 4.10M | const char *ThisTokBuf = &TokenBuf[0];
1541 | 4.10M | // Get the spelling of the token, which eliminates trigraphs, etc. We know
1542 | 4.10M | // that ThisTokBuf points to a buffer that is big enough for the whole token
1543 | 4.10M | // and 'spelled' tokens can only shrink.
1544 | 4.10M | bool StringInvalid = false;
1545 | 4.10M | unsigned ThisTokLen =
1546 | 4.10M | Lexer::getSpelling(StringToks[i], ThisTokBuf, SM, Features,
1547 | 4.10M | &StringInvalid);
1548 | 4.10M | if (StringInvalid)
1549 | 0 | return DiagnoseLexingError(StringToks[i].getLocation());
1550 | 4.10M | 
1551 | 4.10M | const char *ThisTokBegin = ThisTokBuf; // Kept fixed; passed on as the token start when copying fragments and decoding escapes.
1552 | 4.10M | const char *ThisTokEnd = ThisTokBuf+ThisTokLen;
1553 | 4.10M | 
1554 | 4.10M | // Remove an optional ud-suffix.
1555 | 4.10M | if (ThisTokEnd[-1] != '"') {
1556 | 1.34k | const char *UDSuffixEnd = ThisTokEnd;
1557 | 2.89k | do {
1558 | 2.89k | --ThisTokEnd;
1559 | 2.89k | } while (ThisTokEnd[-1] != '"'); // Back up until ThisTokEnd sits just past the closing quote.
1560 | 1.34k | 
1561 | 1.34k | StringRef UDSuffix(ThisTokEnd, UDSuffixEnd - ThisTokEnd);
1562 | 1.34k | 
1563 | 1.34k | if (UDSuffixBuf.empty()) {
1564 | 1.32k | if (StringToks[i].hasUCN())
1565 | 8 | expandUCNs(UDSuffixBuf, UDSuffix);
1566 | 1.31k | else
1567 | 1.31k | UDSuffixBuf.assign(UDSuffix);
1568 | 1.32k | UDSuffixToken = i;
1569 | 1.32k | UDSuffixOffset = ThisTokEnd - ThisTokBuf; // Offset of the suffix within this token's spelling.
1570 | 1.32k | UDSuffixTokLoc = StringToks[i].getLocation();
1571 | 1.32k | } else {
1572 | 18 | SmallString<32> ExpandedUDSuffix;
1573 | 18 | if (StringToks[i].hasUCN()) {
1574 | 9 | expandUCNs(ExpandedUDSuffix, UDSuffix);
1575 | 9 | UDSuffix = ExpandedUDSuffix;
1576 | 9 | }
1577 | 18 | 
1578 | 18 | // C++11 [lex.ext]p8: At the end of phase 6, if a string literal is the
1579 | 18 | // result of a concatenation involving at least one user-defined-string-
1580 | 18 | // literal, all the participating user-defined-string-literals shall
1581 | 18 | // have the same ud-suffix.
1582 | 18 | if (UDSuffixBuf != UDSuffix) {
1583 | 6 | if (Diags) {
1584 | 6 | SourceLocation TokLoc = StringToks[i].getLocation();
1585 | 6 | Diags->Report(TokLoc, diag::err_string_concat_mixed_suffix)
1586 | 6 | << UDSuffixBuf << UDSuffix
1587 | 6 | << SourceRange(UDSuffixTokLoc, UDSuffixTokLoc)
1588 | 6 | << SourceRange(TokLoc, TokLoc);
1589 | 6 | }
1590 | 6 | hadError = true;
1591 | 6 | }
1592 | 18 | }
1593 | 1.34k | }
1594 | 4.10M | 
1595 | 4.10M | // Strip the end quote.
1596 | 4.10M | --ThisTokEnd;
1597 | 4.10M | 
1598 | 4.10M | // TODO: Input character set mapping support.
1599 | 4.10M | 
1600 | 4.10M | // Skip marker for wide or unicode strings.
1601 | 4.10M | if (ThisTokBuf[0] == 'L' || ThisTokBuf[0] == 'u'4.10M || ThisTokBuf[0] == 'U'4.10M ) {
1602 | 1.64k | ++ThisTokBuf;
1603 | 1.64k | // Skip 8 of u8 marker for utf8 strings.
1604 | 1.64k | if (ThisTokBuf[0] == '8')
1605 | 112 | ++ThisTokBuf;
1606 | 1.64k | }
1607 | 4.10M | 
1608 | 4.10M | // Check for raw string
1609 | 4.10M | if (ThisTokBuf[0] == 'R') {
1610 | 101 | ThisTokBuf += 2; // skip R"
1611 | 101 | 
1612 | 101 | const char *Prefix = ThisTokBuf; // Start of the delimiter that precedes '('.
1613 | 231 | while (ThisTokBuf[0] != '(')
1614 | 130 | ++ThisTokBuf;
1615 | 101 | ++ThisTokBuf; // skip '('
1616 | 101 | 
1617 | 101 | // Remove same number of characters from the end
1618 | 101 | ThisTokEnd -= ThisTokBuf - Prefix;
1619 | 101 | assert(ThisTokEnd >= ThisTokBuf && "malformed raw string literal");
1620 | 101 | 
1621 | 101 | // C++14 [lex.string]p4: A source-file new-line in a raw string literal
1622 | 101 | // results in a new-line in the resulting execution string-literal.
1623 | 101 | StringRef RemainingTokenSpan(ThisTokBuf, ThisTokEnd - ThisTokBuf);
1624 | 199 | while (!RemainingTokenSpan.empty()) {
1625 | 98 | // Split the string literal on \r\n boundaries.
1626 | 98 | size_t CRLFPos = RemainingTokenSpan.find("\r\n");
1627 | 98 | StringRef BeforeCRLF = RemainingTokenSpan.substr(0, CRLFPos);
1628 | 98 | StringRef AfterCRLF = RemainingTokenSpan.substr(CRLFPos);
1629 | 98 | 
1630 | 98 | // Copy everything before the \r\n sequence into the string literal.
1631 | 98 | if (CopyStringFragment(StringToks[i], ThisTokBegin, BeforeCRLF))
1632 | 6 | hadError = true;
1633 | 98 | 
1634 | 98 | // Point into the \n inside the \r\n sequence and operate on the
1635 | 98 | // remaining portion of the literal.
1636 | 98 | RemainingTokenSpan = AfterCRLF.substr(1);
1637 | 98 | }
1638 | 4.10M | } else {
1639 | 4.10M | if (ThisTokBuf[0] != '"') {
1640 | 0 | // The file may have come from PCH and then changed after loading the
1641 | 0 | // PCH; Fail gracefully.
1642 | 0 | return DiagnoseLexingError(StringToks[i].getLocation());
1643 | 0 | }
1644 | 4.10M | ++ThisTokBuf; // skip "
1645 | 4.10M | 
1646 | 4.10M | // Check if this is a pascal string
1647 | 4.10M | if (Features.PascalStrings && ThisTokBuf + 1 != ThisTokEnd96 &&
1648 | 4.10M | ThisTokBuf[0] == '\\'96 && ThisTokBuf[1] == 'p'15 ) {
1649 | 15 | 
1650 | 15 | // If the \p sequence is found in the first token, we have a pascal string
1651 | 15 | // Otherwise, if we already have a pascal string, ignore the first \p
1652 | 15 | if (i == 0) {
1653 | 15 | ++ThisTokBuf; // Skip only the backslash: the 'p' is copied as data and its slot is overwritten with the length below.
1654 | 15 | Pascal = true;
1655 | 15 | } else if (0 Pascal0 )
1656 | 0 | ThisTokBuf += 2; // Skip both characters of the escape.
1657 | 15 | }
1658 | 4.10M | 
1659 | 8.30M | while (ThisTokBuf != ThisTokEnd) {
1660 | 4.19M | // Is this a span of non-escape characters?
1661 | 4.19M | if (ThisTokBuf[0] != '\\') {
1662 | 4.07M | const char *InStart = ThisTokBuf;
1663 | 49.4M | do {
1664 | 49.4M | ++ThisTokBuf;
1665 | 49.4M | } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\'45.4M );
1666 | 4.07M | 
1667 | 4.07M | // Copy the character span over.
1668 | 4.07M | if (CopyStringFragment(StringToks[i], ThisTokBegin,
1669 | 4.07M | StringRef(InStart, ThisTokBuf - InStart)))
1670 | 8 | hadError = true;
1671 | 4.07M | continue;
1672 | 4.07M | }
1673 | 119k | // Is this a Universal Character Name escape?
1674 | 119k | if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U'119k ) {
1675 | 265 | EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
1676 | 265 | ResultPtr, hadError,
1677 | 265 | FullSourceLoc(StringToks[i].getLocation(), SM),
1678 | 265 | CharByteWidth, Diags, Features);
1679 | 265 | continue;
1680 | 265 | }
1681 | 119k | // Otherwise, this is a non-UCN escape character. Process it.
1682 | 119k | unsigned ResultChar =
1683 | 119k | ProcessCharEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError,
1684 | 119k | FullSourceLoc(StringToks[i].getLocation(), SM),
1685 | 119k | CharByteWidth*8, Diags, Features);
1686 | 119k | 
1687 | 119k | if (CharByteWidth == 4) {
1688 | 251 | // FIXME: Make the type of the result buffer correct instead of
1689 | 251 | // using reinterpret_cast.
1690 | 251 | llvm::UTF32 *ResultWidePtr = reinterpret_cast<llvm::UTF32*>(ResultPtr);
1691 | 251 | *ResultWidePtr = ResultChar;
1692 | 251 | ResultPtr += 4;
1693 | 118k | } else if (CharByteWidth == 2) {
1694 | 24 | // FIXME: Make the type of the result buffer correct instead of
1695 | 24 | // using reinterpret_cast.
1696 | 24 | llvm::UTF16 *ResultWidePtr = reinterpret_cast<llvm::UTF16*>(ResultPtr);
1697 | 24 | *ResultWidePtr = ResultChar & 0xFFFF;
1698 | 24 | ResultPtr += 2;
1699 | 118k | } else {
1700 | 118k | assert(CharByteWidth == 1 && "Unexpected char width");
1701 | 118k | *ResultPtr++ = ResultChar & 0xFF;
1702 | 118k | }
1703 | 119k | }
1704 | 4.10M | }
1705 | 4.10M | }
1706 | 3.26M | 
1707 | 3.26M | if (Pascal) {
1708 | 15 | if (CharByteWidth == 4) {
1709 | 1 | // FIXME: Make the type of the result buffer correct instead of
1710 | 1 | // using reinterpret_cast.
1711 | 1 | llvm::UTF32 *ResultWidePtr = reinterpret_cast<llvm::UTF32*>(ResultBuf.data());
1712 | 1 | ResultWidePtr[0] = GetNumStringChars() - 1; // The first element holds the length, so it is not counted itself.
1713 | 14 | } else if (CharByteWidth == 2) {
1714 | 3 | // FIXME: Make the type of the result buffer correct instead of
1715 | 3 | // using reinterpret_cast.
1716 | 3 | llvm::UTF16 *ResultWidePtr = reinterpret_cast<llvm::UTF16*>(ResultBuf.data());
1717 | 3 | ResultWidePtr[0] = GetNumStringChars() - 1;
1718 | 11 | } else {
1719 | 11 | assert(CharByteWidth == 1 && "Unexpected char width");
1720 | 11 | ResultBuf[0] = GetNumStringChars() - 1;
1721 | 11 | }
1722 | 15 | 
1723 | 15 | // Verify that pascal strings aren't too large.
1724 | 15 | if (GetStringLength() > 256) {
1725 | 0 | if (Diags)
1726 | 0 | Diags->Report(StringToks.front().getLocation(),
1727 | 0 | diag::err_pascal_string_too_long)
1728 | 0 | << SourceRange(StringToks.front().getLocation(),
1729 | 0 | StringToks.back().getLocation());
1730 | 0 | hadError = true;
1731 | 0 | return;
1732 | 0 | }
1733 | 3.26M | } else if (Diags) {
1734 | 3.11M | // Complain if this string literal has too many characters.
1735 | 3.11M | unsigned MaxChars = Features.CPlusPlus? 655362.17M : Features.C99 940k ? 4095935k : 5095.50k ; // Limits are 65536 (C++), 4095 (C99) and 509 (otherwise); the digits fused onto them are llvm-cov region counts.
1736 | 3.11M | 
1737 | 3.11M | if (GetNumStringChars() > MaxChars)
1738 | 5 | Diags->Report(StringToks.front().getLocation(),
1739 | 5 | diag::ext_string_too_long)
1740 | 5 | << GetNumStringChars() << MaxChars
1741 | 5 | << (Features.CPlusPlus ? 21 : Features.C99 4 ? 13 : 01 )
1742 | 5 | << SourceRange(StringToks.front().getLocation(),
1743 | 5 | StringToks.back().getLocation());
1744 | 3.11M | }
1745 | 3.26M | }
1746 | | |
1747 | 133 | static const char *resyncUTF8(const char *Err, const char *End) { // Returns a position after the bad byte at Err, never past End, from which decoding can be retried.
1748 | 133 | if (Err == End)
1749 | 0 | return End;
1750 | 133 | End = Err + std::min<unsigned>(llvm::getNumBytesForUTF8(*Err), End-Err); // Look no further than the length announced by the lead byte, clamped to the input.
1751 | 133 | while (++Err != End && (*Err & 0xC0) == 0x80109 ) // Always skip Err itself, then any bytes of the form 10xxxxxx.
1752 | 0 | ;
1753 | 133 | return Err;
1754 | 133 | }
1755 | | |
1756 | | /// This function copies from Fragment, which is a sequence of bytes
1757 | | /// within Tok's contents (which begin at TokBegin) into ResultPtr.
1758 | | /// Performs widening for multi-byte characters. Returns false on success or when bad encoding is tolerated (isAscii()), and true when bad encoding is an error.
1759 | | bool StringLiteralParser::CopyStringFragment(const Token &Tok,
1760 | | const char *TokBegin,
1761 | 4.07M | StringRef Fragment) {
1762 | 4.07M | const llvm::UTF8 *ErrorPtrTmp;
1763 | 4.07M | if (ConvertUTF8toWide(CharByteWidth, Fragment, ResultPtr, ErrorPtrTmp))
1764 | 4.07M | return false;
1765 | 25 | 
1766 | 25 | // If we see bad encoding for unprefixed string literals, warn and
1767 | 25 | // simply copy the byte values, for compatibility with gcc and older
1768 | 25 | // versions of clang.
1769 | 25 | bool NoErrorOnBadEncoding = isAscii();
1770 | 25 | if (NoErrorOnBadEncoding) {
1771 | 10 | memcpy(ResultPtr, Fragment.data(), Fragment.size());
1772 | 10 | ResultPtr += Fragment.size();
1773 | 10 | }
1774 | 25 | 
1775 | 25 | if (Diags) {
1776 | 24 | const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
1777 | 24 | 
1778 | 24 | FullSourceLoc SourceLoc(Tok.getLocation(), SM);
1779 | 24 | const DiagnosticBuilder &Builder =
1780 | 24 | Diag(Diags, Features, SourceLoc, TokBegin,
1781 | 24 | ErrorPtr, resyncUTF8(ErrorPtr, Fragment.end()),
1782 | 24 | NoErrorOnBadEncoding ? diag::warn_bad_string_encoding10
1783 | 24 | : diag::err_bad_string_encoding14 );
1784 | 24 | 
1785 | 24 | const char *NextStart = resyncUTF8(ErrorPtr, Fragment.end()); // Same call as in the Diag() arguments above: where the first bad range ends.
1786 | 24 | StringRef NextFragment(NextStart, Fragment.end()-NextStart);
1787 | 24 | 
1788 | 24 | // Decode into a dummy buffer; only the positions of further errors are used.
1789 | 24 | SmallString<512> Dummy;
1790 | 24 | Dummy.reserve(Fragment.size() * CharByteWidth);
1791 | 24 | char *Ptr = Dummy.data();
1792 | 24 | 
1793 | 109 | while (!ConvertUTF8toWide(CharByteWidth, NextFragment, Ptr, ErrorPtrTmp)) { // Each failure adds one more highlighted range to the same diagnostic.
1794 | 85 | const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
1795 | 85 | NextStart = resyncUTF8(ErrorPtr, Fragment.end());
1796 | 85 | Builder << MakeCharSourceRange(Features, SourceLoc, TokBegin,
1797 | 85 | ErrorPtr, NextStart);
1798 | 85 | NextFragment = StringRef(NextStart, Fragment.end()-NextStart);
1799 | 85 | }
1800 | 24 | }
1801 | 25 | return !NoErrorOnBadEncoding;
1802 | 25 | }
1803 | | |
1804 | 0 | void StringLiteralParser::DiagnoseLexingError(SourceLocation Loc) { // Marks the parse as failed and, if diagnostics are enabled, reports err_lexing_string at Loc.
1805 | 0 | hadError = true;
1806 | 0 | if (Diags)
1807 | 0 | Diags->Report(Loc, diag::err_lexing_string);
1808 | 0 | }
1809 | | |
1810 | | /// getOffsetOfStringByte - This function returns the offset of the
1811 | | /// specified byte of the string data represented by Token. This handles
1812 | | /// advancing over escape sequences in the string. Returns 0 if the token's spelling cannot be obtained.
1813 | | unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
1814 | 40.3k | unsigned ByteNo) const {
1815 | 40.3k | // Get the spelling of the token.
1816 | 40.3k | SmallString<32> SpellingBuffer;
1817 | 40.3k | SpellingBuffer.resize(Tok.getLength());
1818 | 40.3k | 
1819 | 40.3k | bool StringInvalid = false;
1820 | 40.3k | const char *SpellingPtr = &SpellingBuffer[0];
1821 | 40.3k | unsigned TokLen = Lexer::getSpelling(Tok, SpellingPtr, SM, Features,
1822 | 40.3k | &StringInvalid);
1823 | 40.3k | if (StringInvalid)
1824 | 0 | return 0;
1825 | 40.3k | 
1826 | 40.3k | const char *SpellingStart = SpellingPtr; // Base from which the returned offset is measured.
1827 | 40.3k | const char *SpellingEnd = SpellingPtr+TokLen;
1828 | 40.3k | 
1829 | 40.3k | // Handle UTF-8 strings just like narrow strings.
1830 | 40.3k | if (SpellingPtr[0] == 'u' && SpellingPtr[1] == '8'3 )
1831 | 3 | SpellingPtr += 2; // Step past the u8 prefix.
1832 | 40.3k | 
1833 | 40.3k | assert(SpellingPtr[0] != 'L' && SpellingPtr[0] != 'u' &&
1834 | 40.3k | SpellingPtr[0] != 'U' && "Doesn't handle wide or utf strings yet");
1835 | 40.3k | 
1836 | 40.3k | // For raw string literals, this is easy.
1837 | 40.3k | if (SpellingPtr[0] == 'R') {
1838 | 6 | assert(SpellingPtr[1] == '"' && "Should be a raw string literal!");
1839 | 6 | // Skip 'R"'.
1840 | 6 | SpellingPtr += 2;
1841 | 35 | while (*SpellingPtr != '(') {
1842 | 29 | ++SpellingPtr;
1843 | 29 | assert(SpellingPtr < SpellingEnd && "Missing ( for raw string literal");
1844 | 29 | }
1845 | 6 | // Skip '('.
1846 | 6 | ++SpellingPtr;
1847 | 6 | return SpellingPtr - SpellingStart + ByteNo; // Offset of the first content byte plus ByteNo.
1848 | 6 | }
1849 | 40.3k | 
1850 | 40.3k | // Skip over the leading quote
1851 | 40.3k | assert(SpellingPtr[0] == '"' && "Should be a string literal!");
1852 | 40.3k | ++SpellingPtr;
1853 | 40.3k | 
1854 | 40.3k | // Skip over bytes until we find the offset we're looking for.
1855 | 432k | while (ByteNo) {
1856 | 392k | assert(SpellingPtr < SpellingEnd && "Didn't find byte offset!");
1857 | 392k | 
1858 | 392k | // Step over non-escapes simply.
1859 | 392k | if (*SpellingPtr != '\\') {
1860 | 392k | ++SpellingPtr;
1861 | 392k | --ByteNo;
1862 | 392k | continue;
1863 | 392k | }
1864 | 467 | 
1865 | 467 | // Otherwise, this is an escape character. Advance over it.
1866 | 467 | bool HadError = false;
1867 | 467 | if (SpellingPtr[1] == 'u' || SpellingPtr[1] == 'U'464 ) {
1868 | 6 | const char *EscapePtr = SpellingPtr; // Remember where the escape starts in case ByteNo falls inside it.
1869 | 6 | unsigned Len = MeasureUCNEscape(SpellingStart, SpellingPtr, SpellingEnd,
1870 | 6 | 1, Features, HadError);
1871 | 6 | if (Len > ByteNo) {
1872 | 6 | // ByteNo is somewhere within the escape sequence.
1873 | 6 | SpellingPtr = EscapePtr;
1874 | 6 | break;
1875 | 6 | }
1876 | 0 | ByteNo -= Len; // The whole escape lies before the requested byte.
1877 | 461 | } else {
1878 | 461 | ProcessCharEscape(SpellingStart, SpellingPtr, SpellingEnd, HadError,
1879 | 461 | FullSourceLoc(Tok.getLocation(), SM),
1880 | 461 | CharByteWidth*8, Diags, Features);
1881 | 461 | --ByteNo; // A non-UCN escape is counted as exactly one byte.
1882 | 461 | }
1883 | 467 | assert(!HadError && "This method isn't valid on erroneous strings");
1884 | 461 | }
1885 | 40.3k | 
1886 | 40.3k | return SpellingPtr-SpellingStart;
1887 | 40.3k | }
1888 | | |
1889 | | /// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved
1890 | | /// suffixes as ud-suffixes, because the diagnostic experience is better if we
1891 | | /// treat it as an invalid suffix. Accepts whatever NumericLiteralParser::isValidUDSuffix accepts, plus "sv".
1892 | | bool StringLiteralParser::isValidUDSuffix(const LangOptions &LangOpts,
1893 | 1.40k | StringRef Suffix) {
1894 | 1.40k | return NumericLiteralParser::isValidUDSuffix(LangOpts, Suffix) ||
1895 | 1.40k | Suffix == "sv"206 ; // Only checked when the numeric-literal rule rejects the suffix.
1896 | 1.40k | }