Coverage Report

Created: 2023-09-12 09:32

/Users/buildslave/jenkins/workspace/coverage/llvm-project/clang/lib/Lex/LiteralSupport.cpp
Line
Count
Source (jump to first uncovered line)
1
//===--- LiteralSupport.cpp - Code to parse and process literals ----------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
// This file implements the NumericLiteralParser, CharLiteralParser, and
10
// StringLiteralParser interfaces.
11
//
12
//===----------------------------------------------------------------------===//
13
14
#include "clang/Lex/LiteralSupport.h"
15
#include "clang/Basic/CharInfo.h"
16
#include "clang/Basic/LangOptions.h"
17
#include "clang/Basic/SourceLocation.h"
18
#include "clang/Basic/TargetInfo.h"
19
#include "clang/Lex/LexDiagnostic.h"
20
#include "clang/Lex/Lexer.h"
21
#include "clang/Lex/Preprocessor.h"
22
#include "clang/Lex/Token.h"
23
#include "llvm/ADT/APInt.h"
24
#include "llvm/ADT/SmallVector.h"
25
#include "llvm/ADT/StringExtras.h"
26
#include "llvm/ADT/StringSwitch.h"
27
#include "llvm/Support/ConvertUTF.h"
28
#include "llvm/Support/Error.h"
29
#include "llvm/Support/ErrorHandling.h"
30
#include "llvm/Support/Unicode.h"
31
#include <algorithm>
32
#include <cassert>
33
#include <cstddef>
34
#include <cstdint>
35
#include <cstring>
36
#include <string>
37
38
using namespace clang;
39
40
10.1M
/// Return the code-unit width that \p Target reports for the literal token
/// kind \p kind.
///
/// Ordinary and UTF-8 literals use the target's char width; wide, UTF-16 and
/// UTF-32 literals use the target's wide-char, char16 and char32 widths
/// respectively. Passing any other token kind is a programming error.
static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) {
  switch (kind) {
  case tok::utf32_char_constant:
  case tok::utf32_string_literal:
    return Target.getChar32Width();

  case tok::utf16_char_constant:
  case tok::utf16_string_literal:
    return Target.getChar16Width();

  case tok::wide_char_constant:
  case tok::wide_string_literal:
    return Target.getWCharWidth();

  case tok::char_constant:
  case tok::string_literal:
  case tok::utf8_char_constant:
  case tok::utf8_string_literal:
    return Target.getCharWidth();

  default:
    llvm_unreachable("Unknown token type!");
  }
}
59
60
104
/// Return the number of characters in the encoding prefix of a literal of
/// token kind \p kind: 0 for ordinary literals, 2 for UTF-8 literals and 1 for
/// wide, UTF-16 and UTF-32 literals.
///
/// Passing any other token kind is a programming error.
static unsigned getEncodingPrefixLen(tok::TokenKind kind) {
  switch (kind) {
  case tok::wide_char_constant:
  case tok::wide_string_literal:
  case tok::utf16_char_constant:
  case tok::utf16_string_literal:
  case tok::utf32_char_constant:
  case tok::utf32_string_literal:
    return 1;

  case tok::utf8_char_constant:
  case tok::utf8_string_literal:
    return 2;

  case tok::char_constant:
  case tok::string_literal:
    return 0;

  default:
    llvm_unreachable("Unknown token type!");
  }
}
79
80
/// Translate a pair of pointers into a token's spelling buffer into a
/// character source range.
///
/// \p TokRangeBegin and \p TokRangeEnd point into the buffer that starts at
/// \p TokBegin for the token located at \p TokLoc. The start of the range is
/// found by advancing \p TokLoc by (TokRangeBegin - TokBegin) token
/// characters, and the end by advancing that start by
/// (TokRangeEnd - TokRangeBegin) further characters.
static CharSourceRange MakeCharSourceRange(const LangOptions &Features,
                                           FullSourceLoc TokLoc,
                                           const char *TokBegin,
                                           const char *TokRangeBegin,
                                           const char *TokRangeEnd) {
  const auto &SM = TokLoc.getManager();

  const SourceLocation RangeStart = Lexer::AdvanceToTokenCharacter(
      TokLoc, TokRangeBegin - TokBegin, SM, Features);
  const SourceLocation RangeStop = Lexer::AdvanceToTokenCharacter(
      RangeStart, TokRangeEnd - TokRangeBegin, SM, Features);

  return CharSourceRange::getCharRange(RangeStart, RangeStop);
}
93
94
/// Produce a diagnostic highlighting some portion of a literal.
95
///
96
/// Emits the diagnostic \p DiagID, highlighting the range of characters from
97
/// \p TokRangeBegin (inclusive) to \p TokRangeEnd (exclusive), which must be
98
/// a substring of a spelling buffer for the token beginning at \p TokBegin.
99
static DiagnosticBuilder Diag(DiagnosticsEngine *Diags,
100
                              const LangOptions &Features, FullSourceLoc TokLoc,
101
                              const char *TokBegin, const char *TokRangeBegin,
102
940
                              const char *TokRangeEnd, unsigned DiagID) {
103
940
  SourceLocation Begin =
104
940
    Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin,
105
940
                                   TokLoc.getManager(), Features);
106
940
  return Diags->Report(Begin, DiagID) <<
107
940
    MakeCharSourceRange(Features, TokLoc, TokBegin, TokRangeBegin, TokRangeEnd);
108
940
}
109
110
118
/// Return true if \p Escape, the character that follows the backslash of an
/// escape sequence, is one of the simple escapes accepted in an unevaluated
/// string literal: \' \" \? \\ \a \b \f \n \r \t \v.
static bool IsEscapeValidInUnevaluatedStringLiteral(char Escape) {
  static constexpr char SimpleEscapes[] = {'\'', '"', '?', '\\', 'a', 'b',
                                           'f',  'n', 'r', 't',  'v'};
  return std::find(std::begin(SimpleEscapes), std::end(SimpleEscapes),
                   Escape) != std::end(SimpleEscapes);
}
127
128
/// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
129
/// either a character or a string literal.
130
static unsigned ProcessCharEscape(const char *ThisTokBegin,
131
                                  const char *&ThisTokBuf,
132
                                  const char *ThisTokEnd, bool &HadError,
133
                                  FullSourceLoc Loc, unsigned CharWidth,
134
                                  DiagnosticsEngine *Diags,
135
                                  const LangOptions &Features,
136
84.6k
                                  StringLiteralEvalMethod EvalMethod) {
137
84.6k
  const char *EscapeBegin = ThisTokBuf;
138
84.6k
  bool Delimited = false;
139
84.6k
  bool EndDelimiterFound = false;
140
141
  // Skip the '\' char.
142
84.6k
  ++ThisTokBuf;
143
144
  // We know that this character can't be off the end of the buffer, because
145
  // that would have been \", which would not have been the end of string.
146
84.6k
  unsigned ResultChar = *ThisTokBuf++;
147
84.6k
  char Escape = ResultChar;
148
84.6k
  switch (ResultChar) {
149
  // These map to themselves.
150
1.10k
  
case '\\': 442
case '\'': 496
case '"': 1.10k
case '?': break;
151
152
    // These have fixed mappings.
153
30
  case 'a':
154
    // TODO: K&R: the meaning of '\\a' is different in traditional C
155
30
    ResultChar = 7;
156
30
    break;
157
29
  case 'b':
158
29
    ResultChar = 8;
159
29
    break;
160
18
  case 'e':
161
18
    if (Diags)
162
18
      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
163
18
           diag::ext_nonstandard_escape) << "e";
164
18
    ResultChar = 27;
165
18
    break;
166
2
  case 'E':
167
2
    if (Diags)
168
2
      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
169
2
           diag::ext_nonstandard_escape) << "E";
170
2
    ResultChar = 27;
171
2
    break;
172
21
  case 'f':
173
21
    ResultChar = 12;
174
21
    break;
175
71.2k
  case 'n':
176
71.2k
    ResultChar = 10;
177
71.2k
    break;
178
69
  case 'r':
179
69
    ResultChar = 13;
180
69
    break;
181
6.00k
  case 't':
182
6.00k
    ResultChar = 9;
183
6.00k
    break;
184
45
  case 'v':
185
45
    ResultChar = 11;
186
45
    break;
187
1.35k
  case 'x': { // Hex escape.
188
1.35k
    ResultChar = 0;
189
1.35k
    if (ThisTokBuf != ThisTokEnd && 
*ThisTokBuf == '{'1.35k
) {
190
84
      Delimited = true;
191
84
      ThisTokBuf++;
192
84
      if (*ThisTokBuf == '}') {
193
6
        Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
194
6
             diag::err_delimited_escape_empty);
195
6
        return ResultChar;
196
6
      }
197
1.27k
    } else if (ThisTokBuf == ThisTokEnd || 
!isHexDigit(*ThisTokBuf)1.26k
) {
198
6
      if (Diags)
199
6
        Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
200
6
             diag::err_hex_escape_no_digits) << "x";
201
6
      return ResultChar;
202
6
    }
203
204
    // Hex escapes are a maximal series of hex digits.
205
1.34k
    bool Overflow = false;
206
4.51k
    for (; ThisTokBuf != ThisTokEnd; 
++ThisTokBuf3.17k
) {
207
3.55k
      if (Delimited && 
*ThisTokBuf == '}'398
) {
208
54
        ThisTokBuf++;
209
54
        EndDelimiterFound = true;
210
54
        break;
211
54
      }
212
3.50k
      int CharVal = llvm::hexDigitValue(*ThisTokBuf);
213
3.50k
      if (CharVal == -1) {
214
        // Non delimited hex escape sequences stop at the first non-hex digit.
215
373
        if (!Delimited)
216
331
          break;
217
42
        HadError = true;
218
42
        if (Diags)
219
42
          Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
220
42
               diag::err_delimited_escape_invalid)
221
42
              << StringRef(ThisTokBuf, 1);
222
42
        continue;
223
373
      }
224
      // About to shift out a digit?
225
3.13k
      if (ResultChar & 0xF0000000)
226
10
        Overflow = true;
227
3.13k
      ResultChar <<= 4;
228
3.13k
      ResultChar |= CharVal;
229
3.13k
    }
230
    // See if any bits will be truncated when evaluated as a character.
231
1.34k
    if (CharWidth != 32 && 
(ResultChar >> CharWidth) != 01.21k
) {
232
40
      Overflow = true;
233
40
      ResultChar &= ~0U >> (32-CharWidth);
234
40
    }
235
236
    // Check for overflow.
237
1.34k
    if (!HadError && 
Overflow1.31k
) { // Too many digits to fit in
238
28
      HadError = true;
239
28
      if (Diags)
240
28
        Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
241
28
             diag::err_escape_too_large)
242
28
            << 0;
243
28
    }
244
1.34k
    break;
245
1.35k
  }
246
4.47k
  
case '0': 3.38k
case '1': 3.95k
case '2': 4.47k
case '3':
247
4.47k
  
case '4': 4.47k
case '5': 4.47k
case '6': 4.47k
case '7': {
248
    // Octal escapes.
249
4.47k
    --ThisTokBuf;
250
4.47k
    ResultChar = 0;
251
252
    // Octal escapes are a series of octal digits with maximum length 3.
253
    // "\0123" is a two digit sequence equal to "\012" "3".
254
4.47k
    unsigned NumDigits = 0;
255
4.81k
    do {
256
4.81k
      ResultChar <<= 3;
257
4.81k
      ResultChar |= *ThisTokBuf++ - '0';
258
4.81k
      ++NumDigits;
259
4.81k
    } while (ThisTokBuf != ThisTokEnd && 
NumDigits < 32.04k
&&
260
4.81k
             
ThisTokBuf[0] >= '0'2.02k
&&
ThisTokBuf[0] <= '7'1.94k
);
261
262
    // Check for overflow.  Reject '\777', but not L'\777'.
263
4.47k
    if (CharWidth != 32 && 
(ResultChar >> CharWidth) != 04.28k
) {
264
1
      if (Diags)
265
1
        Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
266
1
             diag::err_escape_too_large) << 1;
267
1
      ResultChar &= ~0U >> (32-CharWidth);
268
1
    }
269
4.47k
    break;
270
4.47k
  }
271
80
  case 'o': {
272
80
    bool Overflow = false;
273
80
    if (ThisTokBuf == ThisTokEnd || 
*ThisTokBuf != '{'74
) {
274
6
      HadError = true;
275
6
      if (Diags)
276
6
        Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
277
6
             diag::err_delimited_escape_missing_brace)
278
6
            << "o";
279
280
6
      break;
281
6
    }
282
74
    ResultChar = 0;
283
74
    Delimited = true;
284
74
    ++ThisTokBuf;
285
74
    if (*ThisTokBuf == '}') {
286
6
      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
287
6
           diag::err_delimited_escape_empty);
288
6
      return ResultChar;
289
6
    }
290
291
426
    
while (68
ThisTokBuf != ThisTokEnd) {
292
408
      if (*ThisTokBuf == '}') {
293
50
        EndDelimiterFound = true;
294
50
        ThisTokBuf++;
295
50
        break;
296
50
      }
297
358
      if (*ThisTokBuf < '0' || 
*ThisTokBuf > '7'352
) {
298
36
        HadError = true;
299
36
        if (Diags)
300
36
          Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
301
36
               diag::err_delimited_escape_invalid)
302
36
              << StringRef(ThisTokBuf, 1);
303
36
        ThisTokBuf++;
304
36
        continue;
305
36
      }
306
      // Check if one of the top three bits is set before shifting them out.
307
322
      if (ResultChar & 0xE0000000)
308
12
        Overflow = true;
309
310
322
      ResultChar <<= 3;
311
322
      ResultChar |= *ThisTokBuf++ - '0';
312
322
    }
313
    // Check for overflow.  Reject '\777', but not L'\777'.
314
68
    if (!HadError &&
315
68
        
(44
Overflow44
||
(32
CharWidth != 3232
&&
(ResultChar >> CharWidth) != 028
))) {
316
20
      HadError = true;
317
20
      if (Diags)
318
20
        Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
319
20
             diag::err_escape_too_large)
320
20
            << 1;
321
20
      ResultChar &= ~0U >> (32 - CharWidth);
322
20
    }
323
68
    break;
324
74
  }
325
    // Otherwise, these are not valid escapes.
326
48
  
case '(': 8
case '{': 16
case '[': 24
case '%':
327
    // GCC accepts these as extensions.  We warn about them as such though.
328
48
    if (Diags)
329
36
      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
330
36
           diag::ext_nonstandard_escape)
331
36
        << std::string(1, ResultChar);
332
48
    break;
333
31
  default:
334
31
    if (!Diags)
335
0
      break;
336
337
31
    if (isPrintable(ResultChar))
338
29
      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
339
29
           diag::ext_unknown_escape)
340
29
        << std::string(1, ResultChar);
341
2
    else
342
2
      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
343
2
           diag::ext_unknown_escape)
344
2
        << "x" + llvm::utohexstr(ResultChar);
345
31
    break;
346
84.6k
  }
347
348
84.5k
  if (Delimited && 
Diags146
) {
349
146
    if (!EndDelimiterFound)
350
42
      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
351
42
           diag::err_expected)
352
42
          << tok::r_brace;
353
104
    else if (!HadError) {
354
30
      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
355
30
           Features.CPlusPlus23 ? 
diag::warn_cxx23_delimited_escape_sequence5
356
30
                                : 
diag::ext_delimited_escape_sequence25
)
357
30
          << /*delimited*/ 0 << (Features.CPlusPlus ? 
120
:
010
);
358
30
    }
359
146
  }
360
361
84.5k
  if (EvalMethod == StringLiteralEvalMethod::Unevaluated &&
362
84.5k
      
!IsEscapeValidInUnevaluatedStringLiteral(Escape)118
) {
363
27
    Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
364
27
         diag::err_unevaluated_string_invalid_escape_sequence)
365
27
        << StringRef(EscapeBegin, ThisTokBuf - EscapeBegin);
366
27
    HadError = true;
367
27
  }
368
369
84.5k
  return ResultChar;
370
84.6k
}
371
372
static void appendCodePoint(unsigned Codepoint,
373
315
                            llvm::SmallVectorImpl<char> &Str) {
374
315
  char ResultBuf[4];
375
315
  char *ResultPtr = ResultBuf;
376
315
  if (llvm::ConvertCodePointToUTF8(Codepoint, ResultPtr))
377
310
    Str.append(ResultBuf, ResultPtr);
378
315
}
379
380
1.25k
/// Copy \p Input into \p Buf, replacing every backslash escape with the UTF-8
/// encoding of the code point it names.
///
/// The escapes handled are \uXXXX, \UXXXXXXXX, \u{...} and \N{...}. The input
/// is expected to be already validated: malformed escapes are only caught by
/// assertions.
void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
  const char *Cur = Input.begin();
  const char *const End = Input.end();

  // Shift one hexadecimal digit into the low bits of Acc.
  auto ShiftInHexDigit = [](uint32_t &Acc, char Digit) {
    unsigned Value = llvm::hexDigitValue(Digit);
    assert(Value != -1U);
    Acc <<= 4;
    Acc += Value;
  };

  while (Cur != End) {
    // Ordinary characters are copied through unchanged.
    if (*Cur != '\\') {
      Buf.push_back(*Cur);
      ++Cur;
      continue;
    }

    // Step over the backslash and the character selecting the escape form.
    const char Kind = Cur[1];
    Cur += 2;

    assert(Kind == 'u' || Kind == 'U' || Kind == 'N');
    uint32_t CodePoint = 0;

    // Delimited form: \u{ hex-digits }.
    if (Kind == 'u' && *Cur == '{') {
      for (++Cur; *Cur != '}'; ++Cur)
        ShiftInHexDigit(CodePoint, *Cur);
      appendCodePoint(CodePoint, Buf);
      ++Cur; // Step over the closing brace.
      continue;
    }

    // Named form: \N{ name }.
    if (Kind == 'N') {
      assert(*Cur == '{');
      ++Cur;
      const char *const NameEnd = std::find(Cur, End, '}');
      assert(NameEnd != End);
      StringRef Name(Cur, std::distance(Cur, NameEnd));
      std::optional<llvm::sys::unicode::LooseMatchingResult> Res =
          llvm::sys::unicode::nameToCodepointLooseMatching(Name);
      assert(Res && "could not find a codepoint that was previously found");
      CodePoint = Res->CodePoint;
      assert(CodePoint != 0xFFFFFFFF);
      appendCodePoint(CodePoint, Buf);
      Cur = NameEnd + 1; // Step over the closing brace.
      continue;
    }

    // Fixed-length form: four digits after \u, eight after \U.
    unsigned DigitsLeft = (Kind == 'u') ? 4 : 8;
    assert(Cur + DigitsLeft <= End);
    for (; DigitsLeft != 0; ++Cur, --DigitsLeft)
      ShiftInHexDigit(CodePoint, *Cur);

    appendCodePoint(CodePoint, Buf);
  }
}
441
442
bool clang::isFunctionLocalStringLiteralMacro(tok::TokenKind K,
443
7.81M
                                              const LangOptions &LO) {
444
7.81M
  return LO.MicrosoftExt &&
445
7.81M
         
(173k
K == tok::kw___FUNCTION__173k
||
K == tok::kw_L__FUNCTION__173k
||
446
173k
          
K == tok::kw___FUNCSIG__173k
||
K == tok::kw_L__FUNCSIG__173k
||
447
173k
          
K == tok::kw___FUNCDNAME__173k
);
448
7.81M
}
449
450
15.7M
bool clang::tokenIsLikeStringLiteral(const Token &Tok, const LangOptions &LO) {
451
15.7M
  return tok::isStringLiteral(Tok.getKind()) ||
452
15.7M
         
isFunctionLocalStringLiteralMacro(Tok.getKind(), LO)7.72M
;
453
15.7M
}
454
455
static bool ProcessNumericUCNEscape(const char *ThisTokBegin,
456
                                    const char *&ThisTokBuf,
457
                                    const char *ThisTokEnd, uint32_t &UcnVal,
458
                                    unsigned short &UcnLen, bool &Delimited,
459
                                    FullSourceLoc Loc, DiagnosticsEngine *Diags,
460
                                    const LangOptions &Features,
461
817
                                    bool in_char_string_literal = false) {
462
817
  const char *UcnBegin = ThisTokBuf;
463
817
  bool HasError = false;
464
817
  bool EndDelimiterFound = false;
465
466
  // Skip the '\u' char's.
467
817
  ThisTokBuf += 2;
468
817
  Delimited = false;
469
817
  if (UcnBegin[1] == 'u' && 
in_char_string_literal462
&&
470
817
      
ThisTokBuf != ThisTokEnd462
&&
*ThisTokBuf == '{'461
) {
471
66
    Delimited = true;
472
66
    ThisTokBuf++;
473
751
  } else if (ThisTokBuf == ThisTokEnd || 
!isHexDigit(*ThisTokBuf)748
) {
474
10
    if (Diags)
475
10
      Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
476
10
           diag::err_hex_escape_no_digits)
477
10
          << StringRef(&ThisTokBuf[-1], 1);
478
10
    return false;
479
10
  }
480
807
  UcnLen = (ThisTokBuf[-1] == 'u' ? 
4395
:
8412
);
481
482
807
  bool Overflow = false;
483
807
  unsigned short Count = 0;
484
5.37k
  for (; ThisTokBuf != ThisTokEnd && 
(4.76k
Delimited4.76k
||
Count != UcnLen4.47k
);
485
4.62k
       
++ThisTokBuf4.56k
) {
486
4.62k
    if (Delimited && 
*ThisTokBuf == '}'288
) {
487
48
      ++ThisTokBuf;
488
48
      EndDelimiterFound = true;
489
48
      break;
490
48
    }
491
4.57k
    int CharVal = llvm::hexDigitValue(*ThisTokBuf);
492
4.57k
    if (CharVal == -1) {
493
16
      HasError = true;
494
16
      if (!Delimited)
495
4
        break;
496
12
      if (Diags) {
497
12
        Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
498
12
             diag::err_delimited_escape_invalid)
499
12
            << StringRef(ThisTokBuf, 1);
500
12
      }
501
12
      Count++;
502
12
      continue;
503
16
    }
504
4.55k
    if (UcnVal & 0xF0000000) {
505
6
      Overflow = true;
506
6
      continue;
507
6
    }
508
4.55k
    UcnVal <<= 4;
509
4.55k
    UcnVal |= CharVal;
510
4.55k
    Count++;
511
4.55k
  }
512
513
807
  if (Overflow) {
514
6
    if (Diags)
515
6
      Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
516
6
           diag::err_escape_too_large)
517
6
          << 0;
518
6
    return false;
519
6
  }
520
521
801
  if (Delimited && 
!EndDelimiterFound60
) {
522
18
    if (Diags) {
523
18
      Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
524
18
           diag::err_expected)
525
18
          << tok::r_brace;
526
18
    }
527
18
    return false;
528
18
  }
529
530
  // If we didn't consume the proper number of digits, there is a problem.
531
783
  if (Count == 0 || 
(777
!Delimited777
&&
Count != UcnLen741
)) {
532
12
    if (Diags)
533
12
      Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
534
12
           Delimited ? 
diag::err_delimited_escape_empty6
535
12
                     : 
diag::err_ucn_escape_incomplete6
);
536
12
    return false;
537
12
  }
538
771
  return !HasError;
539
783
}
540
541
static void DiagnoseInvalidUnicodeCharacterName(
542
    DiagnosticsEngine *Diags, const LangOptions &Features, FullSourceLoc Loc,
543
    const char *TokBegin, const char *TokRangeBegin, const char *TokRangeEnd,
544
29
    llvm::StringRef Name) {
545
546
29
  Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd,
547
29
       diag::err_invalid_ucn_name)
548
29
      << Name;
549
550
29
  namespace u = llvm::sys::unicode;
551
552
29
  std::optional<u::LooseMatchingResult> Res =
553
29
      u::nameToCodepointLooseMatching(Name);
554
29
  if (Res) {
555
8
    Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd,
556
8
         diag::note_invalid_ucn_name_loose_matching)
557
8
        << FixItHint::CreateReplacement(
558
8
               MakeCharSourceRange(Features, Loc, TokBegin, TokRangeBegin,
559
8
                                   TokRangeEnd),
560
8
               Res->Name);
561
8
    return;
562
8
  }
563
564
21
  unsigned Distance = 0;
565
21
  SmallVector<u::MatchForCodepointName> Matches =
566
21
      u::nearestMatchesForCodepointName(Name, 5);
567
21
  assert(!Matches.empty() && "No unicode characters found");
568
569
102
  
for (const auto &Match : Matches)21
{
570
102
    if (Distance == 0)
571
21
      Distance = Match.Distance;
572
102
    if (std::max(Distance, Match.Distance) -
573
102
            std::min(Distance, Match.Distance) >
574
102
        3)
575
1
      break;
576
101
    Distance = Match.Distance;
577
578
101
    std::string Str;
579
101
    llvm::UTF32 V = Match.Value;
580
101
    bool Converted =
581
101
        llvm::convertUTF32ToUTF8String(llvm::ArrayRef<llvm::UTF32>(&V, 1), Str);
582
101
    (void)Converted;
583
101
    assert(Converted && "Found a match wich is not a unicode character");
584
585
101
    Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd,
586
101
         diag::note_invalid_ucn_name_candidate)
587
101
        << Match.Name << llvm::utohexstr(Match.Value)
588
101
        << Str // FIXME: Fix the rendering of non printable characters
589
101
        << FixItHint::CreateReplacement(
590
101
               MakeCharSourceRange(Features, Loc, TokBegin, TokRangeBegin,
591
101
                                   TokRangeEnd),
592
101
               Match.Name);
593
101
  }
594
21
}
595
596
static bool ProcessNamedUCNEscape(const char *ThisTokBegin,
597
                                  const char *&ThisTokBuf,
598
                                  const char *ThisTokEnd, uint32_t &UcnVal,
599
                                  unsigned short &UcnLen, FullSourceLoc Loc,
600
                                  DiagnosticsEngine *Diags,
601
100
                                  const LangOptions &Features) {
602
100
  const char *UcnBegin = ThisTokBuf;
603
100
  assert(UcnBegin[0] == '\\' && UcnBegin[1] == 'N');
604
100
  ThisTokBuf += 2;
605
100
  if (ThisTokBuf == ThisTokEnd || *ThisTokBuf != '{') {
606
6
    if (Diags) {
607
6
      Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
608
6
           diag::err_delimited_escape_missing_brace)
609
6
          << StringRef(&ThisTokBuf[-1], 1);
610
6
    }
611
6
    return false;
612
6
  }
613
94
  ThisTokBuf++;
614
1.14k
  const char *ClosingBrace = std::find_if(ThisTokBuf, ThisTokEnd, [](char C) {
615
1.14k
    return C == '}' || 
isVerticalWhitespace(C)1.06k
;
616
1.14k
  });
617
94
  bool Incomplete = ClosingBrace == ThisTokEnd;
618
94
  bool Empty = ClosingBrace == ThisTokBuf;
619
94
  if (Incomplete || 
Empty82
) {
620
18
    if (Diags) {
621
18
      Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
622
18
           Incomplete ? 
diag::err_ucn_escape_incomplete12
623
18
                      : 
diag::err_delimited_escape_empty6
)
624
18
          << StringRef(&UcnBegin[1], 1);
625
18
    }
626
18
    ThisTokBuf = ClosingBrace == ThisTokEnd ? 
ClosingBrace12
:
ClosingBrace + 16
;
627
18
    return false;
628
18
  }
629
76
  StringRef Name(ThisTokBuf, ClosingBrace - ThisTokBuf);
630
76
  ThisTokBuf = ClosingBrace + 1;
631
76
  std::optional<char32_t> Res = llvm::sys::unicode::nameToCodepointStrict(Name);
632
76
  if (!Res) {
633
29
    if (Diags)
634
29
      DiagnoseInvalidUnicodeCharacterName(Diags, Features, Loc, ThisTokBegin,
635
29
                                          &UcnBegin[3], ClosingBrace, Name);
636
29
    return false;
637
29
  }
638
47
  UcnVal = *Res;
639
47
  UcnLen = UcnVal > 0xFFFF ? 
818
:
429
;
640
47
  return true;
641
76
}
642
643
/// ProcessUCNEscape - Read the Universal Character Name, check constraints and
644
/// return the UTF32.
645
static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
646
                             const char *ThisTokEnd, uint32_t &UcnVal,
647
                             unsigned short &UcnLen, FullSourceLoc Loc,
648
                             DiagnosticsEngine *Diags,
649
                             const LangOptions &Features,
650
917
                             bool in_char_string_literal = false) {
651
652
917
  bool HasError;
653
917
  const char *UcnBegin = ThisTokBuf;
654
917
  bool IsDelimitedEscapeSequence = false;
655
917
  bool IsNamedEscapeSequence = false;
656
917
  if (ThisTokBuf[1] == 'N') {
657
100
    IsNamedEscapeSequence = true;
658
100
    HasError = !ProcessNamedUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
659
100
                                      UcnVal, UcnLen, Loc, Diags, Features);
660
817
  } else {
661
817
    HasError =
662
817
        !ProcessNumericUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,
663
817
                                 UcnLen, IsDelimitedEscapeSequence, Loc, Diags,
664
817
                                 Features, in_char_string_literal);
665
817
  }
666
917
  if (HasError)
667
111
    return false;
668
669
  // Check UCN constraints (C99 6.4.3p2) [C++11 lex.charset p2]
670
806
  if ((0xD800 <= UcnVal && 
UcnVal <= 0xDFFF331
) || // surrogate codepoints
671
806
      
UcnVal > 0x10FFFF746
) { // maximum legal UTF32 value
672
86
    if (Diags)
673
86
      Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
674
86
           diag::err_ucn_escape_invalid);
675
86
    return false;
676
86
  }
677
678
  // C23 and C++11 allow UCNs that refer to control characters
679
  // and basic source characters inside character and string literals
680
720
  if (UcnVal < 0xa0 &&
681
      // $, @, ` are allowed in all language modes
682
720
      
(229
UcnVal != 0x24229
&&
UcnVal != 0x40212
&&
UcnVal != 0x60202
)) {
683
192
    bool IsError =
684
192
        (!(Features.CPlusPlus11 || 
Features.C2361
) ||
!in_char_string_literal145
);
685
192
    if (Diags) {
686
192
      char BasicSCSChar = UcnVal;
687
192
      if (UcnVal >= 0x20 && 
UcnVal < 0x7f127
)
688
60
        Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
689
60
             IsError ? 
diag::err_ucn_escape_basic_scs20
690
60
             : 
Features.CPlusPlus40
691
40
                 ? 
diag::warn_cxx98_compat_literal_ucn_escape_basic_scs39
692
40
                 : 
diag::warn_c23_compat_literal_ucn_escape_basic_scs1
)
693
60
            << StringRef(&BasicSCSChar, 1);
694
132
      else
695
132
        Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
696
132
             IsError ? 
diag::err_ucn_control_character27
697
132
             : 
Features.CPlusPlus105
698
105
                 ? 
diag::warn_cxx98_compat_literal_ucn_control_character92
699
105
                 : 
diag::warn_c23_compat_literal_ucn_control_character13
);
700
192
    }
701
192
    if (IsError)
702
47
      return false;
703
192
  }
704
705
673
  if (!Features.CPlusPlus && 
!Features.C99220
&&
Diags2
)
706
2
    Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
707
2
         diag::warn_ucn_not_valid_in_c89_literal);
708
709
673
  if ((IsDelimitedEscapeSequence || 
IsNamedEscapeSequence651
) &&
Diags69
)
710
69
    Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
711
69
         Features.CPlusPlus23 ? 
diag::warn_cxx23_delimited_escape_sequence11
712
69
                              : 
diag::ext_delimited_escape_sequence58
)
713
69
        << (IsNamedEscapeSequence ? 
147
:
022
) << (Features.CPlusPlus ?
149
:
020
);
714
715
673
  return true;
716
720
}
717
718
/// MeasureUCNEscape - Determine the number of bytes within the resulting string
719
/// which this UCN will occupy.
720
static int MeasureUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
721
                            const char *ThisTokEnd, unsigned CharByteWidth,
722
6
                            const LangOptions &Features, bool &HadError) {
723
  // UTF-32: 4 bytes per escape.
724
6
  if (CharByteWidth == 4)
725
0
    return 4;
726
727
6
  uint32_t UcnVal = 0;
728
6
  unsigned short UcnLen = 0;
729
6
  FullSourceLoc Loc;
730
731
6
  if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,
732
6
                        UcnLen, Loc, nullptr, Features, true)) {
733
0
    HadError = true;
734
0
    return 0;
735
0
  }
736
737
  // UTF-16: 2 bytes for BMP, 4 bytes otherwise.
738
6
  if (CharByteWidth == 2)
739
0
    return UcnVal <= 0xFFFF ? 2 : 4;
740
741
  // UTF-8.
742
6
  if (UcnVal < 0x80)
743
0
    return 1;
744
6
  if (UcnVal < 0x800)
745
0
    return 2;
746
6
  if (UcnVal < 0x10000)
747
3
    return 3;
748
3
  return 4;
749
6
}
750
751
/// EncodeUCNEscape - Read the Universal Character Name, check constraints and
752
/// convert the UTF32 to UTF8 or UTF16. This is a subroutine of
753
/// StringLiteralParser. When we decide to implement UCN's for identifiers,
754
/// we will likely rework our support for UCN's.
755
static void EncodeUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
756
                            const char *ThisTokEnd,
757
                            char *&ResultBuf, bool &HadError,
758
                            FullSourceLoc Loc, unsigned CharByteWidth,
759
                            DiagnosticsEngine *Diags,
760
578
                            const LangOptions &Features) {
761
578
  typedef uint32_t UTF32;
762
578
  UTF32 UcnVal = 0;
763
578
  unsigned short UcnLen = 0;
764
578
  if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal, UcnLen,
765
578
                        Loc, Diags, Features, true)) {
766
143
    HadError = true;
767
143
    return;
768
143
  }
769
770
435
  assert((CharByteWidth == 1 || CharByteWidth == 2 || CharByteWidth == 4) &&
771
435
         "only character widths of 1, 2, or 4 bytes supported");
772
773
435
  (void)UcnLen;
774
435
  assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported");
775
776
435
  if (CharByteWidth == 4) {
777
    // FIXME: Make the type of the result buffer correct instead of
778
    // using reinterpret_cast.
779
83
    llvm::UTF32 *ResultPtr = reinterpret_cast<llvm::UTF32*>(ResultBuf);
780
83
    *ResultPtr = UcnVal;
781
83
    ResultBuf += 4;
782
83
    return;
783
83
  }
784
785
352
  if (CharByteWidth == 2) {
786
    // FIXME: Make the type of the result buffer correct instead of
787
    // using reinterpret_cast.
788
90
    llvm::UTF16 *ResultPtr = reinterpret_cast<llvm::UTF16*>(ResultBuf);
789
790
90
    if (UcnVal <= (UTF32)0xFFFF) {
791
54
      *ResultPtr = UcnVal;
792
54
      ResultBuf += 2;
793
54
      return;
794
54
    }
795
796
    // Convert to UTF16.
797
36
    UcnVal -= 0x10000;
798
36
    *ResultPtr     = 0xD800 + (UcnVal >> 10);
799
36
    *(ResultPtr+1) = 0xDC00 + (UcnVal & 0x3FF);
800
36
    ResultBuf += 4;
801
36
    return;
802
90
  }
803
804
262
  assert(CharByteWidth == 1 && "UTF-8 encoding is only for 1 byte characters");
805
806
  // Now that we've parsed/checked the UCN, we convert from UTF32->UTF8.
807
  // The conversion below was inspired by:
808
  //   http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
809
  // First, we determine how many bytes the result will require.
810
262
  typedef uint8_t UTF8;
811
812
262
  unsigned short bytesToWrite = 0;
813
262
  if (UcnVal < (UTF32)0x80)
814
36
    bytesToWrite = 1;
815
226
  else if (UcnVal < (UTF32)0x800)
816
48
    bytesToWrite = 2;
817
178
  else if (UcnVal < (UTF32)0x10000)
818
113
    bytesToWrite = 3;
819
65
  else
820
65
    bytesToWrite = 4;
821
822
262
  const unsigned byteMask = 0xBF;
823
262
  const unsigned byteMark = 0x80;
824
825
  // Once the bits are split out into bytes of UTF8, this is a mask OR-ed
826
  // into the first byte, depending on how many bytes follow.
827
262
  static const UTF8 firstByteMark[5] = {
828
262
    0x00, 0x00, 0xC0, 0xE0, 0xF0
829
262
  };
830
  // Finally, we write the bytes into ResultBuf.
831
262
  ResultBuf += bytesToWrite;
832
262
  switch (bytesToWrite) { // note: everything falls through.
833
65
  case 4:
834
65
    *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
835
65
    [[fallthrough]];
836
178
  case 3:
837
178
    *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
838
178
    [[fallthrough]];
839
226
  case 2:
840
226
    *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
841
226
    [[fallthrough]];
842
262
  case 1:
843
262
    *--ResultBuf = (UTF8) (UcnVal | firstByteMark[bytesToWrite]);
844
262
  }
845
  // Update the buffer.
846
262
  ResultBuf += bytesToWrite;
847
262
}
848
849
///       integer-constant: [C99 6.4.4.1]
850
///         decimal-constant integer-suffix
851
///         octal-constant integer-suffix
852
///         hexadecimal-constant integer-suffix
853
///         binary-literal integer-suffix [GNU, C++1y]
854
///       user-defined-integer-literal: [C++11 lex.ext]
855
///         decimal-literal ud-suffix
856
///         octal-literal ud-suffix
857
///         hexadecimal-literal ud-suffix
858
///         binary-literal ud-suffix [GNU, C++1y]
859
///       decimal-constant:
860
///         nonzero-digit
861
///         decimal-constant digit
862
///       octal-constant:
863
///         0
864
///         octal-constant octal-digit
865
///       hexadecimal-constant:
866
///         hexadecimal-prefix hexadecimal-digit
867
///         hexadecimal-constant hexadecimal-digit
868
///       hexadecimal-prefix: one of
869
///         0x 0X
870
///       binary-literal:
871
///         0b binary-digit
872
///         0B binary-digit
873
///         binary-literal binary-digit
874
///       integer-suffix:
875
///         unsigned-suffix [long-suffix]
876
///         unsigned-suffix [long-long-suffix]
877
///         long-suffix [unsigned-suffix]
878
///         long-long-suffix [unsigned-suffix]
879
///       nonzero-digit:
880
///         1 2 3 4 5 6 7 8 9
881
///       octal-digit:
882
///         0 1 2 3 4 5 6 7
883
///       hexadecimal-digit:
884
///         0 1 2 3 4 5 6 7 8 9
885
///         a b c d e f
886
///         A B C D E F
887
///       binary-digit:
888
///         0
889
///         1
890
///       unsigned-suffix: one of
891
///         u U
892
///       long-suffix: one of
893
///         l L
894
///       long-long-suffix: one of
895
///         ll LL
896
///
897
///       floating-constant: [C99 6.4.4.2]
898
///         TODO: add rules...
899
///
900
NumericLiteralParser::NumericLiteralParser(StringRef TokSpelling,
901
                                           SourceLocation TokLoc,
902
                                           const SourceManager &SM,
903
                                           const LangOptions &LangOpts,
904
                                           const TargetInfo &Target,
905
                                           DiagnosticsEngine &Diags)
906
    : SM(SM), LangOpts(LangOpts), Diags(Diags),
907
7.83M
      ThisTokBegin(TokSpelling.begin()), ThisTokEnd(TokSpelling.end()) {
908
909
7.83M
  s = DigitsBegin = ThisTokBegin;
910
7.83M
  saw_exponent = false;
911
7.83M
  saw_period = false;
912
7.83M
  saw_ud_suffix = false;
913
7.83M
  saw_fixed_point_suffix = false;
914
7.83M
  isLong = false;
915
7.83M
  isUnsigned = false;
916
7.83M
  isLongLong = false;
917
7.83M
  isSizeT = false;
918
7.83M
  isHalf = false;
919
7.83M
  isFloat = false;
920
7.83M
  isImaginary = false;
921
7.83M
  isFloat16 = false;
922
7.83M
  isFloat128 = false;
923
7.83M
  MicrosoftInteger = 0;
924
7.83M
  isFract = false;
925
7.83M
  isAccum = false;
926
7.83M
  hadError = false;
927
7.83M
  isBitInt = false;
928
929
  // This routine assumes that the range begin/end matches the regex for integer
930
  // and FP constants (specifically, the 'pp-number' regex), and assumes that
931
  // the byte at "*end" is both valid and not part of the regex.  Because of
932
  // this, it doesn't have to check for 'overscan' in various places.
933
7.83M
  if (isPreprocessingNumberBody(*ThisTokEnd)) {
934
0
    Diags.Report(TokLoc, diag::err_lexing_numeric);
935
0
    hadError = true;
936
0
    return;
937
0
  }
938
939
7.83M
  if (*s == '0') { // parse radix
940
1.34M
    ParseNumberStartingWithZero(TokLoc);
941
1.34M
    if (hadError)
942
47
      return;
943
6.49M
  } else { // the first digit is non-zero
944
6.49M
    radix = 10;
945
6.49M
    s = SkipDigits(s);
946
6.49M
    if (s == ThisTokEnd) {
947
      // Done.
948
6.00M
    } else {
949
493k
      ParseDecimalOrOctalCommon(TokLoc);
950
493k
      if (hadError)
951
12
        return;
952
493k
    }
953
6.49M
  }
954
955
7.83M
  SuffixBegin = s;
956
7.83M
  checkSeparator(TokLoc, s, CSK_AfterDigits);
957
958
  // Initial scan to lookahead for fixed point suffix.
959
7.83M
  if (LangOpts.FixedPoint) {
960
1.81k
    for (const char *c = s; c != ThisTokEnd; 
++c886
) {
961
1.74k
      if (*c == 'r' || 
*c == 'k'1.52k
||
*c == 'R'886
||
*c == 'K'886
) {
962
858
        saw_fixed_point_suffix = true;
963
858
        break;
964
858
      }
965
1.74k
    }
966
928
  }
967
968
  // Parse the suffix.  At this point we can classify whether we have an FP or
969
  // integer constant.
970
7.83M
  bool isFixedPointConstant = isFixedPointLiteral();
971
7.83M
  bool isFPConstant = isFloatingLiteral();
972
7.83M
  bool HasSize = false;
973
974
  // Loop over all of the characters of the suffix.  If we see something bad,
975
  // we break out of the loop.
976
8.50M
  for (; s != ThisTokEnd; 
++s662k
) {
977
662k
    switch (*s) {
978
0
    case 'R':
979
224
    case 'r':
980
224
      if (!LangOpts.FixedPoint)
981
6
        break;
982
218
      if (isFract || 
isAccum217
)
break1
;
983
217
      if (!(saw_period || 
saw_exponent20
))
break14
;
984
203
      isFract = true;
985
203
      continue;
986
0
    case 'K':
987
632
    case 'k':
988
632
      if (!LangOpts.FixedPoint)
989
6
        break;
990
626
      if (isFract || 
isAccum625
)
break2
;
991
624
      if (!(saw_period || 
saw_exponent43
))
break16
;
992
608
      isAccum = true;
993
608
      continue;
994
516
    case 'h':      // FP Suffix for "half".
995
518
    case 'H':
996
      // OpenCL Extension v1.2 s9.5 - h or H suffix for half type.
997
518
      if (!(LangOpts.Half || 
LangOpts.FixedPoint495
))
998
3
        break;
999
515
      if (isIntegerLiteral()) 
break13
; // Error for integer constant.
1000
502
      if (HasSize)
1001
3
        break;
1002
499
      HasSize = true;
1003
499
      isHalf = true;
1004
499
      continue;  // Success.
1005
18.8k
    case 'f':      // FP Suffix for "float"
1006
22.7k
    case 'F':
1007
22.7k
      if (!isFPConstant) 
break4
; // Error for integer constant.
1008
22.7k
      if (HasSize)
1009
1
        break;
1010
22.7k
      HasSize = true;
1011
1012
      // CUDA host and device may have different _Float16 support, therefore
1013
      // allows f16 literals to avoid false alarm.
1014
      // When we compile for OpenMP target offloading on NVPTX, f16 suffix
1015
      // should also be supported.
1016
      // ToDo: more precise check for CUDA.
1017
      // TODO: AMDGPU might also support it in the future.
1018
22.7k
      if ((Target.hasFloat16Type() || 
LangOpts.CUDA2.24k
||
1019
22.7k
           
(2.23k
LangOpts.OpenMPIsTargetDevice2.23k
&&
Target.getTriple().isNVPTX()8
)) &&
1020
22.7k
          
s + 2 < ThisTokEnd20.4k
&&
s[1] == '1'1.73k
&&
s[2] == '6'1.72k
) {
1021
1.72k
        s += 2; // success, eat up 2 characters.
1022
1.72k
        isFloat16 = true;
1023
1.72k
        continue;
1024
1.72k
      }
1025
1026
20.9k
      isFloat = true;
1027
20.9k
      continue;  // Success.
1028
171
    case 'q':    // FP Suffix for "__float128"
1029
171
    case 'Q':
1030
171
      if (!isFPConstant) 
break1
; // Error for integer constant.
1031
170
      if (HasSize)
1032
0
        break;
1033
170
      HasSize = true;
1034
170
      isFloat128 = true;
1035
170
      continue;  // Success.
1036
18.2k
    case 'u':
1037
124k
    case 'U':
1038
124k
      if (isFPConstant) 
break8
; // Error for floating constant.
1039
124k
      if (isUnsigned) 
break1
; // Cannot be repeated.
1040
124k
      isUnsigned = true;
1041
124k
      continue;  // Success.
1042
6.14k
    case 'l':
1043
513k
    case 'L':
1044
513k
      if (HasSize)
1045
18
        break;
1046
513k
      HasSize = true;
1047
1048
      // Check for long long.  The L's need to be adjacent and the same case.
1049
513k
      if (s[1] == s[0]) {
1050
48.7k
        assert(s + 1 < ThisTokEnd && "didn't maximally munch?");
1051
48.7k
        if (isFPConstant) 
break0
; // long long invalid for floats.
1052
48.7k
        isLongLong = true;
1053
48.7k
        ++s;  // Eat both of them.
1054
464k
      } else {
1055
464k
        isLong = true;
1056
464k
      }
1057
513k
      continue; // Success.
1058
513k
    case 'z':
1059
150
    case 'Z':
1060
150
      if (isFPConstant)
1061
12
        break; // Invalid for floats.
1062
138
      if (HasSize)
1063
20
        break;
1064
118
      HasSize = true;
1065
118
      isSizeT = true;
1066
118
      continue;
1067
317
    case 'i':
1068
322
    case 'I':
1069
322
      if (LangOpts.MicrosoftExt && 
!isFPConstant71
) {
1070
        // Allow i8, i16, i32, and i64. First, look ahead and check if
1071
        // suffixes are Microsoft integers and not the imaginary unit.
1072
63
        uint8_t Bits = 0;
1073
63
        size_t ToSkip = 0;
1074
63
        switch (s[1]) {
1075
10
        case '8': // i8 suffix
1076
10
          Bits = 8;
1077
10
          ToSkip = 2;
1078
10
          break;
1079
9
        case '1':
1080
9
          if (s[2] == '6') { // i16 suffix
1081
9
            Bits = 16;
1082
9
            ToSkip = 3;
1083
9
          }
1084
9
          break;
1085
9
        case '3':
1086
9
          if (s[2] == '2') { // i32 suffix
1087
9
            Bits = 32;
1088
9
            ToSkip = 3;
1089
9
          }
1090
9
          break;
1091
32
        case '6':
1092
32
          if (s[2] == '4') { // i64 suffix
1093
32
            Bits = 64;
1094
32
            ToSkip = 3;
1095
32
          }
1096
32
          break;
1097
3
        default:
1098
3
          break;
1099
63
        }
1100
63
        if (Bits) {
1101
60
          if (HasSize)
1102
6
            break;
1103
54
          HasSize = true;
1104
54
          MicrosoftInteger = Bits;
1105
54
          s += ToSkip;
1106
54
          assert(s <= ThisTokEnd && "didn't maximally munch?");
1107
54
          break;
1108
54
        }
1109
63
      }
1110
322
      
[[fallthrough]];262
1111
389
    case 'j':
1112
389
    case 'J':
1113
389
      if (isImaginary) 
break0
; // Cannot be repeated.
1114
389
      isImaginary = true;
1115
389
      continue;  // Success.
1116
100
    case 'w':
1117
106
    case 'W':
1118
106
      if (isFPConstant)
1119
3
        break; // Invalid for floats.
1120
103
      if (HasSize)
1121
7
        break; // Invalid if we already have a size for the literal.
1122
1123
      // wb and WB are allowed, but a mixture of cases like Wb or wB is not. We
1124
      // explicitly do not support the suffix in C++ as an extension because a
1125
      // library-based UDL that resolves to a library type may be more
1126
      // appropriate there.
1127
96
      if (!LangOpts.CPlusPlus && 
(93
(93
s[0] == 'w'93
&&
s[1] == 'b'87
) ||
1128
93
          
(7
s[0] == 'W'7
&&
s[1] == 'B'6
))) {
1129
91
        isBitInt = true;
1130
91
        HasSize = true;
1131
91
        ++s; // Skip both characters (2nd char skipped on continue).
1132
91
        continue; // Success.
1133
91
      }
1134
662k
    }
1135
    // If we reached here, there was an error or a ud-suffix.
1136
555
    break;
1137
662k
  }
1138
1139
  // "i", "if", and "il" are user-defined suffixes in C++1y.
1140
7.83M
  if (s != ThisTokEnd || 
isImaginary7.83M
) {
1141
    // FIXME: Don't bother expanding UCNs if !tok.hasUCN().
1142
890
    expandUCNs(UDSuffixBuf, StringRef(SuffixBegin, ThisTokEnd - SuffixBegin));
1143
890
    if (isValidUDSuffix(LangOpts, UDSuffixBuf)) {
1144
357
      if (!isImaginary) {
1145
        // Any suffix pieces we might have parsed are actually part of the
1146
        // ud-suffix.
1147
311
        isLong = false;
1148
311
        isUnsigned = false;
1149
311
        isLongLong = false;
1150
311
        isSizeT = false;
1151
311
        isFloat = false;
1152
311
        isFloat16 = false;
1153
311
        isHalf = false;
1154
311
        isImaginary = false;
1155
311
        isBitInt = false;
1156
311
        MicrosoftInteger = 0;
1157
311
        saw_fixed_point_suffix = false;
1158
311
        isFract = false;
1159
311
        isAccum = false;
1160
311
      }
1161
1162
357
      saw_ud_suffix = true;
1163
357
      return;
1164
357
    }
1165
1166
533
    if (s != ThisTokEnd) {
1167
      // Report an error if there are any.
1168
190
      Diags.Report(Lexer::AdvanceToTokenCharacter(
1169
190
                       TokLoc, SuffixBegin - ThisTokBegin, SM, LangOpts),
1170
190
                   diag::err_invalid_suffix_constant)
1171
190
          << StringRef(SuffixBegin, ThisTokEnd - SuffixBegin)
1172
190
          << (isFixedPointConstant ? 
27
:
isFPConstant183
);
1173
190
      hadError = true;
1174
190
    }
1175
533
  }
1176
1177
7.83M
  if (!hadError && 
saw_fixed_point_suffix7.83M
) {
1178
808
    assert(isFract || isAccum);
1179
808
  }
1180
7.83M
}
1181
1182
/// ParseDecimalOrOctalCommon - This method is called for decimal or octal
1183
/// numbers. It issues an error for illegal digits, and handles floating point
1184
/// parsing. If it detects a floating point number, the radix is set to 10.
1185
543k
void NumericLiteralParser::ParseDecimalOrOctalCommon(SourceLocation TokLoc){
1186
543k
  assert((radix == 8 || radix == 10) && "Unexpected radix");
1187
1188
  // If we have a hex digit other than 'e' (which denotes a FP exponent) then
1189
  // the code is using an incorrect base.
1190
543k
  if (isHexDigit(*s) && 
*s != 'e'346
&&
*s != 'E'48
&&
1191
543k
      
!isValidUDSuffix(LangOpts, StringRef(s, ThisTokEnd - s))15
) {
1192
13
    Diags.Report(
1193
13
        Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM, LangOpts),
1194
13
        diag::err_invalid_digit)
1195
13
        << StringRef(s, 1) << (radix == 8 ? 
111
:
02
);
1196
13
    hadError = true;
1197
13
    return;
1198
13
  }
1199
1200
543k
  if (*s == '.') {
1201
71.3k
    checkSeparator(TokLoc, s, CSK_AfterDigits);
1202
71.3k
    s++;
1203
71.3k
    radix = 10;
1204
71.3k
    saw_period = true;
1205
71.3k
    checkSeparator(TokLoc, s, CSK_BeforeDigits);
1206
71.3k
    s = SkipDigits(s); // Skip suffix.
1207
71.3k
  }
1208
543k
  if (*s == 'e' || 
*s == 'E'534k
) { // exponent
1209
9.08k
    checkSeparator(TokLoc, s, CSK_AfterDigits);
1210
9.08k
    const char *Exponent = s;
1211
9.08k
    s++;
1212
9.08k
    radix = 10;
1213
9.08k
    saw_exponent = true;
1214
9.08k
    if (s != ThisTokEnd && 
(9.07k
*s == '+'9.07k
||
*s == '-'7.42k
))
s++8.74k
; // sign
1215
9.08k
    const char *first_non_digit = SkipDigits(s);
1216
9.08k
    if (containsDigits(s, first_non_digit)) {
1217
9.07k
      checkSeparator(TokLoc, s, CSK_BeforeDigits);
1218
9.07k
      s = first_non_digit;
1219
9.07k
    } else {
1220
6
      if (!hadError) {
1221
4
        Diags.Report(Lexer::AdvanceToTokenCharacter(
1222
4
                         TokLoc, Exponent - ThisTokBegin, SM, LangOpts),
1223
4
                     diag::err_exponent_has_no_digits);
1224
4
        hadError = true;
1225
4
      }
1226
6
      return;
1227
6
    }
1228
9.08k
  }
1229
543k
}
1230
1231
/// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved
1232
/// suffixes as ud-suffixes, because the diagnostic experience is better if we
1233
/// treat it as an invalid suffix.
1234
bool NumericLiteralParser::isValidUDSuffix(const LangOptions &LangOpts,
1235
1.89k
                                           StringRef Suffix) {
1236
1.89k
  if (!LangOpts.CPlusPlus11 || 
Suffix.empty()1.59k
)
1237
298
    return false;
1238
1239
  // By C++11 [lex.ext]p10, ud-suffixes starting with an '_' are always valid.
1240
1.59k
  if (Suffix[0] == '_')
1241
303
    return true;
1242
1243
  // In C++11, there are no library suffixes.
1244
1.29k
  if (!LangOpts.CPlusPlus14)
1245
36
    return false;
1246
1247
  // In C++14, "s", "h", "min", "ms", "us", and "ns" are used in the library.
1248
  // Per tweaked N3660, "il", "i", and "if" are also used in the library.
1249
  // In C++2a "d" and "y" are used in the library.
1250
1.26k
  return llvm::StringSwitch<bool>(Suffix)
1251
1.26k
      .Cases("h", "min", "s", true)
1252
1.26k
      .Cases("ms", "us", "ns", true)
1253
1.26k
      .Cases("il", "i", "if", true)
1254
1.26k
      .Cases("d", "y", LangOpts.CPlusPlus20)
1255
1.26k
      .Default(false);
1256
1.29k
}
1257
1258
void NumericLiteralParser::checkSeparator(SourceLocation TokLoc,
1259
                                          const char *Pos,
1260
8.00M
                                          CheckSeparatorKind IsAfterDigits) {
1261
8.00M
  if (IsAfterDigits == CSK_AfterDigits) {
1262
7.91M
    if (Pos == ThisTokBegin)
1263
563
      return;
1264
7.91M
    --Pos;
1265
7.91M
  } else 
if (80.9k
Pos == ThisTokEnd80.9k
)
1266
640
    return;
1267
1268
7.99M
  if (isDigitSeparator(*Pos)) {
1269
38
    Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, Pos - ThisTokBegin, SM,
1270
38
                                                LangOpts),
1271
38
                 diag::err_digit_separator_not_between_digits)
1272
38
        << IsAfterDigits;
1273
38
    hadError = true;
1274
38
  }
1275
7.99M
}
1276
1277
/// ParseNumberStartingWithZero - This method is called when the first character
1278
/// of the number is found to be a zero.  This means it is either an octal
1279
/// number (like '04'), a hex number ('0x123a'), a binary number ('0b1010'), or
1280
/// a floating point number (01239.123e4).  Eat the prefix, determining the
1281
/// radix etc.
1282
1.34M
void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) {
1283
1.34M
  assert(s[0] == '0' && "Invalid method call");
1284
1.34M
  s++;
1285
1286
1.34M
  int c1 = s[0];
1287
1288
  // Handle a hex number like 0x1234.
1289
1.34M
  if ((c1 == 'x' || 
c1 == 'X'318k
) &&
(1.02M
isHexDigit(s[1])1.02M
||
s[1] == '.'22
)) {
1290
1.02M
    s++;
1291
1.02M
    assert(s < ThisTokEnd && "didn't maximally munch?");
1292
1.02M
    radix = 16;
1293
1.02M
    DigitsBegin = s;
1294
1.02M
    s = SkipHexDigits(s);
1295
1.02M
    bool HasSignificandDigits = containsDigits(DigitsBegin, s);
1296
1.02M
    if (s == ThisTokEnd) {
1297
      // Done.
1298
945k
    } else 
if (79.6k
*s == '.'79.6k
) {
1299
227
      s++;
1300
227
      saw_period = true;
1301
227
      const char *floatDigitsBegin = s;
1302
227
      s = SkipHexDigits(s);
1303
227
      if (containsDigits(floatDigitsBegin, s))
1304
213
        HasSignificandDigits = true;
1305
227
      if (HasSignificandDigits)
1306
221
        checkSeparator(TokLoc, floatDigitsBegin, CSK_BeforeDigits);
1307
227
    }
1308
1309
1.02M
    if (!HasSignificandDigits) {
1310
6
      Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM,
1311
6
                                                  LangOpts),
1312
6
                   diag::err_hex_constant_requires)
1313
6
          << LangOpts.CPlusPlus << 1;
1314
6
      hadError = true;
1315
6
      return;
1316
6
    }
1317
1318
    // A binary exponent can appear with or without a '.'. If dotted, the
1319
    // binary exponent is required.
1320
1.02M
    if (*s == 'p' || 
*s == 'P'1.02M
) {
1321
259
      checkSeparator(TokLoc, s, CSK_AfterDigits);
1322
259
      const char *Exponent = s;
1323
259
      s++;
1324
259
      saw_exponent = true;
1325
259
      if (s != ThisTokEnd && 
(257
*s == '+'257
||
*s == '-'180
))
s++104
; // sign
1326
259
      const char *first_non_digit = SkipDigits(s);
1327
259
      if (!containsDigits(s, first_non_digit)) {
1328
4
        if (!hadError) {
1329
2
          Diags.Report(Lexer::AdvanceToTokenCharacter(
1330
2
                           TokLoc, Exponent - ThisTokBegin, SM, LangOpts),
1331
2
                       diag::err_exponent_has_no_digits);
1332
2
          hadError = true;
1333
2
        }
1334
4
        return;
1335
4
      }
1336
255
      checkSeparator(TokLoc, s, CSK_BeforeDigits);
1337
255
      s = first_non_digit;
1338
1339
255
      if (!LangOpts.HexFloats)
1340
23
        Diags.Report(TokLoc, LangOpts.CPlusPlus
1341
23
                                 ? 
diag::ext_hex_literal_invalid20
1342
23
                                 : 
diag::ext_hex_constant_invalid3
);
1343
232
      else if (LangOpts.CPlusPlus17)
1344
132
        Diags.Report(TokLoc, diag::warn_cxx17_hex_literal);
1345
1.02M
    } else if (saw_period) {
1346
2
      Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM,
1347
2
                                                  LangOpts),
1348
2
                   diag::err_hex_constant_requires)
1349
2
          << LangOpts.CPlusPlus << 0;
1350
2
      hadError = true;
1351
2
    }
1352
1.02M
    return;
1353
1.02M
  }
1354
1355
  // Handle simple binary numbers 0b01010
1356
318k
  if ((c1 == 'b' || 
c1 == 'B'318k
) &&
(109
s[1] == '0'109
||
s[1] == '1'86
)) {
1357
    // 0b101010 is a C++1y / GCC extension.
1358
102
    Diags.Report(TokLoc, LangOpts.CPlusPlus14
1359
102
                             ? 
diag::warn_cxx11_compat_binary_literal61
1360
102
                         : 
LangOpts.CPlusPlus41
?
diag::ext_binary_literal_cxx1415
1361
41
                                              : 
diag::ext_binary_literal26
);
1362
102
    ++s;
1363
102
    assert(s < ThisTokEnd && "didn't maximally munch?");
1364
102
    radix = 2;
1365
102
    DigitsBegin = s;
1366
102
    s = SkipBinaryDigits(s);
1367
102
    if (s == ThisTokEnd) {
1368
      // Done.
1369
77
    } else 
if (25
isHexDigit(*s)25
&&
1370
25
               
!isValidUDSuffix(LangOpts, StringRef(s, ThisTokEnd - s))3
) {
1371
2
      Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM,
1372
2
                                                  LangOpts),
1373
2
                   diag::err_invalid_digit)
1374
2
          << StringRef(s, 1) << 2;
1375
2
      hadError = true;
1376
2
    }
1377
    // Other suffixes will be diagnosed by the caller.
1378
102
    return;
1379
102
  }
1380
1381
  // For now, the radix is set to 8. If we discover that we have a
1382
  // floating point constant, the radix will change to 10. Octal floating
1383
  // point constants are not permitted (only decimal and hexadecimal).
1384
318k
  radix = 8;
1385
318k
  const char *PossibleNewDigitStart = s;
1386
318k
  s = SkipOctalDigits(s);
1387
  // When the value is 0 followed by a suffix (like 0wb), we want to leave 0
1388
  // as the start of the digits. So if skipping octal digits does not skip
1389
  // anything, we leave the digit start where it was.
1390
318k
  if (s != PossibleNewDigitStart)
1391
2.19k
    DigitsBegin = PossibleNewDigitStart;
1392
1393
318k
  if (s == ThisTokEnd)
1394
267k
    return; // Done, simple octal number like 01234
1395
1396
  // If we have some other non-octal digit that *is* a decimal digit, see if
1397
  // this is part of a floating point number like 094.123 or 09e1.
1398
50.4k
  if (isDigit(*s)) {
1399
3
    const char *EndDecimal = SkipDigits(s);
1400
3
    if (EndDecimal[0] == '.' || 
EndDecimal[0] == 'e'2
||
EndDecimal[0] == 'E'2
) {
1401
1
      s = EndDecimal;
1402
1
      radix = 10;
1403
1
    }
1404
3
  }
1405
1406
50.4k
  ParseDecimalOrOctalCommon(TokLoc);
1407
50.4k
}
1408
1409
7.76M
static bool alwaysFitsInto64Bits(unsigned Radix, unsigned NumDigits) {
1410
7.76M
  switch (Radix) {
1411
99
  case 2:
1412
99
    return NumDigits <= 64;
1413
276k
  case 8:
1414
276k
    return NumDigits <= 64 / 3; // Digits are groups of 3 bits.
1415
6.46M
  case 10:
1416
6.46M
    return NumDigits <= 19; // floor(log10(2^64))
1417
1.02M
  case 16:
1418
1.02M
    return NumDigits <= 64 / 4; // Digits are groups of 4 bits.
1419
0
  default:
1420
0
    llvm_unreachable("impossible Radix");
1421
7.76M
  }
1422
7.76M
}
1423
1424
/// GetIntegerValue - Convert this numeric literal value to an APInt that
1425
/// matches Val's input width.  If there is an overflow, set Val to the low bits
1426
/// of the result and return true.  Otherwise, return false.
1427
7.76M
bool NumericLiteralParser::GetIntegerValue(llvm::APInt &Val) {
1428
  // Fast path: Compute a conservative bound on the maximum number of
1429
  // bits per digit in this radix. If we can't possibly overflow a
1430
  // uint64 based on that bound then do the simple conversion to
1431
  // integer. This avoids the expensive overflow checking below, and
1432
  // handles the common cases that matter (small decimal integers and
1433
  // hex/octal values which don't overflow).
1434
7.76M
  const unsigned NumDigits = SuffixBegin - DigitsBegin;
1435
7.76M
  if (alwaysFitsInto64Bits(radix, NumDigits)) {
1436
7.76M
    uint64_t N = 0;
1437
31.1M
    for (const char *Ptr = DigitsBegin; Ptr != SuffixBegin; 
++Ptr23.4M
)
1438
23.4M
      if (!isDigitSeparator(*Ptr))
1439
23.4M
        N = N * radix + llvm::hexDigitValue(*Ptr);
1440
1441
    // This will truncate the value to Val's input width. Simply check
1442
    // for overflow by comparing.
1443
7.76M
    Val = N;
1444
7.76M
    return Val.getZExtValue() != N;
1445
7.76M
  }
1446
1447
605
  Val = 0;
1448
605
  const char *Ptr = DigitsBegin;
1449
1450
605
  llvm::APInt RadixVal(Val.getBitWidth(), radix);
1451
605
  llvm::APInt CharVal(Val.getBitWidth(), 0);
1452
605
  llvm::APInt OldVal = Val;
1453
1454
605
  bool OverflowOccurred = false;
1455
12.8k
  while (Ptr < SuffixBegin) {
1456
12.1k
    if (isDigitSeparator(*Ptr)) {
1457
74
      ++Ptr;
1458
74
      continue;
1459
74
    }
1460
1461
12.1k
    unsigned C = llvm::hexDigitValue(*Ptr++);
1462
1463
    // If this letter is out of bound for this radix, reject it.
1464
12.1k
    assert(C < radix && "NumericLiteralParser ctor should have rejected this");
1465
1466
12.1k
    CharVal = C;
1467
1468
    // Add the digit to the value in the appropriate radix.  If adding in digits
1469
    // made the value smaller, then this overflowed.
1470
12.1k
    OldVal = Val;
1471
1472
    // Multiply by radix, did overflow occur on the multiply?
1473
12.1k
    Val *= RadixVal;
1474
12.1k
    OverflowOccurred |= Val.udiv(RadixVal) != OldVal;
1475
1476
    // Add value, did overflow occur on the value?
1477
    //   (a + b) ult b  <=> overflow
1478
12.1k
    Val += CharVal;
1479
12.1k
    OverflowOccurred |= Val.ult(CharVal);
1480
12.1k
  }
1481
605
  return OverflowOccurred;
1482
605
}
1483
1484
llvm::APFloat::opStatus
1485
71.0k
NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) {
1486
71.0k
  using llvm::APFloat;
1487
1488
71.0k
  unsigned n = std::min(SuffixBegin - ThisTokBegin, ThisTokEnd - ThisTokBegin);
1489
1490
71.0k
  llvm::SmallString<16> Buffer;
1491
71.0k
  StringRef Str(ThisTokBegin, n);
1492
71.0k
  if (Str.contains('\'')) {
1493
6
    Buffer.reserve(n);
1494
6
    std::remove_copy_if(Str.begin(), Str.end(), std::back_inserter(Buffer),
1495
6
                        &isDigitSeparator);
1496
6
    Str = Buffer;
1497
6
  }
1498
1499
71.0k
  auto StatusOrErr =
1500
71.0k
      Result.convertFromString(Str, APFloat::rmNearestTiesToEven);
1501
71.0k
  assert(StatusOrErr && "Invalid floating point representation");
1502
71.0k
  return !errorToBool(StatusOrErr.takeError()) ? *StatusOrErr
1503
71.0k
                                               : 
APFloat::opInvalidOp0
;
1504
71.0k
}
1505
1506
276
static inline bool IsExponentPart(char c) {
1507
276
  return c == 'p' || 
c == 'P'238
||
c == 'e'234
||
c == 'E'213
;
1508
276
}
1509
1510
808
bool NumericLiteralParser::GetFixedPointValue(llvm::APInt &StoreVal, unsigned Scale) {
  // Computes the literal scaled by 2^Scale and stores it in StoreVal,
  // truncating or zero-extending to StoreVal's existing bit width.
  // Returns true if the value or the exponent overflowed.
  assert(radix == 16 || radix == 10);

  // Character count between the first digit and the suffix, used as an upper
  // bound on the digits to store. The radix point is not a digit.
  unsigned NumDigits = SuffixBegin - DigitsBegin;
  if (saw_period)
    --NumDigits;

  // Pre-scan the exponent, if there is one. The mantissa ends at the exponent
  // marker when present and at the suffix otherwise.
  bool ExpOverflowOccurred = false;
  uint64_t Exponent = 0;
  int64_t BaseShift = 0;
  const char *MantissaEnd = SuffixBegin;
  if (saw_exponent) {
    const char *Cursor = DigitsBegin;
    while (!IsExponentPart(*Cursor))
      ++Cursor;
    MantissaEnd = Cursor;

    // Step over the exponent marker and an optional minus sign.
    ++Cursor;
    const bool NegativeExponent = (*Cursor == '-');
    if (NegativeExponent)
      ++Cursor;

    const unsigned NumExpDigits = SuffixBegin - Cursor;
    if (!alwaysFitsInto64Bits(radix, NumExpDigits)) {
      ExpOverflowOccurred = true;
    } else {
      llvm::StringRef ExpStr(Cursor, NumExpDigits);
      llvm::APInt ExpInt(/*numBits=*/64, ExpStr, /*radix=*/10);
      Exponent = ExpInt.getZExtValue();
    }

    if (NegativeExponent)
      BaseShift -= Exponent;
    else
      BaseShift += Exponent;
  }

  // Bits needed for a decimal literal:
  //   ceil(NumDigits * log2(10))       Integral part
  // + Scale                            Fractional part
  // + ceil(Exponent * log2(10))        Exponent
  // --------------------------------------------------
  //   ceil((NumDigits + Exponent) * log2(10)) + Scale
  //
  // log2(10) is rounded up to 4 to keep the arithmetic integral, giving
  //   4 * (NumDigits + Exponent) + Scale
  //
  // Bits needed for a hexadecimal literal:
  //   4 * NumDigits                    Integral part
  // + Scale                            Fractional part
  // + Exponent                         Exponent
  // --------------------------------------------------
  //   (4 * NumDigits) + Scale + Exponent
  const uint64_t NumBitsNeeded = (radix == 10)
                                     ? 4 * (NumDigits + Exponent) + Scale
                                     : 4 * NumDigits + Exponent + Scale;

  // NOTE(review): when the width does not fit in 'unsigned' the overflow is
  // recorded, but the truncated width is still used for the working value.
  if (NumBitsNeeded > std::numeric_limits<unsigned>::max())
    ExpOverflowOccurred = true;
  llvm::APInt Val(static_cast<unsigned>(NumBitsNeeded), 0, /*isSigned=*/false);

  // Accumulate the mantissa digits as a plain integer, counting how many of
  // them sit to the right of the radix point.
  bool PastRadixPoint = false;
  int64_t FractBaseShift = 0;
  for (const char *Cursor = DigitsBegin; Cursor < MantissaEnd; ++Cursor) {
    if (*Cursor == '.') {
      PastRadixPoint = true;
      continue;
    }

    unsigned C = llvm::hexDigitValue(*Cursor);
    assert(C < radix && "NumericLiteralParser ctor should have rejected this");

    Val *= radix;
    Val += C;

    // Each fractional digit must later be undone by one division by the radix.
    if (PastRadixPoint)
      --FractBaseShift;
  }

  // Hexadecimal literals are rescaled in steps of 2 rather than 16, so every
  // fractional hex digit accounts for four steps.
  if (radix == 16)
    FractBaseShift *= 4;
  BaseShift += FractBaseShift;

  Val <<= Scale;

  const uint64_t Base = (radix == 16) ? 2 : 10;
  if (BaseShift > 0) {
    for (int64_t Remaining = BaseShift; Remaining > 0; --Remaining)
      Val *= Base;
  } else if (BaseShift < 0) {
    // Stop early once the value has collapsed to zero; further divisions
    // cannot change it.
    int64_t Remaining = BaseShift;
    while (Remaining < 0 && !Val.isZero()) {
      Val = Val.udiv(Base);
      ++Remaining;
    }
  }

  // Fit the working value into the caller's width, noting whether any
  // significant bits are lost.
  bool IntOverflowOccurred = false;
  const unsigned ValBits = Val.getBitWidth();
  const unsigned StoreBits = StoreVal.getBitWidth();
  auto MaxVal = llvm::APInt::getMaxValue(StoreBits);
  if (ValBits > StoreBits) {
    IntOverflowOccurred |= Val.ugt(MaxVal.zext(ValBits));
    StoreVal = Val.trunc(StoreBits);
  } else if (ValBits < StoreBits) {
    IntOverflowOccurred |= Val.zext(MaxVal.getBitWidth()).ugt(MaxVal);
    StoreVal = Val.zext(StoreBits);
  } else {
    StoreVal = Val;
  }

  return IntOverflowOccurred || ExpOverflowOccurred;
}
1625
1626
/// \verbatim
1627
///       user-defined-character-literal: [C++11 lex.ext]
1628
///         character-literal ud-suffix
1629
///       ud-suffix:
1630
///         identifier
1631
///       character-literal: [C++11 lex.ccon]
1632
///         ' c-char-sequence '
1633
///         u' c-char-sequence '
1634
///         U' c-char-sequence '
1635
///         L' c-char-sequence '
1636
///         u8' c-char-sequence ' [C++1z lex.ccon]
1637
///       c-char-sequence:
1638
///         c-char
1639
///         c-char-sequence c-char
1640
///       c-char:
1641
///         any member of the source character set except the single-quote ',
1642
///           backslash \, or new-line character
1643
///         escape-sequence
1644
///         universal-character-name
1645
///       escape-sequence:
1646
///         simple-escape-sequence
1647
///         octal-escape-sequence
1648
///         hexadecimal-escape-sequence
1649
///       simple-escape-sequence:
1650
///         one of \' \" \? \\ \a \b \f \n \r \t \v
1651
///       octal-escape-sequence:
1652
///         \ octal-digit
1653
///         \ octal-digit octal-digit
1654
///         \ octal-digit octal-digit octal-digit
1655
///       hexadecimal-escape-sequence:
1656
///         \x hexadecimal-digit
1657
///         hexadecimal-escape-sequence hexadecimal-digit
1658
///       universal-character-name: [C++11 lex.charset]
1659
///         \u hex-quad
1660
///         \U hex-quad hex-quad
1661
///       hex-quad:
1662
///         hex-digit hex-digit hex-digit hex-digit
1663
/// \endverbatim
1664
///
1665
CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
                                     SourceLocation Loc, Preprocessor &PP,
                                     tok::TokenKind kind) {
  // At this point we know that the character matches the regex "(L|u|U)?'.*'".
  HadError = false;
  Kind = kind;

  const char *TokBegin = begin;
  const TargetInfo &Target = PP.getTargetInfo();

  // Step over the encoding prefix: one character for every prefixed kind,
  // two for the "u8" prefix.
  if (Kind != tok::char_constant)
    begin += (Kind == tok::utf8_char_constant) ? 2 : 1;

  // The opening quote must come next.
  if (begin[0] != '\'') {
    PP.Diag(Loc, diag::err_lexing_char);
    HadError = true;
    return;
  }
  ++begin;

  // Strip an optional ud-suffix: walk back to the closing quote and record
  // the (UCN-expanded) suffix along with its offset inside the token.
  if (end[-1] != '\'') {
    const char *SuffixEnd = end;
    --end;
    while (end[-1] != '\'')
      --end;
    // FIXME: Don't bother with this if !tok.hasUCN().
    expandUCNs(UDSuffixBuf, StringRef(end, SuffixEnd - end));
    UDSuffixOffset = end - TokBegin;
  }

  // Drop the closing quote.
  assert(end != begin && "Invalid token lexed");
  --end;

  // FIXME: The "Value" is an uint64_t so we can handle char literals of
  // up to 64-bits.
  // FIXME: This extensively assumes that 'char' is 8-bits.
  assert(Target.getCharWidth() == 8 && "Assumes char is 8 bits");
  assert(Target.getIntWidth() <= 64 && (Target.getIntWidth() & 7) == 0 &&
         "Assumes sizeof(int) on target is <= 64 and a multiple of char");
  assert(Target.getWCharWidth() <= 64 &&
         "Assumes sizeof(wchar) on target is <= 64");

  // One slot per source byte is always enough: decoding never produces more
  // code points than there are bytes.
  SmallVector<uint32_t, 4> Codepoints;
  Codepoints.resize(end - begin);
  uint32_t *Out = &Codepoints.front();
  uint32_t *const OutEnd = Out + Codepoints.size();

  // Unicode escapes representing characters that cannot be correctly
  // represented in a single code unit are disallowed in character literals
  // by this implementation.
  uint32_t LargestCharForKind;
  switch (Kind) {
  case tok::wide_char_constant:
    LargestCharForKind = 0xFFFFFFFFu >> (32 - Target.getWCharWidth());
    break;
  case tok::utf16_char_constant:
    LargestCharForKind = 0xFFFF;
    break;
  case tok::utf32_char_constant:
    LargestCharForKind = 0x10FFFF;
    break;
  case tok::utf8_char_constant:
  default:
    LargestCharForKind = 0x7F;
    break;
  }

  while (begin != end) {
    if (begin[0] != '\\') {
      // A run of non-escape characters: find its end, then decode it as
      // UTF-8 in one go.
      const char *SpanBegin = begin;
      ++begin;
      while (begin != end && *begin != '\\')
        ++begin;

      const char *SavedIn = SpanBegin;
      uint32_t *SavedOut = Out;
      const llvm::ConversionResult Status = llvm::ConvertUTF8toUTF32(
          reinterpret_cast<llvm::UTF8 const **>(&SpanBegin),
          reinterpret_cast<llvm::UTF8 const *>(begin), &Out, OutEnd,
          llvm::strictConversion);

      if (Status == llvm::conversionOK) {
        // Every decoded code point must fit the literal's character kind.
        for (const uint32_t *CP = SavedOut; CP < Out; ++CP) {
          if (*CP > LargestCharForKind) {
            HadError = true;
            PP.Diag(Loc, diag::err_character_too_large);
          }
        }
      } else {
        // If we see bad encoding for unprefixed character literals, warn and
        // simply copy the byte values, for compatibility with gcc and
        // older versions of clang.
        const bool NoErrorOnBadEncoding = isOrdinary();
        PP.Diag(Loc, NoErrorOnBadEncoding
                         ? diag::warn_bad_character_encoding
                         : diag::err_bad_character_encoding);
        if (NoErrorOnBadEncoding) {
          Out = SavedOut;
          for (const char *In = SavedIn; In != begin; ++In, ++Out)
            *Out = static_cast<uint8_t>(*In);
        } else {
          HadError = true;
        }
      }
    } else if (begin[1] == 'u' || begin[1] == 'U' || begin[1] == 'N') {
      // Universal character name escape.
      unsigned short UcnLen = 0;
      if (!ProcessUCNEscape(TokBegin, begin, end, *Out, UcnLen,
                            FullSourceLoc(Loc, PP.getSourceManager()),
                            &PP.getDiagnostics(), PP.getLangOpts(), true)) {
        HadError = true;
      } else if (*Out > LargestCharForKind) {
        HadError = true;
        PP.Diag(Loc, diag::err_character_too_large);
      }
      ++Out;
    } else {
      // Any other escape sequence.
      unsigned CharWidth = getCharWidth(Kind, Target);
      uint64_t Escaped =
          ProcessCharEscape(TokBegin, begin, end, HadError,
                            FullSourceLoc(Loc, PP.getSourceManager()),
                            CharWidth, &PP.getDiagnostics(), PP.getLangOpts(),
                            StringLiteralEvalMethod::Evaluated);
      *Out = Escaped;
      ++Out;
    }
  }

  unsigned NumCharsSoFar = Out - &Codepoints.front();

  // Multi-character literals are tolerated (with a warning) only for the
  // ordinary kind; every other kind rejects them.
  if (NumCharsSoFar > 1) {
    if (!isOrdinary()) {
      PP.Diag(Loc, diag::err_multichar_character_literal)
          << (isWide() ? 0 : 1);
      HadError = true;
    } else if (NumCharsSoFar == 4) {
      PP.Diag(Loc, diag::warn_four_char_character_literal);
    } else {
      PP.Diag(Loc, diag::warn_multichar_character_literal);
    }
    IsMultiChar = true;
  } else {
    IsMultiChar = false;
  }

  llvm::APInt LitVal(Target.getIntWidth(), 0);

  // Narrow character literals act as though their value is concatenated
  // in this implementation, but warn on overflow.
  bool MultiCharTooLong = false;
  if (isOrdinary() && isMultiChar()) {
    LitVal = 0;
    for (const uint32_t *CP = Codepoints.begin(); CP != Out; ++CP) {
      // Fewer than eight leading zeros means the next shift drops bits.
      MultiCharTooLong |= (LitVal.countl_zero() < 8);
      LitVal <<= 8;
      LitVal = LitVal + (*CP & 0xFF);
    }
  } else if (NumCharsSoFar > 0) {
    // Otherwise the value is simply the last character.
    LitVal = Out[-1];
  }

  if (!HadError && MultiCharTooLong)
    PP.Diag(Loc, diag::warn_char_constant_too_large);

  // Transfer the value from APInt to uint64_t.
  Value = LitVal.getZExtValue();

  // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")
  // if 'char' is signed for this target (C99 6.4.4.4p10).  Note that multiple
  // character constants are not sign extended in this implementation:
  // '\xFF\xFF' = 65536 and '\x0\xFF' = 255, which matches GCC.
  if (isOrdinary() && NumCharsSoFar == 1 && (Value & 128) &&
      PP.getLangOpts().CharIsSigned)
    Value = static_cast<signed char>(Value);
}
1853
1854
/// \verbatim
1855
///       string-literal: [C++0x lex.string]
1856
///         encoding-prefix " [s-char-sequence] "
1857
///         encoding-prefix R raw-string
1858
///       encoding-prefix:
1859
///         u8
1860
///         u
1861
///         U
1862
///         L
1863
///       s-char-sequence:
1864
///         s-char
1865
///         s-char-sequence s-char
1866
///       s-char:
1867
///         any member of the source character set except the double-quote ",
1868
///           backslash \, or new-line character
1869
///         escape-sequence
1870
///         universal-character-name
1871
///       raw-string:
1872
///         " d-char-sequence ( r-char-sequence ) d-char-sequence "
1873
///       r-char-sequence:
1874
///         r-char
1875
///         r-char-sequence r-char
1876
///       r-char:
1877
///         any member of the source character set, except a right parenthesis )
1878
///           followed by the initial d-char-sequence (which may be empty)
1879
///           followed by a double quote ".
1880
///       d-char-sequence:
1881
///         d-char
1882
///         d-char-sequence d-char
1883
///       d-char:
1884
///         any member of the basic source character set except:
1885
///           space, the left parenthesis (, the right parenthesis ),
1886
///           the backslash \, and the control characters representing horizontal
1887
///           tab, vertical tab, form feed, and newline.
1888
///       escape-sequence: [C++0x lex.ccon]
1889
///         simple-escape-sequence
1890
///         octal-escape-sequence
1891
///         hexadecimal-escape-sequence
1892
///       simple-escape-sequence:
1893
///         one of \' \" \? \\ \a \b \f \n \r \t \v
1894
///       octal-escape-sequence:
1895
///         \ octal-digit
1896
///         \ octal-digit octal-digit
1897
///         \ octal-digit octal-digit octal-digit
1898
///       hexadecimal-escape-sequence:
1899
///         \x hexadecimal-digit
1900
///         hexadecimal-escape-sequence hexadecimal-digit
1901
///       universal-character-name:
1902
///         \u hex-quad
1903
///         \U hex-quad hex-quad
1904
///       hex-quad:
1905
///         hex-digit hex-digit hex-digit hex-digit
1906
/// \endverbatim
1907
///
1908
StringLiteralParser::StringLiteralParser(ArrayRef<Token> StringToks,
                                         Preprocessor &PP,
                                         StringLiteralEvalMethod EvalMethod)
    // Borrow the source manager, language options, target description and
    // diagnostics engine from the preprocessor.
    : SM(PP.getSourceManager()),
      Features(PP.getLangOpts()),
      Target(PP.getTargetInfo()),
      Diags(&PP.getDiagnostics()),
      // Placeholder state; init() recomputes all of it from the tokens.
      MaxTokenLength(0),
      SizeBound(0),
      CharByteWidth(0),
      Kind(tok::unknown),
      ResultPtr(ResultBuf.data()),
      EvalMethod(EvalMethod),
      hadError(false),
      Pascal(false) {
  // All of the actual parsing happens in init().
  init(StringToks);
}
1918
1919
10.1M
void StringLiteralParser::init(ArrayRef<Token> StringToks){
  // The literal token may have come from an invalid source location (e.g. due
  // to a PCH error), in which case the token length will be 0.
  if (StringToks.empty() || StringToks[0].getLength() < 2)
    return DiagnoseLexingError(SourceLocation());

  assert(!StringToks.empty() && "expected at least one token");
  assert(StringToks[0].getLength() >= 2 && "literal token is invalid!");

  // First pass: find the longest token, compute an upper bound on the length
  // of the concatenated string, and settle on the kind of the result. If any
  // piece is a wide-string literal, the result is one too [C99 6.4.5p4].
  //
  // NOTE(review): token 0 seeds SizeBound here and is added again by the loop
  // below, so the bound is looser than strictly necessary.
  const unsigned FirstTokLen = StringToks[0].getLength();
  MaxTokenLength = FirstTokLen;
  SizeBound = FirstTokLen - 2; // -2 for "".
  hadError = false;

  // Until a prefixed piece says otherwise, this is an ordinary string.
  Kind = tok::string_literal;

  // String literal concatenation (C99 5.1.1.2p1). The common case is only one
  // string fragment.
  for (const Token &Tok : StringToks) {
    const unsigned TokLen = Tok.getLength();
    if (TokLen < 2)
      return DiagnoseLexingError(Tok.getLocation());

    // The string could be shorter than this if it needs cleaning, but this is
    // a reasonable bound, which is all we need.
    SizeBound += TokLen - 2; // -2 for "".

    if (TokLen > MaxTokenLength)
      MaxTokenLength = TokLen;

    // Remember if we see any wide or utf-8/16/32 strings, and diagnose
    // illegal concatenations.
    const tok::TokenKind TokKind = Tok.getKind();
    if (isUnevaluated() && TokKind != tok::string_literal) {
      // Encoding prefixes are meaningless on unevaluated strings; offer a
      // fix-it that removes the prefix.
      if (Diags) {
        const SourceLocation TokLoc = Tok.getLocation();
        const auto PrefixLen = getEncodingPrefixLen(TokKind);
        SourceLocation PrefixEndLoc =
            Lexer::AdvanceToTokenCharacter(TokLoc, PrefixLen, SM, Features);
        CharSourceRange PrefixRange =
            CharSourceRange::getCharRange(SourceRange(TokLoc, PrefixEndLoc));
        StringRef Prefix(SM.getCharacterData(TokLoc), PrefixLen);
        Diags->Report(TokLoc, Features.CPlusPlus26
                                  ? diag::err_unevaluated_string_prefix
                                  : diag::warn_unevaluated_string_prefix)
            << Prefix << Features.CPlusPlus
            << FixItHint::CreateRemoval(PrefixRange);
      }
      if (Features.CPlusPlus26)
        hadError = true;
    } else if (Tok.isNot(Kind) && Tok.isNot(tok::string_literal)) {
      if (!isOrdinary()) {
        // Two different non-ordinary prefixes cannot be concatenated.
        if (Diags)
          Diags->Report(Tok.getLocation(), diag::err_unsupported_string_concat);
        hadError = true;
      } else {
        Kind = TokKind;
      }
    }
  }

  // Include space for the null terminator.
  ++SizeBound;

  // TODO: K&R warning: "traditional C rejects string constant concatenation"

  // Get the width in bytes of char/wchar_t/char16_t/char32_t.
  const unsigned CharBitWidth = getCharWidth(Kind, Target);
  assert((CharBitWidth & 7) == 0 && "Assumes character size is byte multiple");
  CharByteWidth = CharBitWidth / 8;

  // The output buffer size needs to be large enough to hold wide characters.
  // This is a worst-case assumption which basically corresponds to L"" "long".
  SizeBound *= CharByteWidth;

  // Size the temporary buffer to hold the result string data.
  ResultBuf.resize(SizeBound);

  // Likewise, but for each string piece.
  SmallString<512> TokenBuf;
  TokenBuf.resize(MaxTokenLength);

  // Second pass: get the spelling of each piece and append it to the result,
  // widening as appropriate.
  ResultPtr = &ResultBuf[0];   // Next byte to fill in.
  Pascal = false;

  SourceLocation UDSuffixTokLoc;

  for (unsigned i = 0, e = StringToks.size(); i != e; ++i) {
    const Token &CurTok = StringToks[i];

    // Get the spelling of the token, which eliminates trigraphs, etc.  The
    // scratch buffer is big enough for the whole token, and 'spelled' tokens
    // can only shrink.
    const char *Cursor = &TokenBuf[0];
    bool StringInvalid = false;
    unsigned SpellingLen =
        Lexer::getSpelling(CurTok, Cursor, SM, Features, &StringInvalid);
    if (StringInvalid)
      return DiagnoseLexingError(CurTok.getLocation());

    const char *SpellingBegin = Cursor;
    const char *SpellingEnd = Cursor + SpellingLen;

    // Remove an optional ud-suffix.
    if (SpellingEnd[-1] != '"') {
      const char *SuffixEnd = SpellingEnd;
      --SpellingEnd;
      while (SpellingEnd[-1] != '"')
        --SpellingEnd;

      StringRef UDSuffix(SpellingEnd, SuffixEnd - SpellingEnd);

      if (UDSuffixBuf.empty()) {
        // First suffix seen: remember it and where it came from.
        if (CurTok.hasUCN())
          expandUCNs(UDSuffixBuf, UDSuffix);
        else
          UDSuffixBuf.assign(UDSuffix);
        UDSuffixToken = i;
        UDSuffixOffset = SpellingEnd - Cursor;
        UDSuffixTokLoc = CurTok.getLocation();
      } else {
        SmallString<32> ExpandedUDSuffix;
        if (CurTok.hasUCN()) {
          expandUCNs(ExpandedUDSuffix, UDSuffix);
          UDSuffix = ExpandedUDSuffix;
        }

        // C++11 [lex.ext]p8: At the end of phase 6, if a string literal is the
        // result of a concatenation involving at least one user-defined-string-
        // literal, all the participating user-defined-string-literals shall
        // have the same ud-suffix.
        const bool UnevaluatedStringHasUDL =
            isUnevaluated() && !UDSuffix.empty();
        if (UDSuffixBuf != UDSuffix || UnevaluatedStringHasUDL) {
          if (Diags) {
            SourceLocation SuffixTokLoc = CurTok.getLocation();
            if (UnevaluatedStringHasUDL) {
              Diags->Report(SuffixTokLoc, diag::err_unevaluated_string_udl)
                  << SourceRange(SuffixTokLoc, SuffixTokLoc);
            } else {
              Diags->Report(SuffixTokLoc, diag::err_string_concat_mixed_suffix)
                  << UDSuffixBuf << UDSuffix
                  << SourceRange(UDSuffixTokLoc, UDSuffixTokLoc);
            }
          }
          hadError = true;
        }
      }
    }

    // Strip the end quote.
    --SpellingEnd;

    // TODO: Input character set mapping support.

    // Skip marker for wide or unicode strings, including the '8' of "u8".
    if (Cursor[0] == 'L' || Cursor[0] == 'u' || Cursor[0] == 'U') {
      ++Cursor;
      if (Cursor[0] == '8')
        ++Cursor;
    }

    if (Cursor[0] == 'R') {
      // Raw string literal.
      if (Cursor[1] != '"') {
        // The file may have come from PCH and then changed after loading the
        // PCH; Fail gracefully.
        return DiagnoseLexingError(CurTok.getLocation());
      }
      Cursor += 2; // skip R"

      // C++11 [lex.string]p2: A `d-char-sequence` shall consist of at most 16
      // characters.
      constexpr unsigned MaxRawStrDelimLen = 16;

      const char *DelimBegin = Cursor;
      while (static_cast<unsigned>(Cursor - DelimBegin) < MaxRawStrDelimLen &&
             Cursor[0] != '(')
        ++Cursor;
      if (Cursor[0] != '(')
        return DiagnoseLexingError(CurTok.getLocation());
      ++Cursor; // skip '('

      // Remove the same number of characters (delimiter plus parenthesis)
      // from the end.
      SpellingEnd -= Cursor - DelimBegin;
      if (SpellingEnd < Cursor)
        return DiagnoseLexingError(CurTok.getLocation());

      // C++14 [lex.string]p4: A source-file new-line in a raw string literal
      // results in a new-line in the resulting execution string-literal.
      StringRef Remaining(Cursor, SpellingEnd - Cursor);
      while (!Remaining.empty()) {
        // Split the string literal on \r\n boundaries.
        const size_t CRLFPos = Remaining.find("\r\n");
        StringRef Head = Remaining.substr(0, CRLFPos);
        StringRef Tail = Remaining.substr(CRLFPos);

        // Copy everything before the \r\n sequence into the string literal.
        if (CopyStringFragment(CurTok, SpellingBegin, Head))
          hadError = true;

        // Drop the \r so the next round starts at the \n of the \r\n pair.
        Remaining = Tail.substr(1);
      }
    } else {
      if (Cursor[0] != '"') {
        // The file may have come from PCH and then changed after loading the
        // PCH; Fail gracefully.
        return DiagnoseLexingError(CurTok.getLocation());
      }
      ++Cursor; // skip "

      // Check if this is a pascal string.
      if (!isUnevaluated() && Features.PascalStrings &&
          Cursor + 1 != SpellingEnd && Cursor[0] == '\\' && Cursor[1] == 'p') {
        // A \p at the start of the first token makes this a pascal string;
        // the 'p' is left in place so that its slot becomes the length byte.
        // In later pieces of a pascal string the leading \p is skipped.
        if (i == 0) {
          ++Cursor;
          Pascal = true;
        } else if (Pascal) {
          Cursor += 2;
        }
      }

      while (Cursor != SpellingEnd) {
        if (Cursor[0] != '\\') {
          // A span of non-escape characters: copy it over in one piece.
          const char *SpanBegin = Cursor;
          ++Cursor;
          while (Cursor != SpellingEnd && Cursor[0] != '\\')
            ++Cursor;

          if (CopyStringFragment(CurTok, SpellingBegin,
                                 StringRef(SpanBegin, Cursor - SpanBegin)))
            hadError = true;
        } else if (Cursor[1] == 'u' || Cursor[1] == 'U' || Cursor[1] == 'N') {
          // Universal character name escape.
          EncodeUCNEscape(SpellingBegin, Cursor, SpellingEnd, ResultPtr,
                          hadError, FullSourceLoc(CurTok.getLocation(), SM),
                          CharByteWidth, Diags, Features);
        } else {
          // Otherwise, this is a non-UCN escape character.  Process it.
          unsigned ResultChar = ProcessCharEscape(
              SpellingBegin, Cursor, SpellingEnd, hadError,
              FullSourceLoc(CurTok.getLocation(), SM), CharByteWidth * 8,
              Diags, Features, EvalMethod);

          // FIXME: Make the type of the result buffer correct instead of
          // using reinterpret_cast.
          switch (CharByteWidth) {
          case 4: {
            llvm::UTF32 *Wide = reinterpret_cast<llvm::UTF32 *>(ResultPtr);
            *Wide = ResultChar;
            ResultPtr += 4;
            break;
          }
          case 2: {
            llvm::UTF16 *Wide = reinterpret_cast<llvm::UTF16 *>(ResultPtr);
            *Wide = ResultChar & 0xFFFF;
            ResultPtr += 2;
            break;
          }
          default:
            assert(CharByteWidth == 1 && "Unexpected char width");
            *ResultPtr++ = ResultChar & 0xFF;
            break;
          }
        }
      }
    }
  }

  assert((!Pascal || !isUnevaluated()) &&
         "Pascal string in unevaluated context");
  if (Pascal) {
    // Store the character count (excluding the length slot itself) in the
    // first code unit.
    // FIXME: Make the type of the result buffer correct instead of
    // using reinterpret_cast.
    switch (CharByteWidth) {
    case 4: {
      llvm::UTF32 *Wide = reinterpret_cast<llvm::UTF32 *>(ResultBuf.data());
      Wide[0] = GetNumStringChars() - 1;
      break;
    }
    case 2: {
      llvm::UTF16 *Wide = reinterpret_cast<llvm::UTF16 *>(ResultBuf.data());
      Wide[0] = GetNumStringChars() - 1;
      break;
    }
    default:
      assert(CharByteWidth == 1 && "Unexpected char width");
      ResultBuf[0] = GetNumStringChars() - 1;
      break;
    }

    // Verify that pascal strings aren't too large.
    if (GetStringLength() > 256) {
      if (Diags)
        Diags->Report(StringToks.front().getLocation(),
                      diag::err_pascal_string_too_long)
            << SourceRange(StringToks.front().getLocation(),
                           StringToks.back().getLocation());
      hadError = true;
      return;
    }
  } else if (Diags) {
    // Complain if this string literal has too many characters. The limit and
    // the diagnostic's selector both depend on the language mode.
    unsigned MaxChars = 509;
    int StandardSelect = 0;
    if (Features.CPlusPlus) {
      MaxChars = 65536;
      StandardSelect = 2;
    } else if (Features.C99) {
      MaxChars = 4095;
      StandardSelect = 1;
    }

    if (GetNumStringChars() > MaxChars)
      Diags->Report(StringToks.front().getLocation(),
                    diag::ext_string_too_long)
          << GetNumStringChars() << MaxChars << StandardSelect
          << SourceRange(StringToks.front().getLocation(),
                         StringToks.back().getLocation());
  }
}
2240
2241
212
static const char *resyncUTF8(const char *Err, const char *End) {
2242
212
  if (Err == End)
2243
0
    return End;
2244
212
  End = Err + std::min<unsigned>(llvm::getNumBytesForUTF8(*Err), End-Err);
2245
239
  while (++Err != End && 
(*Err & 0xC0) == 0x80186
)
2246
27
    ;
2247
212
  return Err;
2248
212
}
2249
2250
/// This function copies from Fragment, which is a sequence of bytes
2251
/// within Tok's contents (which begin at TokBegin) into ResultPtr.
2252
/// Performs widening for multi-byte characters.
2253
bool StringLiteralParser::CopyStringFragment(const Token &Tok,
2254
                                             const char *TokBegin,
2255
10.1M
                                             StringRef Fragment) {
2256
10.1M
  const llvm::UTF8 *ErrorPtrTmp;
2257
10.1M
  if (ConvertUTF8toWide(CharByteWidth, Fragment, ResultPtr, ErrorPtrTmp))
2258
10.1M
    return false;
2259
2260
  // If we see bad encoding for unprefixed string literals, warn and
2261
  // simply copy the byte values, for compatibility with gcc and older
2262
  // versions of clang.
2263
26
  bool NoErrorOnBadEncoding = isOrdinary();
2264
26
  if (NoErrorOnBadEncoding) {
2265
20
    memcpy(ResultPtr, Fragment.data(), Fragment.size());
2266
20
    ResultPtr += Fragment.size();
2267
20
  }
2268
2269
34
  if (
Diags26
) {
2270
34
    const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
2271
2272
34
    FullSourceLoc SourceLoc(Tok.getLocation(), SM);
2273
34
    const DiagnosticBuilder &Builder =
2274
34
      Diag(Diags, Features, SourceLoc, TokBegin,
2275
34
           ErrorPtr, resyncUTF8(ErrorPtr, Fragment.end()),
2276
34
           NoErrorOnBadEncoding ? 
diag::warn_bad_string_encoding20
2277
34
                                : 
diag::err_bad_string_encoding14
);
2278
2279
34
    const char *NextStart = resyncUTF8(ErrorPtr, Fragment.end());
2280
34
    StringRef NextFragment(NextStart, Fragment.end()-NextStart);
2281
2282
    // Decode into a dummy buffer.
2283
34
    SmallString<512> Dummy;
2284
34
    Dummy.reserve(Fragment.size() * CharByteWidth);
2285
34
    char *Ptr = Dummy.data();
2286
2287
178
    while (!ConvertUTF8toWide(CharByteWidth, NextFragment, Ptr, ErrorPtrTmp)) {
2288
144
      const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
2289
144
      NextStart = resyncUTF8(ErrorPtr, Fragment.end());
2290
144
      Builder << MakeCharSourceRange(Features, SourceLoc, TokBegin,
2291
144
                                     ErrorPtr, NextStart);
2292
144
      NextFragment = StringRef(NextStart, Fragment.end()-NextStart);
2293
144
    }
2294
34
  }
2295
26
  return !NoErrorOnBadEncoding;
2296
10.1M
}
2297
2298
0
/// Record that this literal is erroneous and, when a diagnostics engine is
/// available, report err_lexing_string at \p Loc.
void StringLiteralParser::DiagnoseLexingError(SourceLocation Loc) {
  hadError = true;
  if (!Diags)
    return;
  Diags->Report(Loc, diag::err_lexing_string);
}
2303
2304
/// getOffsetOfStringByte - This function returns the offset of the
2305
/// specified byte of the string data represented by Token.  This handles
2306
/// advancing over escape sequences in the string.
2307
unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
2308
31.9k
                                                    unsigned ByteNo) const {
2309
  // Get the spelling of the token.
2310
31.9k
  SmallString<32> SpellingBuffer;
2311
31.9k
  SpellingBuffer.resize(Tok.getLength());
2312
2313
31.9k
  bool StringInvalid = false;
2314
31.9k
  const char *SpellingPtr = &SpellingBuffer[0];
2315
31.9k
  unsigned TokLen = Lexer::getSpelling(Tok, SpellingPtr, SM, Features,
2316
31.9k
                                       &StringInvalid);
2317
31.9k
  if (StringInvalid)
2318
0
    return 0;
2319
2320
31.9k
  const char *SpellingStart = SpellingPtr;
2321
31.9k
  const char *SpellingEnd = SpellingPtr+TokLen;
2322
2323
  // Handle UTF-8 strings just like narrow strings.
2324
31.9k
  if (SpellingPtr[0] == 'u' && 
SpellingPtr[1] == '8'3
)
2325
3
    SpellingPtr += 2;
2326
2327
31.9k
  assert(SpellingPtr[0] != 'L' && SpellingPtr[0] != 'u' &&
2328
31.9k
         SpellingPtr[0] != 'U' && "Doesn't handle wide or utf strings yet");
2329
2330
  // For raw string literals, this is easy.
2331
31.9k
  if (SpellingPtr[0] == 'R') {
2332
6
    assert(SpellingPtr[1] == '"' && "Should be a raw string literal!");
2333
    // Skip 'R"'.
2334
6
    SpellingPtr += 2;
2335
35
    while (*SpellingPtr != '(') {
2336
29
      ++SpellingPtr;
2337
29
      assert(SpellingPtr < SpellingEnd && "Missing ( for raw string literal");
2338
29
    }
2339
    // Skip '('.
2340
6
    ++SpellingPtr;
2341
6
    return SpellingPtr - SpellingStart + ByteNo;
2342
6
  }
2343
2344
  // Skip over the leading quote
2345
31.9k
  assert(SpellingPtr[0] == '"' && "Should be a string literal!");
2346
31.9k
  ++SpellingPtr;
2347
2348
  // Skip over bytes until we find the offset we're looking for.
2349
467k
  while (ByteNo) {
2350
435k
    assert(SpellingPtr < SpellingEnd && "Didn't find byte offset!");
2351
2352
    // Step over non-escapes simply.
2353
435k
    if (*SpellingPtr != '\\') {
2354
434k
      ++SpellingPtr;
2355
434k
      --ByteNo;
2356
434k
      continue;
2357
434k
    }
2358
2359
    // Otherwise, this is an escape character.  Advance over it.
2360
846
    bool HadError = false;
2361
846
    if (SpellingPtr[1] == 'u' || 
SpellingPtr[1] == 'U'843
||
2362
846
        
SpellingPtr[1] == 'N'840
) {
2363
6
      const char *EscapePtr = SpellingPtr;
2364
6
      unsigned Len = MeasureUCNEscape(SpellingStart, SpellingPtr, SpellingEnd,
2365
6
                                      1, Features, HadError);
2366
6
      if (Len > ByteNo) {
2367
        // ByteNo is somewhere within the escape sequence.
2368
6
        SpellingPtr = EscapePtr;
2369
6
        break;
2370
6
      }
2371
0
      ByteNo -= Len;
2372
840
    } else {
2373
840
      ProcessCharEscape(SpellingStart, SpellingPtr, SpellingEnd, HadError,
2374
840
                        FullSourceLoc(Tok.getLocation(), SM), CharByteWidth * 8,
2375
840
                        Diags, Features, StringLiteralEvalMethod::Evaluated);
2376
840
      --ByteNo;
2377
840
    }
2378
840
    assert(!HadError && "This method isn't valid on erroneous strings");
2379
840
  }
2380
2381
31.9k
  return SpellingPtr-SpellingStart;
2382
31.9k
}
2383
2384
/// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved
2385
/// suffixes as ud-suffixes, because the diagnostic experience is better if we
2386
/// treat it as an invalid suffix.
2387
bool StringLiteralParser::isValidUDSuffix(const LangOptions &LangOpts,
2388
989
                                          StringRef Suffix) {
2389
989
  return NumericLiteralParser::isValidUDSuffix(LangOpts, Suffix) ||
2390
989
         
Suffix == "sv"178
;
2391
989
}