Coverage Report

Created: 2022-01-25 06:29

/Users/buildslave/jenkins/workspace/coverage/llvm-project/clang/lib/Lex/LiteralSupport.cpp
Line
Count
Source (jump to first uncovered line)
1
//===--- LiteralSupport.cpp - Code to parse and process literals ----------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
// This file implements the NumericLiteralParser, CharLiteralParser, and
10
// StringLiteralParser interfaces.
11
//
12
//===----------------------------------------------------------------------===//
13
14
#include "clang/Lex/LiteralSupport.h"
15
#include "clang/Basic/CharInfo.h"
16
#include "clang/Basic/LangOptions.h"
17
#include "clang/Basic/SourceLocation.h"
18
#include "clang/Basic/TargetInfo.h"
19
#include "clang/Lex/LexDiagnostic.h"
20
#include "clang/Lex/Lexer.h"
21
#include "clang/Lex/Preprocessor.h"
22
#include "clang/Lex/Token.h"
23
#include "llvm/ADT/APInt.h"
24
#include "llvm/ADT/SmallVector.h"
25
#include "llvm/ADT/StringExtras.h"
26
#include "llvm/ADT/StringSwitch.h"
27
#include "llvm/Support/ConvertUTF.h"
28
#include "llvm/Support/Error.h"
29
#include "llvm/Support/ErrorHandling.h"
30
#include <algorithm>
31
#include <cassert>
32
#include <cstddef>
33
#include <cstdint>
34
#include <cstring>
35
#include <string>
36
37
using namespace clang;
38
39
5.89M
/// Return the width, as reported by \p Target, of one character of the
/// literal described by \p kind.
///
/// Ordinary and UTF-8 literals use the target's char width; wide, UTF-16 and
/// UTF-32 literals use the wchar_t, char16_t and char32_t widths respectively.
/// Any other token kind is a programming error.
static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) {
  switch (kind) {
  // Narrow literals, with or without a u8 prefix.
  case tok::char_constant:
  case tok::utf8_char_constant:
  case tok::string_literal:
  case tok::utf8_string_literal:
    return Target.getCharWidth();

  // L'...' and L"...".
  case tok::wide_char_constant:
  case tok::wide_string_literal:
    return Target.getWCharWidth();

  // u'...' and u"...".
  case tok::utf16_char_constant:
  case tok::utf16_string_literal:
    return Target.getChar16Width();

  // U'...' and U"...".
  case tok::utf32_char_constant:
  case tok::utf32_string_literal:
    return Target.getChar32Width();

  default:
    llvm_unreachable("Unknown token type!");
  }
}
58
59
/// Build the character range covering [\p TokRangeBegin, \p TokRangeEnd).
///
/// Both pointers are positions inside the spelling buffer that starts at
/// \p TokBegin for the token located at \p TokLoc; the pointer offsets are
/// translated to source locations with Lexer::AdvanceToTokenCharacter.
static CharSourceRange MakeCharSourceRange(const LangOptions &Features,
                                           FullSourceLoc TokLoc,
                                           const char *TokBegin,
                                           const char *TokRangeBegin,
                                           const char *TokRangeEnd) {
  const auto &SrcMgr = TokLoc.getManager();

  // Offset of the range start, measured from the start of the token.
  SourceLocation RangeStart = Lexer::AdvanceToTokenCharacter(
      TokLoc, TokRangeBegin - TokBegin, SrcMgr, Features);

  // The end is found by advancing from the start by the length of the range.
  SourceLocation RangeStop = Lexer::AdvanceToTokenCharacter(
      RangeStart, TokRangeEnd - TokRangeBegin, SrcMgr, Features);

  return CharSourceRange::getCharRange(RangeStart, RangeStop);
}
72
73
/// Produce a diagnostic highlighting some portion of a literal.
74
///
75
/// Emits the diagnostic \p DiagID, highlighting the range of characters from
76
/// \p TokRangeBegin (inclusive) to \p TokRangeEnd (exclusive), which must be
77
/// a substring of a spelling buffer for the token beginning at \p TokBegin.
78
static DiagnosticBuilder Diag(DiagnosticsEngine *Diags,
79
                              const LangOptions &Features, FullSourceLoc TokLoc,
80
                              const char *TokBegin, const char *TokRangeBegin,
81
401
                              const char *TokRangeEnd, unsigned DiagID) {
82
401
  SourceLocation Begin =
83
401
    Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin,
84
401
                                   TokLoc.getManager(), Features);
85
401
  return Diags->Report(Begin, DiagID) <<
86
401
    MakeCharSourceRange(Features, TokLoc, TokBegin, TokRangeBegin, TokRangeEnd);
87
401
}
88
89
/// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
90
/// either a character or a string literal.
91
static unsigned ProcessCharEscape(const char *ThisTokBegin,
92
                                  const char *&ThisTokBuf,
93
                                  const char *ThisTokEnd, bool &HadError,
94
                                  FullSourceLoc Loc, unsigned CharWidth,
95
                                  DiagnosticsEngine *Diags,
96
104k
                                  const LangOptions &Features) {
97
104k
  const char *EscapeBegin = ThisTokBuf;
98
104k
  bool Delimited = false;
99
104k
  bool EndDelimiterFound = false;
100
101
  // Skip the '\' char.
102
104k
  ++ThisTokBuf;
103
104
  // We know that this character can't be off the end of the buffer, because
105
  // that would have been \", which would not have been the end of string.
106
104k
  unsigned ResultChar = *ThisTokBuf++;
107
104k
  switch (ResultChar) {
108
  // These map to themselves.
109
978
  
case '\\': 380
case '\'': 413
case '"': 977
case '?': break;
110
111
    // These have fixed mappings.
112
26
  case 'a':
113
    // TODO: K&R: the meaning of '\\a' is different in traditional C
114
26
    ResultChar = 7;
115
26
    break;
116
25
  case 'b':
117
25
    ResultChar = 8;
118
25
    break;
119
15
  case 'e':
120
15
    if (Diags)
121
15
      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
122
15
           diag::ext_nonstandard_escape) << "e";
123
15
    ResultChar = 27;
124
15
    break;
125
1
  case 'E':
126
1
    if (Diags)
127
1
      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
128
1
           diag::ext_nonstandard_escape) << "E";
129
1
    ResultChar = 27;
130
1
    break;
131
17
  case 'f':
132
17
    ResultChar = 12;
133
17
    break;
134
92.8k
  case 'n':
135
92.8k
    ResultChar = 10;
136
92.8k
    break;
137
63
  case 'r':
138
63
    ResultChar = 13;
139
63
    break;
140
6.10k
  case 't':
141
6.10k
    ResultChar = 9;
142
6.10k
    break;
143
29
  case 'v':
144
29
    ResultChar = 11;
145
29
    break;
146
1.21k
  case 'x': { // Hex escape.
147
1.21k
    ResultChar = 0;
148
1.21k
    if (ThisTokBuf != ThisTokEnd && 
*ThisTokBuf == '{'1.21k
) {
149
56
      Delimited = true;
150
56
      ThisTokBuf++;
151
56
      if (*ThisTokBuf == '}') {
152
4
        Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
153
4
             diag::err_delimited_escape_empty);
154
4
        return ResultChar;
155
4
      }
156
1.16k
    } else if (ThisTokBuf == ThisTokEnd || 
!isHexDigit(*ThisTokBuf)1.15k
) {
157
6
      if (Diags)
158
6
        Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
159
6
             diag::err_hex_escape_no_digits) << "x";
160
6
      return ResultChar;
161
6
    }
162
163
    // Hex escapes are a maximal series of hex digits.
164
1.20k
    bool Overflow = false;
165
3.97k
    for (; ThisTokBuf != ThisTokEnd; 
++ThisTokBuf2.76k
) {
166
3.10k
      if (Delimited && 
*ThisTokBuf == '}'260
) {
167
36
        ThisTokBuf++;
168
36
        EndDelimiterFound = true;
169
36
        break;
170
36
      }
171
3.06k
      int CharVal = llvm::hexDigitValue(*ThisTokBuf);
172
3.06k
      if (CharVal == -1) {
173
        // Non delimited hex escape sequences stop at the first non-hex digit.
174
328
        if (!Delimited)
175
300
          break;
176
28
        HadError = true;
177
28
        if (Diags)
178
28
          Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
179
28
               diag::err_delimited_escape_invalid)
180
28
              << StringRef(ThisTokBuf, 1);
181
28
        continue;
182
328
      }
183
      // About to shift out a digit?
184
2.74k
      if (ResultChar & 0xF0000000)
185
6
        Overflow = true;
186
2.74k
      ResultChar <<= 4;
187
2.74k
      ResultChar |= CharVal;
188
2.74k
    }
189
    // See if any bits will be truncated when evaluated as a character.
190
1.20k
    if (CharWidth != 32 && 
(ResultChar >> CharWidth) != 01.10k
) {
191
14
      Overflow = true;
192
14
      ResultChar &= ~0U >> (32-CharWidth);
193
14
    }
194
195
    // Check for overflow.
196
1.20k
    if (!HadError && 
Overflow1.19k
) { // Too many digits to fit in
197
12
      HadError = true;
198
12
      if (Diags)
199
12
        Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
200
12
             diag::err_escape_too_large)
201
12
            << 0;
202
12
    }
203
1.20k
    break;
204
1.21k
  }
205
2.70k
  
case '0': 2.30k
case '1': 2.51k
case '2': 2.69k
case '3':
206
2.70k
  
case '4': 2.70k
case '5': 2.70k
case '6': 2.70k
case '7': {
207
    // Octal escapes.
208
2.70k
    --ThisTokBuf;
209
2.70k
    ResultChar = 0;
210
211
    // Octal escapes are a series of octal digits with maximum length 3.
212
    // "\0123" is a two digit sequence equal to "\012" "3".
213
2.70k
    unsigned NumDigits = 0;
214
2.97k
    do {
215
2.97k
      ResultChar <<= 3;
216
2.97k
      ResultChar |= *ThisTokBuf++ - '0';
217
2.97k
      ++NumDigits;
218
2.97k
    } while (ThisTokBuf != ThisTokEnd && 
NumDigits < 31.84k
&&
219
2.97k
             
ThisTokBuf[0] >= '0'1.82k
&&
ThisTokBuf[0] <= '7'1.76k
);
220
221
    // Check for overflow.  Reject '\777', but not L'\777'.
222
2.70k
    if (CharWidth != 32 && 
(ResultChar >> CharWidth) != 02.51k
) {
223
1
      if (Diags)
224
1
        Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
225
1
             diag::err_escape_too_large) << 1;
226
1
      ResultChar &= ~0U >> (32-CharWidth);
227
1
    }
228
2.70k
    break;
229
2.70k
  }
230
44
  case 'o': {
231
44
    bool Overflow = false;
232
44
    if (ThisTokBuf == ThisTokEnd || *ThisTokBuf != '{') {
233
0
      HadError = true;
234
0
      if (Diags)
235
0
        Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
236
0
             diag::err_delimited_escape_missing_brace);
237
238
0
      break;
239
0
    }
240
44
    ResultChar = 0;
241
44
    Delimited = true;
242
44
    ++ThisTokBuf;
243
44
    if (*ThisTokBuf == '}') {
244
4
      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
245
4
           diag::err_delimited_escape_empty);
246
4
      return ResultChar;
247
4
    }
248
249
208
    
while (40
ThisTokBuf != ThisTokEnd) {
250
196
      if (*ThisTokBuf == '}') {
251
28
        EndDelimiterFound = true;
252
28
        ThisTokBuf++;
253
28
        break;
254
28
      }
255
168
      if (*ThisTokBuf < '0' || 
*ThisTokBuf > '7'164
) {
256
24
        HadError = true;
257
24
        if (Diags)
258
24
          Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
259
24
               diag::err_delimited_escape_invalid)
260
24
              << StringRef(ThisTokBuf, 1);
261
24
        ThisTokBuf++;
262
24
        continue;
263
24
      }
264
144
      if (ResultChar & 0x020000000)
265
2
        Overflow = true;
266
267
144
      ResultChar <<= 3;
268
144
      ResultChar |= *ThisTokBuf++ - '0';
269
144
    }
270
    // Check for overflow.  Reject '\777', but not L'\777'.
271
40
    if (!HadError &&
272
40
        
(24
Overflow24
||
(22
CharWidth != 3222
&&
(ResultChar >> CharWidth) != 020
))) {
273
8
      HadError = true;
274
8
      if (Diags)
275
8
        Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
276
8
             diag::err_escape_too_large)
277
8
            << 1;
278
8
      ResultChar &= ~0U >> (32 - CharWidth);
279
8
    }
280
40
    break;
281
44
  }
282
    // Otherwise, these are not valid escapes.
283
36
  
case '(': 5
case '{': 10
case '[': 15
case '%':
284
    // GCC accepts these as extensions.  We warn about them as such though.
285
36
    if (Diags)
286
24
      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
287
24
           diag::ext_nonstandard_escape)
288
24
        << std::string(1, ResultChar);
289
36
    break;
290
10
  default:
291
10
    if (!Diags)
292
0
      break;
293
294
10
    if (isPrintable(ResultChar))
295
8
      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
296
8
           diag::ext_unknown_escape)
297
8
        << std::string(1, ResultChar);
298
2
    else
299
2
      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
300
2
           diag::ext_unknown_escape)
301
2
        << "x" + llvm::utohexstr(ResultChar);
302
10
    break;
303
104k
  }
304
305
104k
  if (Delimited && 
Diags92
) {
306
92
    if (!EndDelimiterFound)
307
28
      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
308
28
           diag::err_expected)
309
28
          << tok::r_brace;
310
64
    else if (!HadError) {
311
20
      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
312
20
           diag::ext_delimited_escape_sequence);
313
20
    }
314
92
  }
315
316
104k
  return ResultChar;
317
104k
}
318
319
static void appendCodePoint(unsigned Codepoint,
320
184
                            llvm::SmallVectorImpl<char> &Str) {
321
184
  char ResultBuf[4];
322
184
  char *ResultPtr = ResultBuf;
323
184
  bool Res = llvm::ConvertCodePointToUTF8(Codepoint, ResultPtr);
324
184
  (void)Res;
325
184
  assert(Res && "Unexpected conversion failure");
326
0
  Str.append(ResultBuf, ResultPtr);
327
184
}
328
329
985
/// Copy \p Input into \p Buf, replacing each universal character name with
/// the UTF-8 encoding of the code point it denotes.
///
/// Three spellings are recognised: \\uXXXX (four hex digits), \\UXXXXXXXX
/// (eight hex digits) and the braced form \\u{X...}. \p Input is assumed to be
/// well formed: every backslash starts a UCN with valid hex digits and braces
/// are closed. Violations are only checked by assertions.
void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
  StringRef::iterator I = Input.begin();
  const StringRef::iterator E = Input.end();

  while (I != E) {
    // Anything that is not a backslash is copied through unchanged.
    if (*I != '\\') {
      Buf.push_back(*I);
      ++I;
      continue;
    }

    // Step over the backslash, then read and consume the 'u' / 'U'.
    ++I;
    char Kind = *I;
    ++I;

    assert(Kind == 'u' || Kind == 'U');
    uint32_t CodePoint = 0;

    // Braced form: accumulate digits up to the closing brace.
    if (Kind == 'u' && *I == '{') {
      for (++I; *I != '}'; ++I) {
        unsigned Value = llvm::hexDigitValue(*I);
        assert(Value != -1U);
        CodePoint = (CodePoint << 4) + Value;
      }
      appendCodePoint(CodePoint, Buf);
      ++I; // Step over the '}'.
      continue;
    }

    // Fixed-width form: exactly four digits for \u, eight for \U.
    unsigned NumHexDigits = (Kind == 'u') ? 4 : 8;

    assert(I + NumHexDigits <= E);

    while (NumHexDigits != 0) {
      unsigned Value = llvm::hexDigitValue(*I);
      assert(Value != -1U);

      CodePoint = (CodePoint << 4) + Value;
      ++I;
      --NumHexDigits;
    }

    // I already points just past the last digit, ready for the next pass.
    appendCodePoint(CodePoint, Buf);
  }
}
374
375
/// ProcessUCNEscape - Read the Universal Character Name, check constraints and
376
/// return the UTF32.
377
static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
378
                             const char *ThisTokEnd,
379
                             uint32_t &UcnVal, unsigned short &UcnLen,
380
                             FullSourceLoc Loc, DiagnosticsEngine *Diags,
381
                             const LangOptions &Features,
382
442
                             bool in_char_string_literal = false) {
383
442
  const char *UcnBegin = ThisTokBuf;
384
385
  // Skip the '\u' char's.
386
442
  ThisTokBuf += 2;
387
388
442
  bool Delimited = false;
389
442
  bool EndDelimiterFound = false;
390
442
  bool HasError = false;
391
392
442
  if (UcnBegin[1] == 'u' && 
in_char_string_literal350
&&
393
442
      
ThisTokBuf != ThisTokEnd350
&&
*ThisTokBuf == '{'349
) {
394
44
    Delimited = true;
395
44
    ThisTokBuf++;
396
398
  } else if (ThisTokBuf == ThisTokEnd || 
!isHexDigit(*ThisTokBuf)395
) {
397
4
    if (Diags)
398
4
      Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
399
4
           diag::err_hex_escape_no_digits) << StringRef(&ThisTokBuf[-1], 1);
400
4
    return false;
401
4
  }
402
438
  UcnLen = (ThisTokBuf[-1] == 'u' ? 
4305
:
8133
);
403
404
438
  bool Overflow = false;
405
438
  unsigned short Count = 0;
406
2.51k
  for (; ThisTokBuf != ThisTokEnd && 
(2.23k
Delimited2.23k
||
Count != UcnLen2.04k
);
407
2.10k
       
++ThisTokBuf2.07k
) {
408
2.10k
    if (Delimited && 
*ThisTokBuf == '}'192
) {
409
32
      ++ThisTokBuf;
410
32
      EndDelimiterFound = true;
411
32
      break;
412
32
    }
413
2.07k
    int CharVal = llvm::hexDigitValue(*ThisTokBuf);
414
2.07k
    if (CharVal == -1) {
415
12
      HasError = true;
416
12
      if (!Delimited)
417
4
        break;
418
8
      if (Diags) {
419
8
        Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
420
8
             diag::err_delimited_escape_invalid)
421
8
            << StringRef(ThisTokBuf, 1);
422
8
      }
423
8
      Count++;
424
8
      continue;
425
12
    }
426
2.06k
    if (UcnVal & 0xF0000000) {
427
4
      Overflow = true;
428
4
      continue;
429
4
    }
430
2.06k
    UcnVal <<= 4;
431
2.06k
    UcnVal |= CharVal;
432
2.06k
    Count++;
433
2.06k
  }
434
435
438
  if (Overflow) {
436
4
    if (Diags)
437
4
      Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
438
4
           diag::err_escape_too_large)
439
4
          << 0;
440
4
    return false;
441
4
  }
442
443
434
  if (Delimited && 
!EndDelimiterFound40
) {
444
12
    if (Diags) {
445
12
      Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
446
12
           diag::err_expected)
447
12
          << tok::r_brace;
448
12
    }
449
12
    return false;
450
12
  }
451
452
  // If we didn't consume the proper number of digits, there is a problem.
453
422
  if (Count == 0 || 
(418
!Delimited418
&&
Count != UcnLen394
)) {
454
10
    if (Diags)
455
10
      Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
456
10
           Delimited ? 
diag::err_delimited_escape_empty4
457
10
                     : 
diag::err_ucn_escape_incomplete6
);
458
10
    return false;
459
10
  }
460
461
412
  if (HasError)
462
8
    return false;
463
464
  // Check UCN constraints (C99 6.4.3p2) [C++11 lex.charset p2]
465
404
  if ((0xD800 <= UcnVal && 
UcnVal <= 0xDFFF116
) || // surrogate codepoints
466
404
      
UcnVal > 0x10FFFF388
) { // maximum legal UTF32 value
467
19
    if (Diags)
468
19
      Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
469
19
           diag::err_ucn_escape_invalid);
470
19
    return false;
471
19
  }
472
473
  // C++11 allows UCNs that refer to control characters and basic source
474
  // characters inside character and string literals
475
385
  if (UcnVal < 0xa0 &&
476
385
      
(121
UcnVal != 0x24121
&&
UcnVal != 0x40117
&&
UcnVal != 0x60113
)) { // $, @, `
477
109
    bool IsError = (!Features.CPlusPlus11 || 
!in_char_string_literal74
);
478
109
    if (Diags) {
479
109
      char BasicSCSChar = UcnVal;
480
109
      if (UcnVal >= 0x20 && 
UcnVal < 0x7f81
)
481
50
        Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
482
50
             IsError ? 
diag::err_ucn_escape_basic_scs18
:
483
50
                       
diag::warn_cxx98_compat_literal_ucn_escape_basic_scs32
)
484
50
            << StringRef(&BasicSCSChar, 1);
485
59
      else
486
59
        Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
487
59
             IsError ? 
diag::err_ucn_control_character17
:
488
59
                       
diag::warn_cxx98_compat_literal_ucn_control_character42
);
489
109
    }
490
109
    if (IsError)
491
35
      return false;
492
109
  }
493
494
350
  if (!Features.CPlusPlus && 
!Features.C99125
&&
Diags2
)
495
2
    Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
496
2
         diag::warn_ucn_not_valid_in_c89_literal);
497
498
350
  if (Delimited && 
Diags14
)
499
14
    Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
500
14
         diag::ext_delimited_escape_sequence);
501
502
350
  return true;
503
385
}
504
505
/// MeasureUCNEscape - Determine the number of bytes within the resulting string
506
/// which this UCN will occupy.
507
static int MeasureUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
508
                            const char *ThisTokEnd, unsigned CharByteWidth,
509
6
                            const LangOptions &Features, bool &HadError) {
510
  // UTF-32: 4 bytes per escape.
511
6
  if (CharByteWidth == 4)
512
0
    return 4;
513
514
6
  uint32_t UcnVal = 0;
515
6
  unsigned short UcnLen = 0;
516
6
  FullSourceLoc Loc;
517
518
6
  if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,
519
6
                        UcnLen, Loc, nullptr, Features, true)) {
520
0
    HadError = true;
521
0
    return 0;
522
0
  }
523
524
  // UTF-16: 2 bytes for BMP, 4 bytes otherwise.
525
6
  if (CharByteWidth == 2)
526
0
    return UcnVal <= 0xFFFF ? 2 : 4;
527
528
  // UTF-8.
529
6
  if (UcnVal < 0x80)
530
0
    return 1;
531
6
  if (UcnVal < 0x800)
532
0
    return 2;
533
6
  if (UcnVal < 0x10000)
534
3
    return 3;
535
3
  return 4;
536
6
}
537
538
/// EncodeUCNEscape - Read the Universal Character Name, check constraints and
539
/// convert the UTF32 to UTF8 or UTF16. This is a subroutine of
540
/// StringLiteralParser. When we decide to implement UCN's for identifiers,
541
/// we will likely rework our support for UCN's.
542
static void EncodeUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
543
                            const char *ThisTokEnd,
544
                            char *&ResultBuf, bool &HadError,
545
                            FullSourceLoc Loc, unsigned CharByteWidth,
546
                            DiagnosticsEngine *Diags,
547
317
                            const LangOptions &Features) {
548
317
  typedef uint32_t UTF32;
549
317
  UTF32 UcnVal = 0;
550
317
  unsigned short UcnLen = 0;
551
317
  if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal, UcnLen,
552
317
                        Loc, Diags, Features, true)) {
553
57
    HadError = true;
554
57
    return;
555
57
  }
556
557
260
  assert((CharByteWidth == 1 || CharByteWidth == 2 || CharByteWidth == 4) &&
558
260
         "only character widths of 1, 2, or 4 bytes supported");
559
560
0
  (void)UcnLen;
561
260
  assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported");
562
563
260
  if (CharByteWidth == 4) {
564
    // FIXME: Make the type of the result buffer correct instead of
565
    // using reinterpret_cast.
566
73
    llvm::UTF32 *ResultPtr = reinterpret_cast<llvm::UTF32*>(ResultBuf);
567
73
    *ResultPtr = UcnVal;
568
73
    ResultBuf += 4;
569
73
    return;
570
73
  }
571
572
187
  if (CharByteWidth == 2) {
573
    // FIXME: Make the type of the result buffer correct instead of
574
    // using reinterpret_cast.
575
54
    llvm::UTF16 *ResultPtr = reinterpret_cast<llvm::UTF16*>(ResultBuf);
576
577
54
    if (UcnVal <= (UTF32)0xFFFF) {
578
40
      *ResultPtr = UcnVal;
579
40
      ResultBuf += 2;
580
40
      return;
581
40
    }
582
583
    // Convert to UTF16.
584
14
    UcnVal -= 0x10000;
585
14
    *ResultPtr     = 0xD800 + (UcnVal >> 10);
586
14
    *(ResultPtr+1) = 0xDC00 + (UcnVal & 0x3FF);
587
14
    ResultBuf += 4;
588
14
    return;
589
54
  }
590
591
133
  assert(CharByteWidth == 1 && "UTF-8 encoding is only for 1 byte characters");
592
593
  // Now that we've parsed/checked the UCN, we convert from UTF32->UTF8.
594
  // The conversion below was inspired by:
595
  //   http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
596
  // First, we determine how many bytes the result will require.
597
0
  typedef uint8_t UTF8;
598
599
133
  unsigned short bytesToWrite = 0;
600
133
  if (UcnVal < (UTF32)0x80)
601
28
    bytesToWrite = 1;
602
105
  else if (UcnVal < (UTF32)0x800)
603
15
    bytesToWrite = 2;
604
90
  else if (UcnVal < (UTF32)0x10000)
605
66
    bytesToWrite = 3;
606
24
  else
607
24
    bytesToWrite = 4;
608
609
133
  const unsigned byteMask = 0xBF;
610
133
  const unsigned byteMark = 0x80;
611
612
  // Once the bits are split out into bytes of UTF8, this is a mask OR-ed
613
  // into the first byte, depending on how many bytes follow.
614
133
  static const UTF8 firstByteMark[5] = {
615
133
    0x00, 0x00, 0xC0, 0xE0, 0xF0
616
133
  };
617
  // Finally, we write the bytes into ResultBuf.
618
133
  ResultBuf += bytesToWrite;
619
133
  switch (bytesToWrite) { // note: everything falls through.
620
24
  case 4:
621
24
    *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
622
24
    LLVM_FALLTHROUGH;
623
90
  case 3:
624
90
    *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
625
90
    LLVM_FALLTHROUGH;
626
105
  case 2:
627
105
    *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
628
105
    LLVM_FALLTHROUGH;
629
133
  case 1:
630
133
    *--ResultBuf = (UTF8) (UcnVal | firstByteMark[bytesToWrite]);
631
133
  }
632
  // Update the buffer.
633
133
  ResultBuf += bytesToWrite;
634
133
}
635
636
///       integer-constant: [C99 6.4.4.1]
637
///         decimal-constant integer-suffix
638
///         octal-constant integer-suffix
639
///         hexadecimal-constant integer-suffix
640
///         binary-literal integer-suffix [GNU, C++1y]
641
///       user-defined-integer-literal: [C++11 lex.ext]
642
///         decimal-literal ud-suffix
643
///         octal-literal ud-suffix
644
///         hexadecimal-literal ud-suffix
645
///         binary-literal ud-suffix [GNU, C++1y]
646
///       decimal-constant:
647
///         nonzero-digit
648
///         decimal-constant digit
649
///       octal-constant:
650
///         0
651
///         octal-constant octal-digit
652
///       hexadecimal-constant:
653
///         hexadecimal-prefix hexadecimal-digit
654
///         hexadecimal-constant hexadecimal-digit
655
///       hexadecimal-prefix: one of
656
///         0x 0X
657
///       binary-literal:
658
///         0b binary-digit
659
///         0B binary-digit
660
///         binary-literal binary-digit
661
///       integer-suffix:
662
///         unsigned-suffix [long-suffix]
663
///         unsigned-suffix [long-long-suffix]
664
///         long-suffix [unsigned-suffix]
665
///         long-long-suffix [unsigned-suffix]
666
///       nonzero-digit:
667
///         1 2 3 4 5 6 7 8 9
668
///       octal-digit:
669
///         0 1 2 3 4 5 6 7
670
///       hexadecimal-digit:
671
///         0 1 2 3 4 5 6 7 8 9
672
///         a b c d e f
673
///         A B C D E F
674
///       binary-digit:
675
///         0
676
///         1
677
///       unsigned-suffix: one of
678
///         u U
679
///       long-suffix: one of
680
///         l L
681
///       long-long-suffix: one of
682
///         ll LL
683
///
684
///       floating-constant: [C99 6.4.4.2]
685
///         TODO: add rules...
686
///
687
NumericLiteralParser::NumericLiteralParser(StringRef TokSpelling,
688
                                           SourceLocation TokLoc,
689
                                           const SourceManager &SM,
690
                                           const LangOptions &LangOpts,
691
                                           const TargetInfo &Target,
692
                                           DiagnosticsEngine &Diags)
693
    : SM(SM), LangOpts(LangOpts), Diags(Diags),
694
6.26M
      ThisTokBegin(TokSpelling.begin()), ThisTokEnd(TokSpelling.end()) {
695
696
6.26M
  s = DigitsBegin = ThisTokBegin;
697
6.26M
  saw_exponent = false;
698
6.26M
  saw_period = false;
699
6.26M
  saw_ud_suffix = false;
700
6.26M
  saw_fixed_point_suffix = false;
701
6.26M
  isLong = false;
702
6.26M
  isUnsigned = false;
703
6.26M
  isLongLong = false;
704
6.26M
  isSizeT = false;
705
6.26M
  isHalf = false;
706
6.26M
  isFloat = false;
707
6.26M
  isImaginary = false;
708
6.26M
  isFloat16 = false;
709
6.26M
  isFloat128 = false;
710
6.26M
  MicrosoftInteger = 0;
711
6.26M
  isFract = false;
712
6.26M
  isAccum = false;
713
6.26M
  hadError = false;
714
715
  // This routine assumes that the range begin/end matches the regex for integer
716
  // and FP constants (specifically, the 'pp-number' regex), and assumes that
717
  // the byte at "*end" is both valid and not part of the regex.  Because of
718
  // this, it doesn't have to check for 'overscan' in various places.
719
6.26M
  if (isPreprocessingNumberBody(*ThisTokEnd)) {
720
0
    Diags.Report(TokLoc, diag::err_lexing_numeric);
721
0
    hadError = true;
722
0
    return;
723
0
  }
724
725
6.26M
  if (*s == '0') { // parse radix
726
1.09M
    ParseNumberStartingWithZero(TokLoc);
727
1.09M
    if (hadError)
728
45
      return;
729
5.17M
  } else { // the first digit is non-zero
730
5.17M
    radix = 10;
731
5.17M
    s = SkipDigits(s);
732
5.17M
    if (s == ThisTokEnd) {
733
      // Done.
734
4.74M
    } else {
735
434k
      ParseDecimalOrOctalCommon(TokLoc);
736
434k
      if (hadError)
737
11
        return;
738
434k
    }
739
5.17M
  }
740
741
6.26M
  SuffixBegin = s;
742
6.26M
  checkSeparator(TokLoc, s, CSK_AfterDigits);
743
744
  // Initial scan to lookahead for fixed point suffix.
745
6.26M
  if (LangOpts.FixedPoint) {
746
1.81k
    for (const char *c = s; c != ThisTokEnd; 
++c886
) {
747
1.74k
      if (*c == 'r' || 
*c == 'k'1.52k
||
*c == 'R'886
||
*c == 'K'886
) {
748
858
        saw_fixed_point_suffix = true;
749
858
        break;
750
858
      }
751
1.74k
    }
752
928
  }
753
754
  // Parse the suffix.  At this point we can classify whether we have an FP or
755
  // integer constant.
756
6.26M
  bool isFixedPointConstant = isFixedPointLiteral();
757
6.26M
  bool isFPConstant = isFloatingLiteral();
758
6.26M
  bool HasSize = false;
759
760
  // Loop over all of the characters of the suffix.  If we see something bad,
761
  // we break out of the loop.
762
6.82M
  for (; s != ThisTokEnd; 
++s560k
) {
763
560k
    switch (*s) {
764
0
    case 'R':
765
224
    case 'r':
766
224
      if (!LangOpts.FixedPoint)
767
6
        break;
768
218
      if (isFract || 
isAccum217
)
break1
;
769
217
      if (!(saw_period || 
saw_exponent20
))
break14
;
770
203
      isFract = true;
771
203
      continue;
772
0
    case 'K':
773
632
    case 'k':
774
632
      if (!LangOpts.FixedPoint)
775
6
        break;
776
626
      if (isFract || 
isAccum625
)
break2
;
777
624
      if (!(saw_period || 
saw_exponent43
))
break16
;
778
608
      isAccum = true;
779
608
      continue;
780
514
    case 'h':      // FP Suffix for "half".
781
516
    case 'H':
782
      // OpenCL Extension v1.2 s9.5 - h or H suffix for half type.
783
516
      if (!(LangOpts.Half || 
LangOpts.FixedPoint495
))
784
3
        break;
785
513
      if (isIntegerLiteral()) 
break13
; // Error for integer constant.
786
500
      if (HasSize)
787
3
        break;
788
497
      HasSize = true;
789
497
      isHalf = true;
790
497
      continue;  // Success.
791
4.10k
    case 'f':      // FP Suffix for "float"
792
7.44k
    case 'F':
793
7.44k
      if (!isFPConstant) 
break4
; // Error for integer constant.
794
7.44k
      if (HasSize)
795
1
        break;
796
7.44k
      HasSize = true;
797
798
      // CUDA host and device may have different _Float16 support, therefore
799
      // allows f16 literals to avoid false alarm.
800
      // ToDo: more precise check for CUDA.
801
7.44k
      if ((Target.hasFloat16Type() || 
LangOpts.CUDA6.92k
) &&
s + 2 < ThisTokEnd539
&&
802
7.44k
          
s[1] == '1'154
&&
s[2] == '6'154
) {
803
154
        s += 2; // success, eat up 2 characters.
804
154
        isFloat16 = true;
805
154
        continue;
806
154
      }
807
808
7.29k
      isFloat = true;
809
7.29k
      continue;  // Success.
810
171
    case 'q':    // FP Suffix for "__float128"
811
171
    case 'Q':
812
171
      if (!isFPConstant) 
break1
; // Error for integer constant.
813
170
      if (HasSize)
814
0
        break;
815
170
      HasSize = true;
816
170
      isFloat128 = true;
817
170
      continue;  // Success.
818
17.4k
    case 'u':
819
106k
    case 'U':
820
106k
      if (isFPConstant) 
break8
; // Error for floating constant.
821
106k
      if (isUnsigned) 
break0
; // Cannot be repeated.
822
106k
      isUnsigned = true;
823
106k
      continue;  // Success.
824
5.87k
    case 'l':
825
444k
    case 'L':
826
444k
      if (HasSize)
827
16
        break;
828
444k
      HasSize = true;
829
830
      // Check for long long.  The L's need to be adjacent and the same case.
831
444k
      if (s[1] == s[0]) {
832
44.1k
        assert(s + 1 < ThisTokEnd && "didn't maximally munch?");
833
44.1k
        if (isFPConstant) 
break0
; // long long invalid for floats.
834
44.1k
        isLongLong = true;
835
44.1k
        ++s;  // Eat both of them.
836
399k
      } else {
837
399k
        isLong = true;
838
399k
      }
839
444k
      continue; // Success.
840
444k
    case 'z':
841
150
    case 'Z':
842
150
      if (isFPConstant)
843
12
        break; // Invalid for floats.
844
138
      if (HasSize)
845
20
        break;
846
118
      HasSize = true;
847
118
      isSizeT = true;
848
118
      continue;
849
307
    case 'i':
850
312
    case 'I':
851
312
      if (LangOpts.MicrosoftExt && 
!isFPConstant71
) {
852
        // Allow i8, i16, i32, and i64. First, look ahead and check if
853
        // suffixes are Microsoft integers and not the imaginary unit.
854
63
        uint8_t Bits = 0;
855
63
        size_t ToSkip = 0;
856
63
        switch (s[1]) {
857
10
        case '8': // i8 suffix
858
10
          Bits = 8;
859
10
          ToSkip = 2;
860
10
          break;
861
9
        case '1':
862
9
          if (s[2] == '6') { // i16 suffix
863
9
            Bits = 16;
864
9
            ToSkip = 3;
865
9
          }
866
9
          break;
867
9
        case '3':
868
9
          if (s[2] == '2') { // i32 suffix
869
9
            Bits = 32;
870
9
            ToSkip = 3;
871
9
          }
872
9
          break;
873
32
        case '6':
874
32
          if (s[2] == '4') { // i64 suffix
875
32
            Bits = 64;
876
32
            ToSkip = 3;
877
32
          }
878
32
          break;
879
3
        default:
880
3
          break;
881
63
        }
882
63
        if (Bits) {
883
60
          if (HasSize)
884
6
            break;
885
54
          HasSize = true;
886
54
          MicrosoftInteger = Bits;
887
54
          s += ToSkip;
888
54
          assert(s <= ThisTokEnd && "didn't maximally munch?");
889
0
          break;
890
60
        }
891
63
      }
892
312
      
LLVM_FALLTHROUGH252
;252
893
378
    case 'j':
894
378
    case 'J':
895
378
      if (isImaginary) 
break0
; // Cannot be repeated.
896
378
      isImaginary = true;
897
378
      continue;  // Success.
898
560k
    }
899
    // If we reached here, there was an error or a ud-suffix.
900
428
    break;
901
560k
  }
902
903
  // "i", "if", and "il" are user-defined suffixes in C++1y.
904
6.26M
  if (s != ThisTokEnd || 
isImaginary6.26M
) {
905
    // FIXME: Don't bother expanding UCNs if !tok.hasUCN().
906
752
    expandUCNs(UDSuffixBuf, StringRef(SuffixBegin, ThisTokEnd - SuffixBegin));
907
752
    if (isValidUDSuffix(LangOpts, UDSuffixBuf)) {
908
247
      if (!isImaginary) {
909
        // Any suffix pieces we might have parsed are actually part of the
910
        // ud-suffix.
911
201
        isLong = false;
912
201
        isUnsigned = false;
913
201
        isLongLong = false;
914
201
        isSizeT = false;
915
201
        isFloat = false;
916
201
        isFloat16 = false;
917
201
        isHalf = false;
918
201
        isImaginary = false;
919
201
        MicrosoftInteger = 0;
920
201
        saw_fixed_point_suffix = false;
921
201
        isFract = false;
922
201
        isAccum = false;
923
201
      }
924
925
247
      saw_ud_suffix = true;
926
247
      return;
927
247
    }
928
929
505
    if (s != ThisTokEnd) {
930
      // Report an error if there are any.
931
173
      Diags.Report(Lexer::AdvanceToTokenCharacter(
932
173
                       TokLoc, SuffixBegin - ThisTokBegin, SM, LangOpts),
933
173
                   diag::err_invalid_suffix_constant)
934
173
          << StringRef(SuffixBegin, ThisTokEnd - SuffixBegin)
935
173
          << (isFixedPointConstant ? 
27
:
isFPConstant166
);
936
173
      hadError = true;
937
173
    }
938
505
  }
939
940
6.26M
  if (!hadError && 
saw_fixed_point_suffix6.26M
) {
941
808
    assert(isFract || isAccum);
942
808
  }
943
6.26M
}
944
945
/// ParseDecimalOrOctalCommon - This method is called for decimal or octal
946
/// numbers. It issues an error for illegal digits, and handles floating point
947
/// parsing. If it detects a floating point number, the radix is set to 10.
948
458k
void NumericLiteralParser::ParseDecimalOrOctalCommon(SourceLocation TokLoc){
949
458k
  assert((radix == 8 || radix == 10) && "Unexpected radix");
950
951
  // If we have a hex digit other than 'e' (which denotes a FP exponent) then
952
  // the code is using an incorrect base.
953
458k
  if (isHexDigit(*s) && 
*s != 'e'366
&&
*s != 'E'45
&&
954
458k
      
!isValidUDSuffix(LangOpts, StringRef(s, ThisTokEnd - s))12
) {
955
10
    Diags.Report(
956
10
        Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM, LangOpts),
957
10
        diag::err_invalid_digit)
958
10
        << StringRef(s, 1) << (radix == 8 ? 
19
:
01
);
959
10
    hadError = true;
960
10
    return;
961
10
  }
962
963
458k
  if (*s == '.') {
964
38.7k
    checkSeparator(TokLoc, s, CSK_AfterDigits);
965
38.7k
    s++;
966
38.7k
    radix = 10;
967
38.7k
    saw_period = true;
968
38.7k
    checkSeparator(TokLoc, s, CSK_BeforeDigits);
969
38.7k
    s = SkipDigits(s); // Skip suffix.
970
38.7k
  }
971
458k
  if (*s == 'e' || 
*s == 'E'450k
) { // exponent
972
8.07k
    checkSeparator(TokLoc, s, CSK_AfterDigits);
973
8.07k
    const char *Exponent = s;
974
8.07k
    s++;
975
8.07k
    radix = 10;
976
8.07k
    saw_exponent = true;
977
8.07k
    if (s != ThisTokEnd && 
(8.07k
*s == '+'8.07k
||
*s == '-'6.63k
))
s++7.70k
; // sign
978
8.07k
    const char *first_non_digit = SkipDigits(s);
979
8.07k
    if (containsDigits(s, first_non_digit)) {
980
8.06k
      checkSeparator(TokLoc, s, CSK_BeforeDigits);
981
8.06k
      s = first_non_digit;
982
8.06k
    } else {
983
6
      if (!hadError) {
984
4
        Diags.Report(Lexer::AdvanceToTokenCharacter(
985
4
                         TokLoc, Exponent - ThisTokBegin, SM, LangOpts),
986
4
                     diag::err_exponent_has_no_digits);
987
4
        hadError = true;
988
4
      }
989
6
      return;
990
6
    }
991
8.07k
  }
992
458k
}
993
994
/// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved
995
/// suffixes as ud-suffixes, because the diagnostic experience is better if we
996
/// treat it as an invalid suffix.
997
bool NumericLiteralParser::isValidUDSuffix(const LangOptions &LangOpts,
998
1.35k
                                           StringRef Suffix) {
999
1.35k
  if (!LangOpts.CPlusPlus11 || 
Suffix.empty()1.08k
)
1000
271
    return false;
1001
1002
  // By C++11 [lex.ext]p10, ud-suffixes starting with an '_' are always valid.
1003
1.08k
  if (Suffix[0] == '_')
1004
187
    return true;
1005
1006
  // In C++11, there are no library suffixes.
1007
894
  if (!LangOpts.CPlusPlus14)
1008
36
    return false;
1009
1010
  // In C++14, "s", "h", "min", "ms", "us", and "ns" are used in the library.
1011
  // Per tweaked N3660, "il", "i", and "if" are also used in the library.
1012
  // In C++2a "d" and "y" are used in the library.
1013
858
  return llvm::StringSwitch<bool>(Suffix)
1014
858
      .Cases("h", "min", "s", true)
1015
858
      .Cases("ms", "us", "ns", true)
1016
858
      .Cases("il", "i", "if", true)
1017
858
      .Cases("d", "y", LangOpts.CPlusPlus20)
1018
858
      .Default(false);
1019
894
}
1020
1021
void NumericLiteralParser::checkSeparator(SourceLocation TokLoc,
1022
                                          const char *Pos,
1023
6.36M
                                          CheckSeparatorKind IsAfterDigits) {
1024
6.36M
  if (IsAfterDigits == CSK_AfterDigits) {
1025
6.31M
    if (Pos == ThisTokBegin)
1026
520
      return;
1027
6.31M
    --Pos;
1028
6.31M
  } else 
if (47.1k
Pos == ThisTokEnd47.1k
)
1029
568
    return;
1030
1031
6.36M
  if (isDigitSeparator(*Pos)) {
1032
36
    Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, Pos - ThisTokBegin, SM,
1033
36
                                                LangOpts),
1034
36
                 diag::err_digit_separator_not_between_digits)
1035
36
        << IsAfterDigits;
1036
36
    hadError = true;
1037
36
  }
1038
6.36M
}
1039
1040
/// ParseNumberStartingWithZero - This method is called when the first character
1041
/// of the number is found to be a zero.  This means it is either an octal
1042
/// number (like '04') or a hex number ('0x123a') a binary number ('0b1010') or
1043
/// a floating point number (01239.123e4).  Eat the prefix, determining the
1044
/// radix etc.
1045
1.09M
void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) {
1046
1.09M
  assert(s[0] == '0' && "Invalid method call");
1047
0
  s++;
1048
1049
1.09M
  int c1 = s[0];
1050
1051
  // Handle a hex number like 0x1234.
1052
1.09M
  if ((c1 == 'x' || 
c1 == 'X'235k
) &&
(858k
isHexDigit(s[1])858k
||
s[1] == '.'22
)) {
1053
858k
    s++;
1054
858k
    assert(s < ThisTokEnd && "didn't maximally munch?");
1055
0
    radix = 16;
1056
858k
    DigitsBegin = s;
1057
858k
    s = SkipHexDigits(s);
1058
858k
    bool HasSignificandDigits = containsDigits(DigitsBegin, s);
1059
858k
    if (s == ThisTokEnd) {
1060
      // Done.
1061
810k
    } else 
if (48.3k
*s == '.'48.3k
) {
1062
133
      s++;
1063
133
      saw_period = true;
1064
133
      const char *floatDigitsBegin = s;
1065
133
      s = SkipHexDigits(s);
1066
133
      if (containsDigits(floatDigitsBegin, s))
1067
119
        HasSignificandDigits = true;
1068
133
      if (HasSignificandDigits)
1069
127
        checkSeparator(TokLoc, floatDigitsBegin, CSK_BeforeDigits);
1070
133
    }
1071
1072
858k
    if (!HasSignificandDigits) {
1073
6
      Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM,
1074
6
                                                  LangOpts),
1075
6
                   diag::err_hex_constant_requires)
1076
6
          << LangOpts.CPlusPlus << 1;
1077
6
      hadError = true;
1078
6
      return;
1079
6
    }
1080
1081
    // A binary exponent can appear with or with a '.'. If dotted, the
1082
    // binary exponent is required.
1083
858k
    if (*s == 'p' || 
*s == 'P'858k
) {
1084
165
      checkSeparator(TokLoc, s, CSK_AfterDigits);
1085
165
      const char *Exponent = s;
1086
165
      s++;
1087
165
      saw_exponent = true;
1088
165
      if (s != ThisTokEnd && 
(163
*s == '+'163
||
*s == '-'143
))
s++44
; // sign
1089
165
      const char *first_non_digit = SkipDigits(s);
1090
165
      if (!containsDigits(s, first_non_digit)) {
1091
4
        if (!hadError) {
1092
2
          Diags.Report(Lexer::AdvanceToTokenCharacter(
1093
2
                           TokLoc, Exponent - ThisTokBegin, SM, LangOpts),
1094
2
                       diag::err_exponent_has_no_digits);
1095
2
          hadError = true;
1096
2
        }
1097
4
        return;
1098
4
      }
1099
161
      checkSeparator(TokLoc, s, CSK_BeforeDigits);
1100
161
      s = first_non_digit;
1101
1102
161
      if (!LangOpts.HexFloats)
1103
45
        Diags.Report(TokLoc, LangOpts.CPlusPlus
1104
45
                                 ? 
diag::ext_hex_literal_invalid42
1105
45
                                 : 
diag::ext_hex_constant_invalid3
);
1106
116
      else if (LangOpts.CPlusPlus17)
1107
31
        Diags.Report(TokLoc, diag::warn_cxx17_hex_literal);
1108
858k
    } else if (saw_period) {
1109
2
      Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM,
1110
2
                                                  LangOpts),
1111
2
                   diag::err_hex_constant_requires)
1112
2
          << LangOpts.CPlusPlus << 0;
1113
2
      hadError = true;
1114
2
    }
1115
858k
    return;
1116
858k
  }
1117
1118
  // Handle simple binary numbers 0b01010
1119
235k
  if ((c1 == 'b' || 
c1 == 'B'235k
) &&
(86
s[1] == '0'86
||
s[1] == '1'64
)) {
1120
    // 0b101010 is a C++1y / GCC extension.
1121
81
    Diags.Report(TokLoc, LangOpts.CPlusPlus14
1122
81
                             ? 
diag::warn_cxx11_compat_binary_literal49
1123
81
                         : 
LangOpts.CPlusPlus32
?
diag::ext_binary_literal_cxx1415
1124
32
                                              : 
diag::ext_binary_literal17
);
1125
81
    ++s;
1126
81
    assert(s < ThisTokEnd && "didn't maximally munch?");
1127
0
    radix = 2;
1128
81
    DigitsBegin = s;
1129
81
    s = SkipBinaryDigits(s);
1130
81
    if (s == ThisTokEnd) {
1131
      // Done.
1132
62
    } else 
if (19
isHexDigit(*s)19
&&
1133
19
               
!isValidUDSuffix(LangOpts, StringRef(s, ThisTokEnd - s))3
) {
1134
2
      Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM,
1135
2
                                                  LangOpts),
1136
2
                   diag::err_invalid_digit)
1137
2
          << StringRef(s, 1) << 2;
1138
2
      hadError = true;
1139
2
    }
1140
    // Other suffixes will be diagnosed by the caller.
1141
81
    return;
1142
81
  }
1143
1144
  // For now, the radix is set to 8. If we discover that we have a
1145
  // floating point constant, the radix will change to 10. Octal floating
1146
  // point constants are not permitted (only decimal and hexadecimal).
1147
235k
  radix = 8;
1148
235k
  DigitsBegin = s;
1149
235k
  s = SkipOctalDigits(s);
1150
235k
  if (s == ThisTokEnd)
1151
211k
    return; // Done, simple octal number like 01234
1152
1153
  // If we have some other non-octal digit that *is* a decimal digit, see if
1154
  // this is part of a floating point number like 094.123 or 09e1.
1155
24.0k
  if (isDigit(*s)) {
1156
3
    const char *EndDecimal = SkipDigits(s);
1157
3
    if (EndDecimal[0] == '.' || 
EndDecimal[0] == 'e'2
||
EndDecimal[0] == 'E'2
) {
1158
1
      s = EndDecimal;
1159
1
      radix = 10;
1160
1
    }
1161
3
  }
1162
1163
24.0k
  ParseDecimalOrOctalCommon(TokLoc);
1164
24.0k
}
1165
1166
6.22M
static bool alwaysFitsInto64Bits(unsigned Radix, unsigned NumDigits) {
1167
6.22M
  switch (Radix) {
1168
78
  case 2:
1169
78
    return NumDigits <= 64;
1170
220k
  case 8:
1171
220k
    return NumDigits <= 64 / 3; // Digits are groups of 3 bits.
1172
5.15M
  case 10:
1173
5.15M
    return NumDigits <= 19; // floor(log10(2^64))
1174
858k
  case 16:
1175
858k
    return NumDigits <= 64 / 4; // Digits are groups of 4 bits.
1176
0
  default:
1177
0
    llvm_unreachable("impossible Radix");
1178
6.22M
  }
1179
6.22M
}
1180
1181
/// GetIntegerValue - Convert this numeric literal value to an APInt that
1182
/// matches Val's input width.  If there is an overflow, set Val to the low bits
1183
/// of the result and return true.  Otherwise, return false.
1184
6.22M
bool NumericLiteralParser::GetIntegerValue(llvm::APInt &Val) {
1185
  // Fast path: Compute a conservative bound on the maximum number of
1186
  // bits per digit in this radix. If we can't possibly overflow a
1187
  // uint64 based on that bound then do the simple conversion to
1188
  // integer. This avoids the expensive overflow checking below, and
1189
  // handles the common cases that matter (small decimal integers and
1190
  // hex/octal values which don't overflow).
1191
6.22M
  const unsigned NumDigits = SuffixBegin - DigitsBegin;
1192
6.22M
  if (alwaysFitsInto64Bits(radix, NumDigits)) {
1193
6.22M
    uint64_t N = 0;
1194
25.3M
    for (const char *Ptr = DigitsBegin; Ptr != SuffixBegin; 
++Ptr19.1M
)
1195
19.1M
      if (!isDigitSeparator(*Ptr))
1196
19.1M
        N = N * radix + llvm::hexDigitValue(*Ptr);
1197
1198
    // This will truncate the value to Val's input width. Simply check
1199
    // for overflow by comparing.
1200
6.22M
    Val = N;
1201
6.22M
    return Val.getZExtValue() != N;
1202
6.22M
  }
1203
1204
192
  Val = 0;
1205
192
  const char *Ptr = DigitsBegin;
1206
1207
192
  llvm::APInt RadixVal(Val.getBitWidth(), radix);
1208
192
  llvm::APInt CharVal(Val.getBitWidth(), 0);
1209
192
  llvm::APInt OldVal = Val;
1210
1211
192
  bool OverflowOccurred = false;
1212
4.08k
  while (Ptr < SuffixBegin) {
1213
3.89k
    if (isDigitSeparator(*Ptr)) {
1214
30
      ++Ptr;
1215
30
      continue;
1216
30
    }
1217
1218
3.86k
    unsigned C = llvm::hexDigitValue(*Ptr++);
1219
1220
    // If this letter is out of bound for this radix, reject it.
1221
3.86k
    assert(C < radix && "NumericLiteralParser ctor should have rejected this");
1222
1223
0
    CharVal = C;
1224
1225
    // Add the digit to the value in the appropriate radix.  If adding in digits
1226
    // made the value smaller, then this overflowed.
1227
3.86k
    OldVal = Val;
1228
1229
    // Multiply by radix, did overflow occur on the multiply?
1230
3.86k
    Val *= RadixVal;
1231
3.86k
    OverflowOccurred |= Val.udiv(RadixVal) != OldVal;
1232
1233
    // Add value, did overflow occur on the value?
1234
    //   (a + b) ult b  <=> overflow
1235
3.86k
    Val += CharVal;
1236
3.86k
    OverflowOccurred |= Val.ult(CharVal);
1237
3.86k
  }
1238
192
  return OverflowOccurred;
1239
6.22M
}
1240
1241
/// Convert the numeric part of this literal (everything before the suffix)
/// into \p Result, rounding to nearest-even. Digit separators are stripped
/// before conversion. Returns the conversion status reported by APFloat, or
/// opInvalidOp if the conversion produced an error.
llvm::APFloat::opStatus
NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) {
  using llvm::APFloat;

  // The numeric text ends at the suffix, but never past the end of the token.
  const char *NumberEnd = std::min(SuffixBegin, ThisTokEnd);
  unsigned n = NumberEnd - ThisTokBegin;

  llvm::SmallString<16> Buffer;
  StringRef Str(ThisTokBegin, n);
  if (Str.contains('\'')) {
    // Copy the text without its digit separators.
    Buffer.reserve(n);
    for (char Ch : Str)
      if (!isDigitSeparator(Ch))
        Buffer.push_back(Ch);
    Str = Buffer;
  }

  auto StatusOrErr =
      Result.convertFromString(Str, APFloat::rmNearestTiesToEven);
  assert(StatusOrErr && "Invalid floating point representation");
  if (errorToBool(StatusOrErr.takeError()))
    return APFloat::opInvalidOp;
  return *StatusOrErr;
}
1262
1263
270
static inline bool IsExponentPart(char c) {
1264
270
  return c == 'p' || 
c == 'P'232
||
c == 'e'228
||
c == 'E'207
;
1265
270
}
1266
1267
808
bool NumericLiteralParser::GetFixedPointValue(llvm::APInt &StoreVal, unsigned Scale) {
1268
808
  assert(radix == 16 || radix == 10);
1269
1270
  // Find how many digits are needed to store the whole literal.
1271
0
  unsigned NumDigits = SuffixBegin - DigitsBegin;
1272
808
  if (saw_period) 
--NumDigits775
;
1273
1274
  // Initial scan of the exponent if it exists
1275
808
  bool ExpOverflowOccurred = false;
1276
808
  bool NegativeExponent = false;
1277
808
  const char *ExponentBegin;
1278
808
  uint64_t Exponent = 0;
1279
808
  int64_t BaseShift = 0;
1280
808
  if (saw_exponent) {
1281
67
    const char *Ptr = DigitsBegin;
1282
1283
270
    while (!IsExponentPart(*Ptr)) 
++Ptr203
;
1284
67
    ExponentBegin = Ptr;
1285
67
    ++Ptr;
1286
67
    NegativeExponent = *Ptr == '-';
1287
67
    if (NegativeExponent) 
++Ptr26
;
1288
1289
67
    unsigned NumExpDigits = SuffixBegin - Ptr;
1290
67
    if (alwaysFitsInto64Bits(radix, NumExpDigits)) {
1291
66
      llvm::StringRef ExpStr(Ptr, NumExpDigits);
1292
66
      llvm::APInt ExpInt(/*numBits=*/64, ExpStr, /*radix=*/10);
1293
66
      Exponent = ExpInt.getZExtValue();
1294
66
    } else {
1295
1
      ExpOverflowOccurred = true;
1296
1
    }
1297
1298
67
    if (NegativeExponent) 
BaseShift -= Exponent26
;
1299
41
    else BaseShift += Exponent;
1300
67
  }
1301
1302
  // Number of bits needed for decimal literal is
1303
  //   ceil(NumDigits * log2(10))       Integral part
1304
  // + Scale                            Fractional part
1305
  // + ceil(Exponent * log2(10))        Exponent
1306
  // --------------------------------------------------
1307
  //   ceil((NumDigits + Exponent) * log2(10)) + Scale
1308
  //
1309
  // But for simplicity in handling integers, we can round up log2(10) to 4,
1310
  // making:
1311
  // 4 * (NumDigits + Exponent) + Scale
1312
  //
1313
  // Number of digits needed for hexadecimal literal is
1314
  //   4 * NumDigits                    Integral part
1315
  // + Scale                            Fractional part
1316
  // + Exponent                         Exponent
1317
  // --------------------------------------------------
1318
  //   (4 * NumDigits) + Scale + Exponent
1319
808
  uint64_t NumBitsNeeded;
1320
808
  if (radix == 10)
1321
766
    NumBitsNeeded = 4 * (NumDigits + Exponent) + Scale;
1322
42
  else
1323
42
    NumBitsNeeded = 4 * NumDigits + Exponent + Scale;
1324
1325
808
  if (NumBitsNeeded > std::numeric_limits<unsigned>::max())
1326
0
    ExpOverflowOccurred = true;
1327
808
  llvm::APInt Val(static_cast<unsigned>(NumBitsNeeded), 0, /*isSigned=*/false);
1328
1329
808
  bool FoundDecimal = false;
1330
1331
808
  int64_t FractBaseShift = 0;
1332
808
  const char *End = saw_exponent ? 
ExponentBegin67
:
SuffixBegin741
;
1333
4.08k
  for (const char *Ptr = DigitsBegin; Ptr < End; 
++Ptr3.27k
) {
1334
3.27k
    if (*Ptr == '.') {
1335
775
      FoundDecimal = true;
1336
775
      continue;
1337
775
    }
1338
1339
    // Normal reading of an integer
1340
2.50k
    unsigned C = llvm::hexDigitValue(*Ptr);
1341
2.50k
    assert(C < radix && "NumericLiteralParser ctor should have rejected this");
1342
1343
0
    Val *= radix;
1344
2.50k
    Val += C;
1345
1346
2.50k
    if (FoundDecimal)
1347
      // Keep track of how much we will need to adjust this value by from the
1348
      // number of digits past the radix point.
1349
1.63k
      --FractBaseShift;
1350
2.50k
  }
1351
1352
  // For a radix of 16, we will be multiplying by 2 instead of 16.
1353
808
  if (radix == 16) 
FractBaseShift *= 442
;
1354
808
  BaseShift += FractBaseShift;
1355
1356
808
  Val <<= Scale;
1357
1358
808
  uint64_t Base = (radix == 16) ? 
242
:
10766
;
1359
808
  if (BaseShift > 0) {
1360
146
    for (int64_t i = 0; i < BaseShift; 
++i136
) {
1361
136
      Val *= Base;
1362
136
    }
1363
798
  } else if (BaseShift < 0) {
1364
2.80k
    for (int64_t i = BaseShift; i < 0 && 
!Val.isZero()2.05k
;
++i2.02k
)
1365
2.02k
      Val = Val.udiv(Base);
1366
781
  }
1367
1368
808
  bool IntOverflowOccurred = false;
1369
808
  auto MaxVal = llvm::APInt::getMaxValue(StoreVal.getBitWidth());
1370
808
  if (Val.getBitWidth() > StoreVal.getBitWidth()) {
1371
407
    IntOverflowOccurred |= Val.ugt(MaxVal.zext(Val.getBitWidth()));
1372
407
    StoreVal = Val.trunc(StoreVal.getBitWidth());
1373
407
  } else 
if (401
Val.getBitWidth() < StoreVal.getBitWidth()401
) {
1374
374
    IntOverflowOccurred |= Val.zext(MaxVal.getBitWidth()).ugt(MaxVal);
1375
374
    StoreVal = Val.zext(StoreVal.getBitWidth());
1376
374
  } else {
1377
27
    StoreVal = Val;
1378
27
  }
1379
1380
808
  return IntOverflowOccurred || 
ExpOverflowOccurred778
;
1381
808
}
1382
1383
/// \verbatim
1384
///       user-defined-character-literal: [C++11 lex.ext]
1385
///         character-literal ud-suffix
1386
///       ud-suffix:
1387
///         identifier
1388
///       character-literal: [C++11 lex.ccon]
1389
///         ' c-char-sequence '
1390
///         u' c-char-sequence '
1391
///         U' c-char-sequence '
1392
///         L' c-char-sequence '
1393
///         u8' c-char-sequence ' [C++1z lex.ccon]
1394
///       c-char-sequence:
1395
///         c-char
1396
///         c-char-sequence c-char
1397
///       c-char:
1398
///         any member of the source character set except the single-quote ',
1399
///           backslash \, or new-line character
1400
///         escape-sequence
1401
///         universal-character-name
1402
///       escape-sequence:
1403
///         simple-escape-sequence
1404
///         octal-escape-sequence
1405
///         hexadecimal-escape-sequence
1406
///       simple-escape-sequence:
1407
///         one of \' \" \? \\ \a \b \f \n \r \t \v
1408
///       octal-escape-sequence:
1409
///         \ octal-digit
1410
///         \ octal-digit octal-digit
1411
///         \ octal-digit octal-digit octal-digit
1412
///       hexadecimal-escape-sequence:
1413
///         \x hexadecimal-digit
1414
///         hexadecimal-escape-sequence hexadecimal-digit
1415
///       universal-character-name: [C++11 lex.charset]
1416
///         \u hex-quad
1417
///         \U hex-quad hex-quad
1418
///       hex-quad:
1419
///         hex-digit hex-digit hex-digit hex-digit
1420
/// \endverbatim
1421
///
1422
CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
1423
                                     SourceLocation Loc, Preprocessor &PP,
1424
548k
                                     tok::TokenKind kind) {
1425
  // At this point we know that the character matches the regex "(L|u|U)?'.*'".
1426
548k
  HadError = false;
1427
1428
548k
  Kind = kind;
1429
1430
548k
  const char *TokBegin = begin;
1431
1432
  // Skip over wide character determinant.
1433
548k
  if (Kind != tok::char_constant)
1434
1.47k
    ++begin;
1435
548k
  if (Kind == tok::utf8_char_constant)
1436
151
    ++begin;
1437
1438
  // Skip over the entry quote.
1439
548k
  if (begin[0] != '\'') {
1440
0
    PP.Diag(Loc, diag::err_lexing_char);
1441
0
    HadError = true;
1442
0
    return;
1443
0
  }
1444
1445
548k
  ++begin;
1446
1447
  // Remove an optional ud-suffix.
1448
548k
  if (end[-1] != '\'') {
1449
59
    const char *UDSuffixEnd = end;
1450
216
    do {
1451
216
      --end;
1452
216
    } while (end[-1] != '\'');
1453
    // FIXME: Don't bother with this if !tok.hasUCN().
1454
59
    expandUCNs(UDSuffixBuf, StringRef(end, UDSuffixEnd - end));
1455
59
    UDSuffixOffset = end - TokBegin;
1456
59
  }
1457
1458
  // Trim the ending quote.
1459
548k
  assert(end != begin && "Invalid token lexed");
1460
0
  --end;
1461
1462
  // FIXME: The "Value" is an uint64_t so we can handle char literals of
1463
  // up to 64-bits.
1464
  // FIXME: This extensively assumes that 'char' is 8-bits.
1465
548k
  assert(PP.getTargetInfo().getCharWidth() == 8 &&
1466
548k
         "Assumes char is 8 bits");
1467
0
  assert(PP.getTargetInfo().getIntWidth() <= 64 &&
1468
548k
         (PP.getTargetInfo().getIntWidth() & 7) == 0 &&
1469
548k
         "Assumes sizeof(int) on target is <= 64 and a multiple of char");
1470
0
  assert(PP.getTargetInfo().getWCharWidth() <= 64 &&
1471
548k
         "Assumes sizeof(wchar) on target is <= 64");
1472
1473
0
  SmallVector<uint32_t, 4> codepoint_buffer;
1474
548k
  codepoint_buffer.resize(end - begin);
1475
548k
  uint32_t *buffer_begin = &codepoint_buffer.front();
1476
548k
  uint32_t *buffer_end = buffer_begin + codepoint_buffer.size();
1477
1478
  // Unicode escapes representing characters that cannot be correctly
1479
  // represented in a single code unit are disallowed in character literals
1480
  // by this implementation.
1481
548k
  uint32_t largest_character_for_kind;
1482
548k
  if (tok::wide_char_constant == Kind) {
1483
1.12k
    largest_character_for_kind =
1484
1.12k
        0xFFFFFFFFu >> (32-PP.getTargetInfo().getWCharWidth());
1485
547k
  } else if (tok::utf8_char_constant == Kind) {
1486
151
    largest_character_for_kind = 0x7F;
1487
547k
  } else if (tok::utf16_char_constant == Kind) {
1488
94
    largest_character_for_kind = 0xFFFF;
1489
547k
  } else if (tok::utf32_char_constant == Kind) {
1490
108
    largest_character_for_kind = 0x10FFFF;
1491
547k
  } else {
1492
547k
    largest_character_for_kind = 0x7Fu;
1493
547k
  }
1494
1495
1.09M
  while (begin != end) {
1496
    // Is this a span of non-escape characters?
1497
549k
    if (begin[0] != '\\') {
1498
544k
      char const *start = begin;
1499
2.08M
      do {
1500
2.08M
        ++begin;
1501
2.08M
      } while (begin != end && 
*begin != '\\'1.54M
);
1502
1503
544k
      char const *tmp_in_start = start;
1504
544k
      uint32_t *tmp_out_start = buffer_begin;
1505
544k
      llvm::ConversionResult res =
1506
544k
          llvm::ConvertUTF8toUTF32(reinterpret_cast<llvm::UTF8 const **>(&start),
1507
544k
                             reinterpret_cast<llvm::UTF8 const *>(begin),
1508
544k
                             &buffer_begin, buffer_end, llvm::strictConversion);
1509
544k
      if (res != llvm::conversionOK) {
1510
        // If we see bad encoding for unprefixed character literals, warn and
1511
        // simply copy the byte values, for compatibility with gcc and
1512
        // older versions of clang.
1513
6
        bool NoErrorOnBadEncoding = isAscii();
1514
6
        unsigned Msg = diag::err_bad_character_encoding;
1515
6
        if (NoErrorOnBadEncoding)
1516
3
          Msg = diag::warn_bad_character_encoding;
1517
6
        PP.Diag(Loc, Msg);
1518
6
        if (NoErrorOnBadEncoding) {
1519
3
          start = tmp_in_start;
1520
3
          buffer_begin = tmp_out_start;
1521
7
          for (; start != begin; 
++start, ++buffer_begin4
)
1522
4
            *buffer_begin = static_cast<uint8_t>(*start);
1523
3
        } else {
1524
3
          HadError = true;
1525
3
        }
1526
544k
      } else {
1527
2.63M
        for (; tmp_out_start < buffer_begin; 
++tmp_out_start2.08M
) {
1528
2.08M
          if (*tmp_out_start > largest_character_for_kind) {
1529
13
            HadError = true;
1530
13
            PP.Diag(Loc, diag::err_character_too_large);
1531
13
          }
1532
2.08M
        }
1533
544k
      }
1534
1535
544k
      continue;
1536
544k
    }
1537
    // Is this a Universal Character Name escape?
1538
4.85k
    if (begin[1] == 'u' || 
begin[1] == 'U'4.75k
) {
1539
119
      unsigned short UcnLen = 0;
1540
119
      if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen,
1541
119
                            FullSourceLoc(Loc, PP.getSourceManager()),
1542
119
                            &PP.getDiagnostics(), PP.getLangOpts(), true)) {
1543
35
        HadError = true;
1544
84
      } else if (*buffer_begin > largest_character_for_kind) {
1545
11
        HadError = true;
1546
11
        PP.Diag(Loc, diag::err_character_too_large);
1547
11
      }
1548
1549
119
      ++buffer_begin;
1550
119
      continue;
1551
119
    }
1552
4.73k
    unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
1553
4.73k
    uint64_t result =
1554
4.73k
      ProcessCharEscape(TokBegin, begin, end, HadError,
1555
4.73k
                        FullSourceLoc(Loc,PP.getSourceManager()),
1556
4.73k
                        CharWidth, &PP.getDiagnostics(), PP.getLangOpts());
1557
4.73k
    *buffer_begin++ = result;
1558
4.73k
  }
1559
1560
548k
  unsigned NumCharsSoFar = buffer_begin - &codepoint_buffer.front();
1561
1562
548k
  if (NumCharsSoFar > 1) {
1563
515k
    if (isAscii() && 
NumCharsSoFar == 4515k
)
1564
515k
      PP.Diag(Loc, diag::warn_four_char_character_literal);
1565
37
    else if (isAscii())
1566
28
      PP.Diag(Loc, diag::warn_multichar_character_literal);
1567
9
    else {
1568
9
      PP.Diag(Loc, diag::err_multichar_character_literal) << (isWide() ? 
04
:
15
);
1569
9
      HadError = true;
1570
9
    }
1571
515k
    IsMultiChar = true;
1572
515k
  } else {
1573
33.2k
    IsMultiChar = false;
1574
33.2k
  }
1575
1576
548k
  llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), 0);
1577
1578
  // Narrow character literals act as though their value is concatenated
1579
  // in this implementation, but warn on overflow.
1580
548k
  bool multi_char_too_long = false;
1581
548k
  if (isAscii() && 
isMultiChar()547k
) {
1582
515k
    LitVal = 0;
1583
2.57M
    for (size_t i = 0; i < NumCharsSoFar; 
++i2.06M
) {
1584
      // check for enough leading zeros to shift into
1585
2.06M
      multi_char_too_long |= (LitVal.countLeadingZeros() < 8);
1586
2.06M
      LitVal <<= 8;
1587
2.06M
      LitVal = LitVal + (codepoint_buffer[i] & 0xFF);
1588
2.06M
    }
1589
515k
  } else 
if (33.2k
NumCharsSoFar > 033.2k
) {
1590
    // otherwise just take the last character
1591
33.2k
    LitVal = buffer_begin[-1];
1592
33.2k
  }
1593
1594
548k
  if (!HadError && 
multi_char_too_long548k
) {
1595
3
    PP.Diag(Loc, diag::warn_char_constant_too_large);
1596
3
  }
1597
1598
  // Transfer the value from APInt to uint64_t
1599
548k
  Value = LitVal.getZExtValue();
1600
1601
  // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")
1602
  // if 'char' is signed for this target (C99 6.4.4.4p10).  Note that multiple
1603
  // character constants are not sign extended in the this implementation:
1604
  // '\xFF\xFF' = 65536 and '\x0\xFF' = 255, which matches GCC.
1605
548k
  if (isAscii() && 
NumCharsSoFar == 1547k
&&
(Value & 128)31.8k
&&
1606
548k
      
PP.getLangOpts().CharIsSigned113
)
1607
91
    Value = (signed char)Value;
1608
548k
}
1609
1610
/// \verbatim
1611
///       string-literal: [C++0x lex.string]
1612
///         encoding-prefix " [s-char-sequence] "
1613
///         encoding-prefix R raw-string
1614
///       encoding-prefix:
1615
///         u8
1616
///         u
1617
///         U
1618
///         L
1619
///       s-char-sequence:
1620
///         s-char
1621
///         s-char-sequence s-char
1622
///       s-char:
1623
///         any member of the source character set except the double-quote ",
1624
///           backslash \, or new-line character
1625
///         escape-sequence
1626
///         universal-character-name
1627
///       raw-string:
1628
///         " d-char-sequence ( r-char-sequence ) d-char-sequence "
1629
///       r-char-sequence:
1630
///         r-char
1631
///         r-char-sequence r-char
1632
///       r-char:
1633
///         any member of the source character set, except a right parenthesis )
1634
///           followed by the initial d-char-sequence (which may be empty)
1635
///           followed by a double quote ".
1636
///       d-char-sequence:
1637
///         d-char
1638
///         d-char-sequence d-char
1639
///       d-char:
1640
///         any member of the basic source character set except:
1641
///           space, the left parenthesis (, the right parenthesis ),
1642
///           the backslash \, and the control characters representing horizontal
1643
///           tab, vertical tab, form feed, and newline.
1644
///       escape-sequence: [C++0x lex.ccon]
1645
///         simple-escape-sequence
1646
///         octal-escape-sequence
1647
///         hexadecimal-escape-sequence
1648
///       simple-escape-sequence:
1649
///         one of \' \" \? \\ \a \b \f \n \r \t \v
1650
///       octal-escape-sequence:
1651
///         \ octal-digit
1652
///         \ octal-digit octal-digit
1653
///         \ octal-digit octal-digit octal-digit
1654
///       hexadecimal-escape-sequence:
1655
///         \x hexadecimal-digit
1656
///         hexadecimal-escape-sequence hexadecimal-digit
1657
///       universal-character-name:
1658
///         \u hex-quad
1659
///         \U hex-quad hex-quad
1660
///       hex-quad:
1661
///         hex-digit hex-digit hex-digit hex-digit
1662
/// \endverbatim
1663
///
1664
StringLiteralParser::
1665
StringLiteralParser(ArrayRef<Token> StringToks,
1666
                    Preprocessor &PP)
1667
  : SM(PP.getSourceManager()), Features(PP.getLangOpts()),
1668
    Target(PP.getTargetInfo()), Diags(&PP.getDiagnostics()),
1669
    MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
1670
4.32M
    ResultPtr(ResultBuf.data()), hadError(false), Pascal(false) {
1671
4.32M
  init(StringToks);
1672
4.32M
}
1673
1674
5.88M
/// Process the adjacent string-literal tokens in \p StringToks as a single
/// concatenated literal (translation phase 6).
///
/// The first pass over the tokens computes an upper bound on the result size,
/// records the longest token, and settles the literal kind (reporting
/// err_unsupported_string_concat for mismatched prefixes).  The second pass
/// fetches each token's spelling, strips an optional ud-suffix, the encoding
/// prefix and the quotes, and appends the decoded contents at ResultPtr,
/// handling raw strings, Pascal strings, UCNs and ordinary escapes.
///
/// On any malformed token this sets hadError (via DiagnoseLexingError) and
/// returns early.  Every scan over a token's spelling is bounded by the
/// spelling itself so that a token whose text no longer looks like a string
/// literal (e.g. the file changed after a PCH was built) is diagnosed instead
/// of being read out of bounds.
void StringLiteralParser::init(ArrayRef<Token> StringToks){
  // The literal token may have come from an invalid source location (e.g. due
  // to a PCH error), in which case the token length will be 0.
  if (StringToks.empty() || StringToks[0].getLength() < 2)
    return DiagnoseLexingError(SourceLocation());

  // Scan all of the string portions, remember the max individual token length,
  // computing a bound on the concatenated string length, and see whether any
  // piece is a wide-string.  If any of the string portions is a wide-string
  // literal, the result is a wide-string literal [C99 6.4.5p4].
  assert(!StringToks.empty() && "expected at least one token");
  MaxTokenLength = StringToks[0].getLength();
  assert(StringToks[0].getLength() >= 2 && "literal token is invalid!");
  SizeBound = StringToks[0].getLength()-2;  // -2 for "".
  Kind = StringToks[0].getKind();

  hadError = false;

  // Implement Translation Phase #6: concatenation of string literals
  // (C99 5.1.1.2p1).  The common case is only one string fragment.
  for (unsigned i = 1; i != StringToks.size(); ++i) {
    if (StringToks[i].getLength() < 2)
      return DiagnoseLexingError(StringToks[i].getLocation());

    // The string could be shorter than this if it needs cleaning, but this is a
    // reasonable bound, which is all we need.
    assert(StringToks[i].getLength() >= 2 && "literal token is invalid!");
    SizeBound += StringToks[i].getLength()-2;  // -2 for "".

    // Remember maximum string piece length.
    if (StringToks[i].getLength() > MaxTokenLength)
      MaxTokenLength = StringToks[i].getLength();

    // Remember if we see any wide or utf-8/16/32 strings.
    // Also check for illegal concatenations.
    if (StringToks[i].isNot(Kind) && StringToks[i].isNot(tok::string_literal)) {
      if (isAscii()) {
        Kind = StringToks[i].getKind();
      } else {
        if (Diags)
          Diags->Report(StringToks[i].getLocation(),
                        diag::err_unsupported_string_concat);
        hadError = true;
      }
    }
  }

  // Include space for the null terminator.
  ++SizeBound;

  // TODO: K&R warning: "traditional C rejects string constant concatenation"

  // Get the width in bytes of char/wchar_t/char16_t/char32_t
  CharByteWidth = getCharWidth(Kind, Target);
  assert((CharByteWidth & 7) == 0 && "Assumes character size is byte multiple");
  CharByteWidth /= 8;

  // The output buffer size needs to be large enough to hold wide characters.
  // This is a worst-case assumption which basically corresponds to L"" "long".
  SizeBound *= CharByteWidth;

  // Size the temporary buffer to hold the result string data.
  ResultBuf.resize(SizeBound);

  // Likewise, but for each string piece.
  SmallString<512> TokenBuf;
  TokenBuf.resize(MaxTokenLength);

  // Loop over all the strings, getting their spelling, and expanding them to
  // wide strings as appropriate.
  ResultPtr = &ResultBuf[0];   // Next byte to fill in.

  Pascal = false;

  SourceLocation UDSuffixTokLoc;

  for (unsigned i = 0, e = StringToks.size(); i != e; ++i) {
    const char *ThisTokBuf = &TokenBuf[0];
    // Get the spelling of the token, which eliminates trigraphs, etc.  We know
    // that ThisTokBuf points to a buffer that is big enough for the whole token
    // and 'spelled' tokens can only shrink.
    bool StringInvalid = false;
    unsigned ThisTokLen =
      Lexer::getSpelling(StringToks[i], ThisTokBuf, SM, Features,
                         &StringInvalid);
    if (StringInvalid)
      return DiagnoseLexingError(StringToks[i].getLocation());

    // A string literal's spelling holds at least the two quotes.  Anything
    // shorter cannot be scanned safely below (ThisTokEnd[-1] would be out of
    // bounds for an empty spelling).
    if (ThisTokLen < 2)
      return DiagnoseLexingError(StringToks[i].getLocation());

    const char *ThisTokBegin = ThisTokBuf;
    const char *ThisTokEnd = ThisTokBuf+ThisTokLen;

    // Remove an optional ud-suffix.
    if (ThisTokEnd[-1] != '"') {
      const char *UDSuffixEnd = ThisTokEnd;
      do {
        --ThisTokEnd;
        // Stop before walking off the front of the spelling: there must be
        // room left for both the opening and the closing quote.
        if (ThisTokEnd - ThisTokBegin < 2)
          return DiagnoseLexingError(StringToks[i].getLocation());
      } while (ThisTokEnd[-1] != '"');

      StringRef UDSuffix(ThisTokEnd, UDSuffixEnd - ThisTokEnd);

      if (UDSuffixBuf.empty()) {
        if (StringToks[i].hasUCN())
          expandUCNs(UDSuffixBuf, UDSuffix);
        else
          UDSuffixBuf.assign(UDSuffix);
        UDSuffixToken = i;
        UDSuffixOffset = ThisTokEnd - ThisTokBuf;
        UDSuffixTokLoc = StringToks[i].getLocation();
      } else {
        SmallString<32> ExpandedUDSuffix;
        if (StringToks[i].hasUCN()) {
          expandUCNs(ExpandedUDSuffix, UDSuffix);
          UDSuffix = ExpandedUDSuffix;
        }

        // C++11 [lex.ext]p8: At the end of phase 6, if a string literal is the
        // result of a concatenation involving at least one user-defined-string-
        // literal, all the participating user-defined-string-literals shall
        // have the same ud-suffix.
        if (UDSuffixBuf != UDSuffix) {
          if (Diags) {
            SourceLocation TokLoc = StringToks[i].getLocation();
            Diags->Report(TokLoc, diag::err_string_concat_mixed_suffix)
              << UDSuffixBuf << UDSuffix
              << SourceRange(UDSuffixTokLoc, UDSuffixTokLoc)
              << SourceRange(TokLoc, TokLoc);
          }
          hadError = true;
        }
      }
    }

    // Strip the end quote.
    --ThisTokEnd;

    // TODO: Input character set mapping support.

    // Skip marker for wide or unicode strings.
    if (ThisTokBuf[0] == 'L' || ThisTokBuf[0] == 'u' || ThisTokBuf[0] == 'U') {
      ++ThisTokBuf;
      // Skip 8 of u8 marker for utf8 strings.
      if (ThisTokBuf[0] == '8')
        ++ThisTokBuf;
    }

    // Check for raw string
    if (ThisTokBuf[0] == 'R') {
      if (ThisTokBuf[1] != '"') {
        // The file may have come from PCH and then changed after loading the
        // PCH; Fail gracefully.
        return DiagnoseLexingError(StringToks[i].getLocation());
      }
      ThisTokBuf += 2; // skip R"

      // C++11 [lex.string]p2: A `d-char-sequence` shall consist of at most 16
      // characters.
      constexpr unsigned MaxRawStrDelimLen = 16;

      // Scan the delimiter, never looking beyond the closing quote.
      const char *Prefix = ThisTokBuf;
      while (ThisTokBuf < ThisTokEnd &&
             static_cast<unsigned>(ThisTokBuf - Prefix) < MaxRawStrDelimLen &&
             ThisTokBuf[0] != '(')
        ++ThisTokBuf;
      if (ThisTokBuf >= ThisTokEnd || ThisTokBuf[0] != '(')
        return DiagnoseLexingError(StringToks[i].getLocation());
      ++ThisTokBuf; // skip '('

      // Remove same number of characters from the end
      ThisTokEnd -= ThisTokBuf - Prefix;
      if (ThisTokEnd < ThisTokBuf)
        return DiagnoseLexingError(StringToks[i].getLocation());

      // C++14 [lex.string]p4: A source-file new-line in a raw string literal
      // results in a new-line in the resulting execution string-literal.
      StringRef RemainingTokenSpan(ThisTokBuf, ThisTokEnd - ThisTokBuf);
      while (!RemainingTokenSpan.empty()) {
        // Split the string literal on \r\n boundaries.
        size_t CRLFPos = RemainingTokenSpan.find("\r\n");
        StringRef BeforeCRLF = RemainingTokenSpan.substr(0, CRLFPos);
        StringRef AfterCRLF = RemainingTokenSpan.substr(CRLFPos);

        // Copy everything before the \r\n sequence into the string literal.
        if (CopyStringFragment(StringToks[i], ThisTokBegin, BeforeCRLF))
          hadError = true;

        // Point into the \n inside the \r\n sequence and operate on the
        // remaining portion of the literal.
        RemainingTokenSpan = AfterCRLF.substr(1);
      }
    } else {
      if (ThisTokBuf[0] != '"') {
        // The file may have come from PCH and then changed after loading the
        // PCH; Fail gracefully.
        return DiagnoseLexingError(StringToks[i].getLocation());
      }
      ++ThisTokBuf; // skip "

      // If the opening quote we just skipped was also the closing quote, the
      // cursor is already past the end and the decode loop below would never
      // meet ThisTokEnd.
      if (ThisTokBuf > ThisTokEnd)
        return DiagnoseLexingError(StringToks[i].getLocation());

      // Check if this is a pascal string
      if (Features.PascalStrings && ThisTokBuf + 1 != ThisTokEnd &&
          ThisTokBuf[0] == '\\' && ThisTokBuf[1] == 'p') {

        // If the \p sequence is found in the first token, we have a pascal string
        // Otherwise, if we already have a pascal string, ignore the first \p
        if (i == 0) {
          // Only the backslash is skipped: the 'p' is copied as a placeholder
          // and later overwritten with the length.
          ++ThisTokBuf;
          Pascal = true;
        } else if (Pascal)
          ThisTokBuf += 2;
      }

      while (ThisTokBuf != ThisTokEnd) {
        // Is this a span of non-escape characters?
        if (ThisTokBuf[0] != '\\') {
          const char *InStart = ThisTokBuf;
          do {
            ++ThisTokBuf;
          } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');

          // Copy the character span over.
          if (CopyStringFragment(StringToks[i], ThisTokBegin,
                                 StringRef(InStart, ThisTokBuf - InStart)))
            hadError = true;
          continue;
        }
        // Is this a Universal Character Name escape?
        if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') {
          EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
                          ResultPtr, hadError,
                          FullSourceLoc(StringToks[i].getLocation(), SM),
                          CharByteWidth, Diags, Features);
          continue;
        }
        // Otherwise, this is a non-UCN escape character.  Process it.
        unsigned ResultChar =
          ProcessCharEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError,
                            FullSourceLoc(StringToks[i].getLocation(), SM),
                            CharByteWidth*8, Diags, Features);

        if (CharByteWidth == 4) {
          // FIXME: Make the type of the result buffer correct instead of
          // using reinterpret_cast.
          llvm::UTF32 *ResultWidePtr = reinterpret_cast<llvm::UTF32*>(ResultPtr);
          *ResultWidePtr = ResultChar;
          ResultPtr += 4;
        } else if (CharByteWidth == 2) {
          // FIXME: Make the type of the result buffer correct instead of
          // using reinterpret_cast.
          llvm::UTF16 *ResultWidePtr = reinterpret_cast<llvm::UTF16*>(ResultPtr);
          *ResultWidePtr = ResultChar & 0xFFFF;
          ResultPtr += 2;
        } else {
          assert(CharByteWidth == 1 && "Unexpected char width");
          *ResultPtr++ = ResultChar & 0xFF;
        }
      }
    }
  }

  if (Pascal) {
    // Overwrite the placeholder first element with the character count.
    if (CharByteWidth == 4) {
      // FIXME: Make the type of the result buffer correct instead of
      // using reinterpret_cast.
      llvm::UTF32 *ResultWidePtr = reinterpret_cast<llvm::UTF32*>(ResultBuf.data());
      ResultWidePtr[0] = GetNumStringChars() - 1;
    } else if (CharByteWidth == 2) {
      // FIXME: Make the type of the result buffer correct instead of
      // using reinterpret_cast.
      llvm::UTF16 *ResultWidePtr = reinterpret_cast<llvm::UTF16*>(ResultBuf.data());
      ResultWidePtr[0] = GetNumStringChars() - 1;
    } else {
      assert(CharByteWidth == 1 && "Unexpected char width");
      ResultBuf[0] = GetNumStringChars() - 1;
    }

    // Verify that pascal strings aren't too large.
    if (GetStringLength() > 256) {
      if (Diags)
        Diags->Report(StringToks.front().getLocation(),
                      diag::err_pascal_string_too_long)
          << SourceRange(StringToks.front().getLocation(),
                         StringToks.back().getLocation());
      hadError = true;
      return;
    }
  } else if (Diags) {
    // Complain if this string literal has too many characters.
    unsigned MaxChars = Features.CPlusPlus? 65536 : Features.C99 ? 4095 : 509;

    if (GetNumStringChars() > MaxChars)
      Diags->Report(StringToks.front().getLocation(),
                    diag::ext_string_too_long)
        << GetNumStringChars() << MaxChars
        << (Features.CPlusPlus ? 2 : Features.C99 ? 1 : 0)
        << SourceRange(StringToks.front().getLocation(),
                       StringToks.back().getLocation());
  }
}
1970
1971
212
static const char *resyncUTF8(const char *Err, const char *End) {
1972
212
  if (Err == End)
1973
0
    return End;
1974
212
  End = Err + std::min<unsigned>(llvm::getNumBytesForUTF8(*Err), End-Err);
1975
239
  while (++Err != End && 
(*Err & 0xC0) == 0x80186
)
1976
27
    ;
1977
212
  return Err;
1978
212
}
1979
1980
/// This function copies from Fragment, which is a sequence of bytes
1981
/// within Tok's contents (which begin at TokBegin) into ResultPtr.
1982
/// Performs widening for multi-byte characters.
1983
bool StringLiteralParser::CopyStringFragment(const Token &Tok,
1984
                                             const char *TokBegin,
1985
5.97M
                                             StringRef Fragment) {
1986
5.97M
  const llvm::UTF8 *ErrorPtrTmp;
1987
5.97M
  if (ConvertUTF8toWide(CharByteWidth, Fragment, ResultPtr, ErrorPtrTmp))
1988
5.97M
    return false;
1989
1990
  // If we see bad encoding for unprefixed string literals, warn and
1991
  // simply copy the byte values, for compatibility with gcc and older
1992
  // versions of clang.
1993
50
  bool NoErrorOnBadEncoding = isAscii();
1994
50
  if (NoErrorOnBadEncoding) {
1995
20
    memcpy(ResultPtr, Fragment.data(), Fragment.size());
1996
20
    ResultPtr += Fragment.size();
1997
20
  }
1998
1999
50
  if (Diags) {
2000
34
    const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
2001
2002
34
    FullSourceLoc SourceLoc(Tok.getLocation(), SM);
2003
34
    const DiagnosticBuilder &Builder =
2004
34
      Diag(Diags, Features, SourceLoc, TokBegin,
2005
34
           ErrorPtr, resyncUTF8(ErrorPtr, Fragment.end()),
2006
34
           NoErrorOnBadEncoding ? 
diag::warn_bad_string_encoding20
2007
34
                                : 
diag::err_bad_string_encoding14
);
2008
2009
34
    const char *NextStart = resyncUTF8(ErrorPtr, Fragment.end());
2010
34
    StringRef NextFragment(NextStart, Fragment.end()-NextStart);
2011
2012
    // Decode into a dummy buffer.
2013
34
    SmallString<512> Dummy;
2014
34
    Dummy.reserve(Fragment.size() * CharByteWidth);
2015
34
    char *Ptr = Dummy.data();
2016
2017
178
    while (!ConvertUTF8toWide(CharByteWidth, NextFragment, Ptr, ErrorPtrTmp)) {
2018
144
      const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
2019
144
      NextStart = resyncUTF8(ErrorPtr, Fragment.end());
2020
144
      Builder << MakeCharSourceRange(Features, SourceLoc, TokBegin,
2021
144
                                     ErrorPtr, NextStart);
2022
144
      NextFragment = StringRef(NextStart, Fragment.end()-NextStart);
2023
144
    }
2024
34
  }
2025
50
  return !NoErrorOnBadEncoding;
2026
5.97M
}
2027
2028
0
/// Mark the literal as erroneous and, when a diagnostics engine is attached,
/// report err_lexing_string at \p Loc.
void StringLiteralParser::DiagnoseLexingError(SourceLocation Loc) {
  hadError = true;
  if (!Diags)
    return;
  Diags->Report(Loc, diag::err_lexing_string);
}
2033
2034
/// getOffsetOfStringByte - This function returns the offset of the
2035
/// specified byte of the string data represented by Token.  This handles
2036
/// advancing over escape sequences in the string.
2037
unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
2038
32.1k
                                                    unsigned ByteNo) const {
2039
  // Get the spelling of the token.
2040
32.1k
  SmallString<32> SpellingBuffer;
2041
32.1k
  SpellingBuffer.resize(Tok.getLength());
2042
2043
32.1k
  bool StringInvalid = false;
2044
32.1k
  const char *SpellingPtr = &SpellingBuffer[0];
2045
32.1k
  unsigned TokLen = Lexer::getSpelling(Tok, SpellingPtr, SM, Features,
2046
32.1k
                                       &StringInvalid);
2047
32.1k
  if (StringInvalid)
2048
0
    return 0;
2049
2050
32.1k
  const char *SpellingStart = SpellingPtr;
2051
32.1k
  const char *SpellingEnd = SpellingPtr+TokLen;
2052
2053
  // Handle UTF-8 strings just like narrow strings.
2054
32.1k
  if (SpellingPtr[0] == 'u' && 
SpellingPtr[1] == '8'3
)
2055
3
    SpellingPtr += 2;
2056
2057
32.1k
  assert(SpellingPtr[0] != 'L' && SpellingPtr[0] != 'u' &&
2058
32.1k
         SpellingPtr[0] != 'U' && "Doesn't handle wide or utf strings yet");
2059
2060
  // For raw string literals, this is easy.
2061
32.1k
  if (SpellingPtr[0] == 'R') {
2062
6
    assert(SpellingPtr[1] == '"' && "Should be a raw string literal!");
2063
    // Skip 'R"'.
2064
0
    SpellingPtr += 2;
2065
35
    while (*SpellingPtr != '(') {
2066
29
      ++SpellingPtr;
2067
29
      assert(SpellingPtr < SpellingEnd && "Missing ( for raw string literal");
2068
29
    }
2069
    // Skip '('.
2070
6
    ++SpellingPtr;
2071
6
    return SpellingPtr - SpellingStart + ByteNo;
2072
6
  }
2073
2074
  // Skip over the leading quote
2075
32.1k
  assert(SpellingPtr[0] == '"' && "Should be a string literal!");
2076
0
  ++SpellingPtr;
2077
2078
  // Skip over bytes until we find the offset we're looking for.
2079
498k
  while (ByteNo) {
2080
466k
    assert(SpellingPtr < SpellingEnd && "Didn't find byte offset!");
2081
2082
    // Step over non-escapes simply.
2083
466k
    if (*SpellingPtr != '\\') {
2084
465k
      ++SpellingPtr;
2085
465k
      --ByteNo;
2086
465k
      continue;
2087
465k
    }
2088
2089
    // Otherwise, this is an escape character.  Advance over it.
2090
856
    bool HadError = false;
2091
856
    if (SpellingPtr[1] == 'u' || 
SpellingPtr[1] == 'U'853
) {
2092
6
      const char *EscapePtr = SpellingPtr;
2093
6
      unsigned Len = MeasureUCNEscape(SpellingStart, SpellingPtr, SpellingEnd,
2094
6
                                      1, Features, HadError);
2095
6
      if (Len > ByteNo) {
2096
        // ByteNo is somewhere within the escape sequence.
2097
6
        SpellingPtr = EscapePtr;
2098
6
        break;
2099
6
      }
2100
0
      ByteNo -= Len;
2101
850
    } else {
2102
850
      ProcessCharEscape(SpellingStart, SpellingPtr, SpellingEnd, HadError,
2103
850
                        FullSourceLoc(Tok.getLocation(), SM),
2104
850
                        CharByteWidth*8, Diags, Features);
2105
850
      --ByteNo;
2106
850
    }
2107
850
    assert(!HadError && "This method isn't valid on erroneous strings");
2108
850
  }
2109
2110
32.1k
  return SpellingPtr-SpellingStart;
2111
32.1k
}
2112
2113
/// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved
2114
/// suffixes as ud-suffixes, because the diagnostic experience is better if we
2115
/// treat it as an invalid suffix.
2116
bool StringLiteralParser::isValidUDSuffix(const LangOptions &LangOpts,
2117
585
                                          StringRef Suffix) {
2118
585
  return NumericLiteralParser::isValidUDSuffix(LangOpts, Suffix) ||
2119
585
         
Suffix == "sv"61
;
2120
585
}