Coverage Report

Created: 2021-08-24 07:12

/Users/buildslave/jenkins/workspace/coverage/llvm-project/clang/lib/Lex/LiteralSupport.cpp
Line
Count
Source (jump to first uncovered line)
1
//===--- LiteralSupport.cpp - Code to parse and process literals ----------===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
// This file implements the NumericLiteralParser, CharLiteralParser, and
10
// StringLiteralParser interfaces.
11
//
12
//===----------------------------------------------------------------------===//
13
14
#include "clang/Lex/LiteralSupport.h"
15
#include "clang/Basic/CharInfo.h"
16
#include "clang/Basic/LangOptions.h"
17
#include "clang/Basic/SourceLocation.h"
18
#include "clang/Basic/TargetInfo.h"
19
#include "clang/Lex/LexDiagnostic.h"
20
#include "clang/Lex/Lexer.h"
21
#include "clang/Lex/Preprocessor.h"
22
#include "clang/Lex/Token.h"
23
#include "llvm/ADT/APInt.h"
24
#include "llvm/ADT/SmallVector.h"
25
#include "llvm/ADT/StringExtras.h"
26
#include "llvm/ADT/StringSwitch.h"
27
#include "llvm/Support/ConvertUTF.h"
28
#include "llvm/Support/Error.h"
29
#include "llvm/Support/ErrorHandling.h"
30
#include <algorithm>
31
#include <cassert>
32
#include <cstddef>
33
#include <cstdint>
34
#include <cstring>
35
#include <string>
36
37
using namespace clang;
38
39
5.61M
/// Map a character or string literal token kind to the width that \p Target
/// reports for the corresponding character type.
///
/// Ordinary and UTF-8 literals use the target's char width, wide literals the
/// wchar_t width, and UTF-16 / UTF-32 literals the char16 / char32 widths.
/// Passing any other token kind is a programming error.
static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) {
  switch (kind) {
  // UTF-32 literals.
  case tok::utf32_string_literal:
  case tok::utf32_char_constant:
    return Target.getChar32Width();

  // UTF-16 literals.
  case tok::utf16_string_literal:
  case tok::utf16_char_constant:
    return Target.getChar16Width();

  // Wide literals.
  case tok::wide_string_literal:
  case tok::wide_char_constant:
    return Target.getWCharWidth();

  // Ordinary and UTF-8 literals share the plain char width.
  case tok::utf8_string_literal:
  case tok::utf8_char_constant:
  case tok::string_literal:
  case tok::char_constant:
    return Target.getCharWidth();

  default:
    break;
  }

  llvm_unreachable("Unknown token type!");
}
58
59
/// Build the character range covering [\p TokRangeBegin, \p TokRangeEnd)
/// inside the token located at \p TokLoc.
///
/// Both range pointers are interpreted relative to \p TokBegin, the start of
/// the spelling buffer for that token.
static CharSourceRange MakeCharSourceRange(const LangOptions &Features,
                                           FullSourceLoc TokLoc,
                                           const char *TokBegin,
                                           const char *TokRangeBegin,
                                           const char *TokRangeEnd) {
  const auto &SrcMgr = TokLoc.getManager();

  // Locate the start of the range by its distance from the start of the
  // token spelling.
  const auto StartOffset = TokRangeBegin - TokBegin;
  const SourceLocation RangeStart =
      Lexer::AdvanceToTokenCharacter(TokLoc, StartOffset, SrcMgr, Features);

  // The end is found by advancing from the range start, not the token start.
  const auto RangeLength = TokRangeEnd - TokRangeBegin;
  const SourceLocation RangeStop =
      Lexer::AdvanceToTokenCharacter(RangeStart, RangeLength, SrcMgr, Features);

  return CharSourceRange::getCharRange(RangeStart, RangeStop);
}
72
73
/// Produce a diagnostic highlighting some portion of a literal.
74
///
75
/// Emits the diagnostic \p DiagID, highlighting the range of characters from
76
/// \p TokRangeBegin (inclusive) to \p TokRangeEnd (exclusive), which must be
77
/// a substring of a spelling buffer for the token beginning at \p TokBegin.
78
static DiagnosticBuilder Diag(DiagnosticsEngine *Diags,
79
                              const LangOptions &Features, FullSourceLoc TokLoc,
80
                              const char *TokBegin, const char *TokRangeBegin,
81
223
                              const char *TokRangeEnd, unsigned DiagID) {
82
223
  SourceLocation Begin =
83
223
    Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin,
84
223
                                   TokLoc.getManager(), Features);
85
223
  return Diags->Report(Begin, DiagID) <<
86
223
    MakeCharSourceRange(Features, TokLoc, TokBegin, TokRangeBegin, TokRangeEnd);
87
223
}
88
89
/// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
90
/// either a character or a string literal.
91
static unsigned ProcessCharEscape(const char *ThisTokBegin,
92
                                  const char *&ThisTokBuf,
93
                                  const char *ThisTokEnd, bool &HadError,
94
                                  FullSourceLoc Loc, unsigned CharWidth,
95
                                  DiagnosticsEngine *Diags,
96
105k
                                  const LangOptions &Features) {
97
105k
  const char *EscapeBegin = ThisTokBuf;
98
99
  // Skip the '\' char.
100
105k
  ++ThisTokBuf;
101
102
  // We know that this character can't be off the end of the buffer, because
103
  // that would have been \", which would not have been the end of string.
104
105k
  unsigned ResultChar = *ThisTokBuf++;
105
105k
  switch (ResultChar) {
106
  // These map to themselves.
107
989
  
case '\\': 402
case '\'': 436
case '"': 988
case '?': break;
108
109
    // These have fixed mappings.
110
26
  case 'a':
111
    // TODO: K&R: the meaning of '\\a' is different in traditional C
112
26
    ResultChar = 7;
113
26
    break;
114
25
  case 'b':
115
25
    ResultChar = 8;
116
25
    break;
117
15
  case 'e':
118
15
    if (Diags)
119
15
      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
120
15
           diag::ext_nonstandard_escape) << "e";
121
15
    ResultChar = 27;
122
15
    break;
123
1
  case 'E':
124
1
    if (Diags)
125
1
      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
126
1
           diag::ext_nonstandard_escape) << "E";
127
1
    ResultChar = 27;
128
1
    break;
129
17
  case 'f':
130
17
    ResultChar = 12;
131
17
    break;
132
94.0k
  case 'n':
133
94.0k
    ResultChar = 10;
134
94.0k
    break;
135
64
  case 'r':
136
64
    ResultChar = 13;
137
64
    break;
138
6.01k
  case 't':
139
6.01k
    ResultChar = 9;
140
6.01k
    break;
141
29
  case 'v':
142
29
    ResultChar = 11;
143
29
    break;
144
1.15k
  case 'x': { // Hex escape.
145
1.15k
    ResultChar = 0;
146
1.15k
    if (ThisTokBuf == ThisTokEnd || 
!isHexDigit(*ThisTokBuf)1.15k
) {
147
6
      if (Diags)
148
6
        Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
149
6
             diag::err_hex_escape_no_digits) << "x";
150
6
      HadError = true;
151
6
      break;
152
6
    }
153
154
    // Hex escapes are a maximal series of hex digits.
155
1.15k
    bool Overflow = false;
156
3.69k
    for (; ThisTokBuf != ThisTokEnd; 
++ThisTokBuf2.54k
) {
157
2.84k
      int CharVal = llvm::hexDigitValue(ThisTokBuf[0]);
158
2.84k
      if (CharVal == -1) 
break300
;
159
      // About to shift out a digit?
160
2.54k
      if (ResultChar & 0xF0000000)
161
0
        Overflow = true;
162
2.54k
      ResultChar <<= 4;
163
2.54k
      ResultChar |= CharVal;
164
2.54k
    }
165
166
    // See if any bits will be truncated when evaluated as a character.
167
1.15k
    if (CharWidth != 32 && 
(ResultChar >> CharWidth) != 01.06k
) {
168
0
      Overflow = true;
169
0
      ResultChar &= ~0U >> (32-CharWidth);
170
0
    }
171
172
    // Check for overflow.
173
1.15k
    if (Overflow && 
Diags0
) // Too many digits to fit in
174
0
      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
175
0
           diag::err_escape_too_large) << 0;
176
1.15k
    break;
177
1.15k
  }
178
2.62k
  
case '0': 2.23k
case '1': 2.44k
case '2': 2.62k
case '3':
179
2.62k
  
case '4': 2.62k
case '5': 2.62k
case '6': 2.62k
case '7': {
180
    // Octal escapes.
181
2.62k
    --ThisTokBuf;
182
2.62k
    ResultChar = 0;
183
184
    // Octal escapes are a series of octal digits with maximum length 3.
185
    // "\0123" is a two digit sequence equal to "\012" "3".
186
2.62k
    unsigned NumDigits = 0;
187
2.89k
    do {
188
2.89k
      ResultChar <<= 3;
189
2.89k
      ResultChar |= *ThisTokBuf++ - '0';
190
2.89k
      ++NumDigits;
191
2.89k
    } while (ThisTokBuf != ThisTokEnd && 
NumDigits < 31.79k
&&
192
2.89k
             
ThisTokBuf[0] >= '0'1.77k
&&
ThisTokBuf[0] <= '7'1.73k
);
193
194
    // Check for overflow.  Reject '\777', but not L'\777'.
195
2.62k
    if (CharWidth != 32 && 
(ResultChar >> CharWidth) != 02.44k
) {
196
1
      if (Diags)
197
1
        Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
198
1
             diag::err_escape_too_large) << 1;
199
1
      ResultChar &= ~0U >> (32-CharWidth);
200
1
    }
201
2.62k
    break;
202
2.62k
  }
203
204
    // Otherwise, these are not valid escapes.
205
28
  
case '(': 5
case '{': 10
case '[': 15
case '%':
206
    // GCC accepts these as extensions.  We warn about them as such though.
207
28
    if (Diags)
208
22
      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
209
22
           diag::ext_nonstandard_escape)
210
22
        << std::string(1, ResultChar);
211
28
    break;
212
10
  default:
213
10
    if (!Diags)
214
0
      break;
215
216
10
    if (isPrintable(ResultChar))
217
8
      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
218
8
           diag::ext_unknown_escape)
219
8
        << std::string(1, ResultChar);
220
2
    else
221
2
      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
222
2
           diag::ext_unknown_escape)
223
2
        << "x" + llvm::utohexstr(ResultChar);
224
10
    break;
225
105k
  }
226
227
105k
  return ResultChar;
228
105k
}
229
230
static void appendCodePoint(unsigned Codepoint,
231
175
                            llvm::SmallVectorImpl<char> &Str) {
232
175
  char ResultBuf[4];
233
175
  char *ResultPtr = ResultBuf;
234
175
  bool Res = llvm::ConvertCodePointToUTF8(Codepoint, ResultPtr);
235
175
  (void)Res;
236
175
  assert(Res && "Unexpected conversion failure");
237
0
  Str.append(ResultBuf, ResultPtr);
238
175
}
239
240
1.00k
/// Copy \p Input to \p Buf, replacing each universal character name
/// (backslash-u with 4 hex digits, backslash-U with 8) by the UTF-8 encoding
/// of its code point.
///
/// The input is assumed to be well formed: every backslash must start a
/// complete UCN.  Violations are only caught by assertions.
void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
  const char *Cur = Input.begin();
  const char *const End = Input.end();

  while (Cur != End) {
    const char Ch = *Cur++;
    if (Ch != '\\') {
      // Ordinary character: copied through unchanged.
      Buf.push_back(Ch);
      continue;
    }

    // Cur now points at the 'u' / 'U' designator, which fixes the number of
    // hex digits that follow.
    assert(*Cur == 'u' || *Cur == 'U');
    const unsigned NumHexDigits = (*Cur == 'u') ? 4 : 8;
    assert(Cur + NumHexDigits <= End);
    ++Cur;

    // Accumulate the code point one hex digit at a time.
    uint32_t CodePoint = 0;
    for (unsigned Remaining = NumHexDigits; Remaining != 0;
         --Remaining, ++Cur) {
      const unsigned Value = llvm::hexDigitValue(*Cur);
      assert(Value != -1U);
      CodePoint = (CodePoint << 4) + Value;
    }

    appendCodePoint(CodePoint, Buf);
  }
}
271
272
/// ProcessUCNEscape - Read the Universal Character Name, check constraints and
273
/// return the UTF32.
274
static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
275
                             const char *ThisTokEnd,
276
                             uint32_t &UcnVal, unsigned short &UcnLen,
277
                             FullSourceLoc Loc, DiagnosticsEngine *Diags,
278
                             const LangOptions &Features,
279
390
                             bool in_char_string_literal = false) {
280
390
  const char *UcnBegin = ThisTokBuf;
281
282
  // Skip the '\u' char's.
283
390
  ThisTokBuf += 2;
284
285
390
  if (ThisTokBuf == ThisTokEnd || 
!isHexDigit(*ThisTokBuf)387
) {
286
4
    if (Diags)
287
4
      Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
288
4
           diag::err_hex_escape_no_digits) << StringRef(&ThisTokBuf[-1], 1);
289
4
    return false;
290
4
  }
291
386
  UcnLen = (ThisTokBuf[-1] == 'u' ? 
4303
:
883
);
292
386
  unsigned short UcnLenSave = UcnLen;
293
2.24k
  for (; ThisTokBuf != ThisTokEnd && 
UcnLenSave1.98k
;
++ThisTokBuf, UcnLenSave--1.85k
) {
294
1.86k
    int CharVal = llvm::hexDigitValue(ThisTokBuf[0]);
295
1.86k
    if (CharVal == -1) 
break4
;
296
1.85k
    UcnVal <<= 4;
297
1.85k
    UcnVal |= CharVal;
298
1.85k
  }
299
  // If we didn't consume the proper number of digits, there is a problem.
300
386
  if (UcnLenSave) {
301
6
    if (Diags)
302
6
      Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
303
6
           diag::err_ucn_escape_incomplete);
304
6
    return false;
305
6
  }
306
307
  // Check UCN constraints (C99 6.4.3p2) [C++11 lex.charset p2]
308
380
  if ((0xD800 <= UcnVal && 
UcnVal <= 0xDFFF110
) || // surrogate codepoints
309
380
      
UcnVal > 0x10FFFF364
) { // maximum legal UTF32 value
310
19
    if (Diags)
311
19
      Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
312
19
           diag::err_ucn_escape_invalid);
313
19
    return false;
314
19
  }
315
316
  // C++11 allows UCNs that refer to control characters and basic source
317
  // characters inside character and string literals
318
361
  if (UcnVal < 0xa0 &&
319
361
      
(115
UcnVal != 0x24115
&&
UcnVal != 0x40111
&&
UcnVal != 0x60107
)) { // $, @, `
320
103
    bool IsError = (!Features.CPlusPlus11 || 
!in_char_string_literal70
);
321
103
    if (Diags) {
322
103
      char BasicSCSChar = UcnVal;
323
103
      if (UcnVal >= 0x20 && 
UcnVal < 0x7f79
)
324
48
        Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
325
48
             IsError ? 
diag::err_ucn_escape_basic_scs18
:
326
48
                       
diag::warn_cxx98_compat_literal_ucn_escape_basic_scs30
)
327
48
            << StringRef(&BasicSCSChar, 1);
328
55
      else
329
55
        Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
330
55
             IsError ? 
diag::err_ucn_control_character15
:
331
55
                       
diag::warn_cxx98_compat_literal_ucn_control_character40
);
332
103
    }
333
103
    if (IsError)
334
33
      return false;
335
103
  }
336
337
328
  if (!Features.CPlusPlus && 
!Features.C99119
&&
Diags2
)
338
2
    Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
339
2
         diag::warn_ucn_not_valid_in_c89_literal);
340
341
328
  return true;
342
361
}
343
344
/// MeasureUCNEscape - Determine the number of bytes within the resulting string
345
/// which this UCN will occupy.
346
static int MeasureUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
347
                            const char *ThisTokEnd, unsigned CharByteWidth,
348
6
                            const LangOptions &Features, bool &HadError) {
349
  // UTF-32: 4 bytes per escape.
350
6
  if (CharByteWidth == 4)
351
0
    return 4;
352
353
6
  uint32_t UcnVal = 0;
354
6
  unsigned short UcnLen = 0;
355
6
  FullSourceLoc Loc;
356
357
6
  if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,
358
6
                        UcnLen, Loc, nullptr, Features, true)) {
359
0
    HadError = true;
360
0
    return 0;
361
0
  }
362
363
  // UTF-16: 2 bytes for BMP, 4 bytes otherwise.
364
6
  if (CharByteWidth == 2)
365
0
    return UcnVal <= 0xFFFF ? 2 : 4;
366
367
  // UTF-8.
368
6
  if (UcnVal < 0x80)
369
0
    return 1;
370
6
  if (UcnVal < 0x800)
371
0
    return 2;
372
6
  if (UcnVal < 0x10000)
373
3
    return 3;
374
3
  return 4;
375
6
}
376
377
/// EncodeUCNEscape - Read the Universal Character Name, check constraints and
378
/// convert the UTF32 to UTF8 or UTF16. This is a subroutine of
379
/// StringLiteralParser. When we decide to implement UCN's for identifiers,
380
/// we will likely rework our support for UCN's.
381
static void EncodeUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
382
                            const char *ThisTokEnd,
383
                            char *&ResultBuf, bool &HadError,
384
                            FullSourceLoc Loc, unsigned CharByteWidth,
385
                            DiagnosticsEngine *Diags,
386
289
                            const LangOptions &Features) {
387
289
  typedef uint32_t UTF32;
388
289
  UTF32 UcnVal = 0;
389
289
  unsigned short UcnLen = 0;
390
289
  if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal, UcnLen,
391
289
                        Loc, Diags, Features, true)) {
392
33
    HadError = true;
393
33
    return;
394
33
  }
395
396
256
  assert((CharByteWidth == 1 || CharByteWidth == 2 || CharByteWidth == 4) &&
397
256
         "only character widths of 1, 2, or 4 bytes supported");
398
399
0
  (void)UcnLen;
400
256
  assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported");
401
402
256
  if (CharByteWidth == 4) {
403
    // FIXME: Make the type of the result buffer correct instead of
404
    // using reinterpret_cast.
405
69
    llvm::UTF32 *ResultPtr = reinterpret_cast<llvm::UTF32*>(ResultBuf);
406
69
    *ResultPtr = UcnVal;
407
69
    ResultBuf += 4;
408
69
    return;
409
69
  }
410
411
187
  if (CharByteWidth == 2) {
412
    // FIXME: Make the type of the result buffer correct instead of
413
    // using reinterpret_cast.
414
54
    llvm::UTF16 *ResultPtr = reinterpret_cast<llvm::UTF16*>(ResultBuf);
415
416
54
    if (UcnVal <= (UTF32)0xFFFF) {
417
40
      *ResultPtr = UcnVal;
418
40
      ResultBuf += 2;
419
40
      return;
420
40
    }
421
422
    // Convert to UTF16.
423
14
    UcnVal -= 0x10000;
424
14
    *ResultPtr     = 0xD800 + (UcnVal >> 10);
425
14
    *(ResultPtr+1) = 0xDC00 + (UcnVal & 0x3FF);
426
14
    ResultBuf += 4;
427
14
    return;
428
54
  }
429
430
133
  assert(CharByteWidth == 1 && "UTF-8 encoding is only for 1 byte characters");
431
432
  // Now that we've parsed/checked the UCN, we convert from UTF32->UTF8.
433
  // The conversion below was inspired by:
434
  //   http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
435
  // First, we determine how many bytes the result will require.
436
0
  typedef uint8_t UTF8;
437
438
133
  unsigned short bytesToWrite = 0;
439
133
  if (UcnVal < (UTF32)0x80)
440
28
    bytesToWrite = 1;
441
105
  else if (UcnVal < (UTF32)0x800)
442
15
    bytesToWrite = 2;
443
90
  else if (UcnVal < (UTF32)0x10000)
444
66
    bytesToWrite = 3;
445
24
  else
446
24
    bytesToWrite = 4;
447
448
133
  const unsigned byteMask = 0xBF;
449
133
  const unsigned byteMark = 0x80;
450
451
  // Once the bits are split out into bytes of UTF8, this is a mask OR-ed
452
  // into the first byte, depending on how many bytes follow.
453
133
  static const UTF8 firstByteMark[5] = {
454
133
    0x00, 0x00, 0xC0, 0xE0, 0xF0
455
133
  };
456
  // Finally, we write the bytes into ResultBuf.
457
133
  ResultBuf += bytesToWrite;
458
133
  switch (bytesToWrite) { // note: everything falls through.
459
24
  case 4:
460
24
    *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
461
24
    LLVM_FALLTHROUGH;
462
90
  case 3:
463
90
    *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
464
90
    LLVM_FALLTHROUGH;
465
105
  case 2:
466
105
    *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
467
105
    LLVM_FALLTHROUGH;
468
133
  case 1:
469
133
    *--ResultBuf = (UTF8) (UcnVal | firstByteMark[bytesToWrite]);
470
133
  }
471
  // Update the buffer.
472
133
  ResultBuf += bytesToWrite;
473
133
}
474
475
///       integer-constant: [C99 6.4.4.1]
476
///         decimal-constant integer-suffix
477
///         octal-constant integer-suffix
478
///         hexadecimal-constant integer-suffix
479
///         binary-literal integer-suffix [GNU, C++1y]
480
///       user-defined-integer-literal: [C++11 lex.ext]
481
///         decimal-literal ud-suffix
482
///         octal-literal ud-suffix
483
///         hexadecimal-literal ud-suffix
484
///         binary-literal ud-suffix [GNU, C++1y]
485
///       decimal-constant:
486
///         nonzero-digit
487
///         decimal-constant digit
488
///       octal-constant:
489
///         0
490
///         octal-constant octal-digit
491
///       hexadecimal-constant:
492
///         hexadecimal-prefix hexadecimal-digit
493
///         hexadecimal-constant hexadecimal-digit
494
///       hexadecimal-prefix: one of
495
///         0x 0X
496
///       binary-literal:
497
///         0b binary-digit
498
///         0B binary-digit
499
///         binary-literal binary-digit
500
///       integer-suffix:
501
///         unsigned-suffix [long-suffix]
502
///         unsigned-suffix [long-long-suffix]
503
///         long-suffix [unsigned-suffix]
504
///         long-long-suffix [unsigned-suffix]
505
///       nonzero-digit:
506
///         1 2 3 4 5 6 7 8 9
507
///       octal-digit:
508
///         0 1 2 3 4 5 6 7
509
///       hexadecimal-digit:
510
///         0 1 2 3 4 5 6 7 8 9
511
///         a b c d e f
512
///         A B C D E F
513
///       binary-digit:
514
///         0
515
///         1
516
///       unsigned-suffix: one of
517
///         u U
518
///       long-suffix: one of
519
///         l L
520
///       long-long-suffix: one of
521
///         ll LL
522
///
523
///       floating-constant: [C99 6.4.4.2]
524
///         TODO: add rules...
525
///
526
NumericLiteralParser::NumericLiteralParser(StringRef TokSpelling,
                                           SourceLocation TokLoc,
                                           const SourceManager &SM,
                                           const LangOptions &LangOpts,
                                           const TargetInfo &Target,
                                           DiagnosticsEngine &Diags)
    : SM(SM), LangOpts(LangOpts), Diags(Diags),
      ThisTokBegin(TokSpelling.begin()), ThisTokEnd(TokSpelling.end()) {
  // Parses the spelling of a numeric literal: first the digits (including any
  // radix prefix, fraction and exponent), then the suffix.  The results are
  // left in the member flags; hadError is set if anything was diagnosed.

  // The spelling is assumed to match the 'pp-number' regex exactly, and the
  // byte at *ThisTokEnd is assumed to be readable and not part of that regex.
  // Thanks to this, the code below need not check for 'overscan' everywhere.
  assert(!isPreprocessingNumberBody(*ThisTokEnd) && "didn't maximally munch?");

  // Reset the cursor and every piece of state before parsing.
  DigitsBegin = ThisTokBegin;
  s = ThisTokBegin;

  hadError = false;
  saw_period = false;
  saw_exponent = false;
  saw_ud_suffix = false;
  saw_fixed_point_suffix = false;

  isUnsigned = false;
  isLong = false;
  isLongLong = false;
  isSizeT = false;
  MicrosoftInteger = 0;

  isHalf = false;
  isFloat = false;
  isFloat16 = false;
  isFloat128 = false;
  isImaginary = false;

  isFract = false;
  isAccum = false;

  // --- Digits ---------------------------------------------------------------
  if (*s == '0') {
    // A leading zero may introduce a radix prefix.
    ParseNumberStartingWithZero(TokLoc);
    if (hadError)
      return;
  } else {
    // A non-zero first digit means decimal.
    radix = 10;
    s = SkipDigits(s);
    if (s != ThisTokEnd) {
      ParseDecimalOrOctalCommon(TokLoc);
      if (hadError)
        return;
    }
  }

  SuffixBegin = s;
  checkSeparator(TokLoc, s, CSK_AfterDigits);

  // Look ahead for a fixed point suffix character anywhere in the remainder.
  if (LangOpts.FixedPoint &&
      std::any_of(s, ThisTokEnd, [](char C) {
        return C == 'r' || C == 'k' || C == 'R' || C == 'K';
      }))
    saw_fixed_point_suffix = true;

  // --- Suffix ---------------------------------------------------------------
  // From here on we know whether this is an FP or an integer constant.
  const bool IsFixedPoint = isFixedPointLiteral();
  const bool IsFloating = isFloatingLiteral();
  bool SawSize = false;

  // Walk the suffix one character at a time.  A 'continue' accepts the
  // current character; leaving the switch any other way stops the scan with
  // 's' at the offending character.
  for (; s != ThisTokEnd; ++s) {
    switch (*s) {
    case 'r':
    case 'R':
      // Fixed point "fract": needs a period or exponent, at most once.
      if (LangOpts.FixedPoint && !isFract && !isAccum &&
          (saw_period || saw_exponent)) {
        isFract = true;
        continue;
      }
      break;

    case 'k':
    case 'K':
      // Fixed point "accum": same rules as "fract".
      if (LangOpts.FixedPoint && !isFract && !isAccum &&
          (saw_period || saw_exponent)) {
        isAccum = true;
        continue;
      }
      break;

    case 'h': // FP suffix for "half".
    case 'H':
      // OpenCL Extension v1.2 s9.5 - h or H suffix for half type.  Not valid
      // on an integer constant or after another size suffix.
      if ((LangOpts.Half || LangOpts.FixedPoint) && !isIntegerLiteral() &&
          !SawSize) {
        SawSize = true;
        isHalf = true;
        continue;
      }
      break;

    case 'f': // FP suffix for "float".
    case 'F':
      // Not valid on an integer constant or after another size suffix.
      if (!IsFloating || SawSize)
        break;
      SawSize = true;

      // CUDA host and device may have different _Float16 support, therefore
      // allows f16 literals to avoid false alarm.
      // ToDo: more precise check for CUDA.
      if ((Target.hasFloat16Type() || LangOpts.CUDA) && s + 2 < ThisTokEnd &&
          s[1] == '1' && s[2] == '6') {
        s += 2; // Consume the "16" as well.
        isFloat16 = true;
      } else {
        isFloat = true;
      }
      continue;

    case 'q': // FP suffix for "__float128".
    case 'Q':
      // Not valid on an integer constant or after another size suffix.
      if (IsFloating && !SawSize) {
        SawSize = true;
        isFloat128 = true;
        continue;
      }
      break;

    case 'u':
    case 'U':
      // Not valid on a floating constant, and cannot be repeated.
      if (!IsFloating && !isUnsigned) {
        isUnsigned = true;
        continue;
      }
      break;

    case 'l':
    case 'L':
      if (SawSize)
        break;
      SawSize = true;

      // A single L means long.
      if (s[1] != s[0]) {
        isLong = true;
        continue;
      }

      // Two adjacent L's of the same case mean long long, which is invalid
      // for floats.
      assert(s + 1 < ThisTokEnd && "didn't maximally munch?");
      if (IsFloating)
        break;
      isLongLong = true;
      ++s; // Consume the second L as well.
      continue;

    case 'z':
    case 'Z':
      // Not valid on floats or after another size suffix.
      if (!IsFloating && !SawSize) {
        SawSize = true;
        isSizeT = true;
        continue;
      }
      break;

    case 'i':
    case 'I':
      if (LangOpts.MicrosoftExt && !IsFloating) {
        // Allow i8, i16, i32, and i64.  Look ahead to tell these Microsoft
        // integer suffixes apart from the imaginary unit.
        uint8_t Bits = 0;
        size_t ToSkip = 0;
        if (s[1] == '8') {
          Bits = 8;
          ToSkip = 2;
        } else if (s[1] == '1' && s[2] == '6') {
          Bits = 16;
          ToSkip = 3;
        } else if (s[1] == '3' && s[2] == '2') {
          Bits = 32;
          ToSkip = 3;
        } else if (s[1] == '6' && s[2] == '4') {
          Bits = 64;
          ToSkip = 3;
        }

        if (Bits) {
          // Whether accepted or rejected, the scan stops after a Microsoft
          // integer suffix.
          if (!SawSize) {
            SawSize = true;
            MicrosoftInteger = Bits;
            s += ToSkip;
            assert(s <= ThisTokEnd && "didn't maximally munch?");
          }
          break;
        }
      }
      LLVM_FALLTHROUGH;
    case 'j':
    case 'J':
      // Imaginary suffix; cannot be repeated.
      if (!isImaginary) {
        isImaginary = true;
        continue;
      }
      break;
    }

    // Getting here means there was an error or a ud-suffix.
    break;
  }

  // "i", "if", and "il" are user-defined suffixes in C++1y.
  if (s != ThisTokEnd || isImaginary) {
    const StringRef SuffixText(SuffixBegin, ThisTokEnd - SuffixBegin);

    // FIXME: Don't bother expanding UCNs if !tok.hasUCN().
    expandUCNs(UDSuffixBuf, SuffixText);
    if (isValidUDSuffix(LangOpts, UDSuffixBuf)) {
      if (!isImaginary) {
        // Any suffix pieces parsed above are really part of the ud-suffix,
        // so discard what they set.
        isUnsigned = false;
        isLong = false;
        isLongLong = false;
        isSizeT = false;
        MicrosoftInteger = 0;
        isHalf = false;
        isFloat = false;
        isFloat16 = false;
        isImaginary = false;
        saw_fixed_point_suffix = false;
        isFract = false;
        isAccum = false;
      }

      saw_ud_suffix = true;
      return;
    }

    if (s != ThisTokEnd) {
      // The remaining characters are not a valid suffix.  The last argument
      // selects the constant kind: 2 for fixed point, else 1 for floating
      // and 0 for integer.
      const int ConstantKind = IsFixedPoint ? 2 : IsFloating;
      Diags.Report(Lexer::AdvanceToTokenCharacter(
                       TokLoc, SuffixBegin - ThisTokBegin, SM, LangOpts),
                   diag::err_invalid_suffix_constant)
          << SuffixText << ConstantKind;
      hadError = true;
    }
  }

  // A fixed point suffix seen by the lookahead must have been classified,
  // unless an error was reported.
  if (!hadError && saw_fixed_point_suffix) {
    assert(isFract || isAccum);
  }
}
779
780
/// ParseDecimalOrOctalCommon - This method is called for decimal or octal
781
/// numbers. It issues an error for illegal digits, and handles floating point
782
/// parsing. If it detects a floating point number, the radix is set to 10.
783
399k
void NumericLiteralParser::ParseDecimalOrOctalCommon(SourceLocation TokLoc){
784
399k
  assert((radix == 8 || radix == 10) && "Unexpected radix");
785
786
  // If we have a hex digit other than 'e' (which denotes a FP exponent) then
787
  // the code is using an incorrect base.
788
399k
  if (isHexDigit(*s) && 
*s != 'e'357
&&
*s != 'E'45
&&
789
399k
      
!isValidUDSuffix(LangOpts, StringRef(s, ThisTokEnd - s))12
) {
790
10
    Diags.Report(
791
10
        Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM, LangOpts),
792
10
        diag::err_invalid_digit)
793
10
        << StringRef(s, 1) << (radix == 8 ? 
19
:
01
);
794
10
    hadError = true;
795
10
    return;
796
10
  }
797
798
399k
  if (*s == '.') {
799
37.6k
    checkSeparator(TokLoc, s, CSK_AfterDigits);
800
37.6k
    s++;
801
37.6k
    radix = 10;
802
37.6k
    saw_period = true;
803
37.6k
    checkSeparator(TokLoc, s, CSK_BeforeDigits);
804
37.6k
    s = SkipDigits(s); // Skip suffix.
805
37.6k
  }
806
399k
  if (*s == 'e' || 
*s == 'E'392k
) { // exponent
807
6.72k
    checkSeparator(TokLoc, s, CSK_AfterDigits);
808
6.72k
    const char *Exponent = s;
809
6.72k
    s++;
810
6.72k
    radix = 10;
811
6.72k
    saw_exponent = true;
812
6.72k
    if (s != ThisTokEnd && 
(6.71k
*s == '+'6.71k
||
*s == '-'5.31k
))
s++6.34k
; // sign
813
6.72k
    const char *first_non_digit = SkipDigits(s);
814
6.72k
    if (containsDigits(s, first_non_digit)) {
815
6.71k
      checkSeparator(TokLoc, s, CSK_BeforeDigits);
816
6.71k
      s = first_non_digit;
817
6.71k
    } else {
818
6
      if (!hadError) {
819
4
        Diags.Report(Lexer::AdvanceToTokenCharacter(
820
4
                         TokLoc, Exponent - ThisTokBegin, SM, LangOpts),
821
4
                     diag::err_exponent_has_no_digits);
822
4
        hadError = true;
823
4
      }
824
6
      return;
825
6
    }
826
6.72k
  }
827
399k
}
828
829
/// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved
830
/// suffixes as ud-suffixes, because the diagnostic experience is better if we
831
/// treat it as an invalid suffix.
832
bool NumericLiteralParser::isValidUDSuffix(const LangOptions &LangOpts,
833
1.34k
                                           StringRef Suffix) {
834
1.34k
  if (!LangOpts.CPlusPlus11 || 
Suffix.empty()1.04k
)
835
301
    return false;
836
837
  // By C++11 [lex.ext]p10, ud-suffixes starting with an '_' are always valid.
838
1.04k
  if (Suffix[0] == '_')
839
187
    return true;
840
841
  // In C++11, there are no library suffixes.
842
855
  if (!LangOpts.CPlusPlus14)
843
35
    return false;
844
845
  // In C++14, "s", "h", "min", "ms", "us", and "ns" are used in the library.
846
  // Per tweaked N3660, "il", "i", and "if" are also used in the library.
847
  // In C++2a "d" and "y" are used in the library.
848
820
  return llvm::StringSwitch<bool>(Suffix)
849
820
      .Cases("h", "min", "s", true)
850
820
      .Cases("ms", "us", "ns", true)
851
820
      .Cases("il", "i", "if", true)
852
820
      .Cases("d", "y", LangOpts.CPlusPlus20)
853
820
      .Default(false);
854
855
}
855
856
void NumericLiteralParser::checkSeparator(SourceLocation TokLoc,
857
                                          const char *Pos,
858
6.36M
                                          CheckSeparatorKind IsAfterDigits) {
859
6.36M
  if (IsAfterDigits == CSK_AfterDigits) {
860
6.31M
    if (Pos == ThisTokBegin)
861
542
      return;
862
6.31M
    --Pos;
863
6.31M
  } else 
if (44.6k
Pos == ThisTokEnd44.6k
)
864
598
    return;
865
866
6.36M
  if (isDigitSeparator(*Pos)) {
867
36
    Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, Pos - ThisTokBegin, SM,
868
36
                                                LangOpts),
869
36
                 diag::err_digit_separator_not_between_digits)
870
36
        << IsAfterDigits;
871
36
    hadError = true;
872
36
  }
873
6.36M
}
874
875
/// ParseNumberStartingWithZero - This method is called when the first character
876
/// of the number is found to be a zero.  This means it is either an octal
877
/// number (like '04'), a hex number ('0x123a'), a binary number ('0b1010') or
878
/// a floating point number (01239.123e4).  Eat the prefix, determining the
879
/// radix etc.
880
1.20M
void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) {
  assert(s[0] == '0' && "Invalid method call");
  ++s;

  // Translates a position inside the token into a location for diagnostics.
  auto LocOf = [&](const char *Pos) {
    return Lexer::AdvanceToTokenCharacter(TokLoc, Pos - ThisTokBegin, SM,
                                          LangOpts);
  };

  const int PrefixChar = s[0];

  // Hexadecimal: "0x"/"0X" followed by a hex digit or a '.', e.g. 0x1234.
  const bool IsHexPrefix = (PrefixChar == 'x' || PrefixChar == 'X') &&
                           (isHexDigit(s[1]) || s[1] == '.');
  if (IsHexPrefix) {
    ++s;
    assert(s < ThisTokEnd && "didn't maximally munch?");
    radix = 16;
    DigitsBegin = s;
    s = SkipHexDigits(s);
    bool SawSignificandDigit = containsDigits(DigitsBegin, s);

    // Optional fractional part of the significand.
    if (s != ThisTokEnd && *s == '.') {
      ++s;
      saw_period = true;
      const char *const FractionBegin = s;
      s = SkipHexDigits(s);
      SawSignificandDigit =
          containsDigits(FractionBegin, s) || SawSignificandDigit;
      if (SawSignificandDigit)
        checkSeparator(TokLoc, FractionBegin, CSK_BeforeDigits);
    }

    // Neither the integer part nor the fraction contributed a digit.
    if (!SawSignificandDigit) {
      Diags.Report(LocOf(s), diag::err_hex_constant_requires)
          << LangOpts.CPlusPlus << 1;
      hadError = true;
      return;
    }

    // A binary exponent can appear with or without a '.'. If dotted, the
    // binary exponent is required.
    const bool HasBinaryExponent = *s == 'p' || *s == 'P';
    if (!HasBinaryExponent) {
      if (saw_period) {
        Diags.Report(LocOf(s), diag::err_hex_constant_requires)
            << LangOpts.CPlusPlus << 0;
        hadError = true;
      }
      return;
    }

    checkSeparator(TokLoc, s, CSK_AfterDigits);
    const char *const ExponentMarker = s;
    ++s;
    saw_exponent = true;
    if (s != ThisTokEnd && (*s == '+' || *s == '-'))
      ++s; // Optional sign.

    const char *const ExponentDigitsEnd = SkipDigits(s);
    if (!containsDigits(s, ExponentDigitsEnd)) {
      // Only report the missing exponent digits if nothing else has been
      // reported for this literal yet.
      if (!hadError) {
        Diags.Report(LocOf(ExponentMarker), diag::err_exponent_has_no_digits);
        hadError = true;
      }
      return;
    }
    checkSeparator(TokLoc, s, CSK_BeforeDigits);
    s = ExponentDigitsEnd;

    // Hexadecimal floating literals: extension diagnostic when HexFloats is
    // off, otherwise a compatibility warning when C++17 is enabled.
    if (!LangOpts.HexFloats)
      Diags.Report(TokLoc, LangOpts.CPlusPlus ? diag::ext_hex_literal_invalid
                                              : diag::ext_hex_constant_invalid);
    else if (LangOpts.CPlusPlus17)
      Diags.Report(TokLoc, diag::warn_cxx17_hex_literal);
    return;
  }

  // Binary: "0b"/"0B" followed by a binary digit, e.g. 0b01010.
  const bool IsBinaryPrefix = (PrefixChar == 'b' || PrefixChar == 'B') &&
                              (s[1] == '0' || s[1] == '1');
  if (IsBinaryPrefix) {
    // 0b101010 is a C++1y / GCC extension.
    unsigned BinaryLiteralDiag;
    if (LangOpts.CPlusPlus14)
      BinaryLiteralDiag = diag::warn_cxx11_compat_binary_literal;
    else if (LangOpts.CPlusPlus)
      BinaryLiteralDiag = diag::ext_binary_literal_cxx14;
    else
      BinaryLiteralDiag = diag::ext_binary_literal;
    Diags.Report(TokLoc, BinaryLiteralDiag);

    ++s;
    assert(s < ThisTokEnd && "didn't maximally munch?");
    radix = 2;
    DigitsBegin = s;
    s = SkipBinaryDigits(s);

    // A trailing hex digit that is not part of a valid ud-suffix is an
    // invalid binary digit. Other suffixes will be diagnosed by the caller.
    if (s != ThisTokEnd && isHexDigit(*s) &&
        !isValidUDSuffix(LangOpts, StringRef(s, ThisTokEnd - s))) {
      Diags.Report(LocOf(s), diag::err_invalid_digit) << StringRef(s, 1) << 2;
      hadError = true;
    }
    return;
  }

  // For now, the radix is set to 8. If we discover that we have a
  // floating point constant, the radix will change to 10. Octal floating
  // point constants are not permitted (only decimal and hexadecimal).
  radix = 8;
  DigitsBegin = s;
  s = SkipOctalDigits(s);
  if (s == ThisTokEnd)
    return; // Done, simple octal number like 01234

  // A decimal digit that is not an octal digit is only acceptable when the
  // digit run is followed by '.', 'e' or 'E', i.e. the literal is really a
  // floating point number such as 094.123 or 09e1.
  if (isDigit(*s)) {
    const char *const DecimalEnd = SkipDigits(s);
    const char Next = DecimalEnd[0];
    if (Next == '.' || Next == 'e' || Next == 'E') {
      s = DecimalEnd;
      radix = 10;
    }
  }

  ParseDecimalOrOctalCommon(TokLoc);
}
1000
1001
6.23M
static bool alwaysFitsInto64Bits(unsigned Radix, unsigned NumDigits) {
1002
6.23M
  switch (Radix) {
1003
82
  case 2:
1004
82
    return NumDigits <= 64;
1005
238k
  case 8:
1006
238k
    return NumDigits <= 64 / 3; // Digits are groups of 3 bits.
1007
5.04M
  case 10:
1008
5.04M
    return NumDigits <= 19; // floor(log10(2^64))
1009
949k
  case 16:
1010
949k
    return NumDigits <= 64 / 4; // Digits are groups of 4 bits.
1011
0
  default:
1012
0
    llvm_unreachable("impossible Radix");
1013
6.23M
  }
1014
6.23M
}
1015
1016
/// GetIntegerValue - Convert this numeric literal value to an APInt that
1017
/// matches Val's input width.  If there is an overflow, set Val to the low bits
1018
/// of the result and return true.  Otherwise, return false.
1019
6.23M
bool NumericLiteralParser::GetIntegerValue(llvm::APInt &Val) {
  // Fast path: when the digit count alone proves that the value fits in a
  // uint64_t, accumulate in a plain integer and skip the per-digit overflow
  // checks of the slow path below. Digit separators are counted as digits
  // here, which only makes the bound more conservative.
  const unsigned NumDigits = SuffixBegin - DigitsBegin;
  if (alwaysFitsInto64Bits(radix, NumDigits)) {
    uint64_t Accum = 0;
    for (const char *Cur = DigitsBegin; Cur != SuffixBegin; ++Cur) {
      if (isDigitSeparator(*Cur))
        continue;
      Accum = Accum * radix + llvm::hexDigitValue(*Cur);
    }

    // The assignment truncates to Val's width; reading the value back tells
    // us whether any bits were lost.
    Val = Accum;
    return Val.getZExtValue() != Accum;
  }

  // Slow path: arbitrary-precision accumulation at Val's bit width, checking
  // for overflow after every multiply and every add.
  Val = 0;
  const unsigned Width = Val.getBitWidth();
  const llvm::APInt RadixVal(Width, radix);
  llvm::APInt DigitVal(Width, 0);

  bool OverflowOccurred = false;
  for (const char *Cur = DigitsBegin; Cur < SuffixBegin; ++Cur) {
    if (isDigitSeparator(*Cur))
      continue;

    const unsigned Digit = llvm::hexDigitValue(*Cur);

    // If this letter is out of bound for this radix, reject it.
    assert(Digit < radix &&
           "NumericLiteralParser ctor should have rejected this");
    DigitVal = Digit;

    // Multiply by the radix; if dividing back does not give the previous
    // value, the multiplication overflowed.
    const llvm::APInt Previous = Val;
    Val *= RadixVal;
    OverflowOccurred |= Val.udiv(RadixVal) != Previous;

    // Add the digit:  (a + b) ult b  <=> overflow
    Val += DigitVal;
    OverflowOccurred |= Val.ult(DigitVal);
  }
  return OverflowOccurred;
}
1075
1076
llvm::APFloat::opStatus
NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) {
  using llvm::APFloat;

  // Convert only the text in front of the suffix; the length is clamped so
  // that it never reaches past the end of the token.
  const unsigned Length =
      std::min(SuffixBegin - ThisTokBegin, ThisTokEnd - ThisTokBegin);
  StringRef Digits(ThisTokBegin, Length);

  // If the text contains digit separators, copy it into a scratch buffer
  // without them and convert that copy instead.
  llvm::SmallString<16> Scratch;
  if (Digits.find('\'') != StringRef::npos) {
    Scratch.reserve(Length);
    for (const char Ch : Digits)
      if (!isDigitSeparator(Ch))
        Scratch.push_back(Ch);
    Digits = Scratch;
  }

  auto StatusOrErr =
      Result.convertFromString(Digits, APFloat::rmNearestTiesToEven);
  assert(StatusOrErr && "Invalid floating point representation");

  // With assertions disabled a failed conversion is reported as opInvalidOp.
  if (errorToBool(StatusOrErr.takeError()))
    return APFloat::opInvalidOp;
  return *StatusOrErr;
}
1097
1098
270
/// Returns true if \p c is one of the exponent markers 'p', 'P', 'e' or 'E'.
static inline bool IsExponentPart(char c) {
  switch (c) {
  case 'p':
  case 'P':
  case 'e':
  case 'E':
    return true;
  default:
    return false;
  }
}
1101
1102
808
/// Converts this literal to a fixed-point value with \p Scale fractional bits
/// and stores it in \p StoreVal, keeping StoreVal's bit width. Returns true if
/// the value does not fit in StoreVal or the exponent was too large to
/// process, false otherwise.
bool NumericLiteralParser::GetFixedPointValue(llvm::APInt &StoreVal, unsigned Scale) {
  assert(radix == 16 || radix == 10);

  // Find how many digits are needed to store the whole literal. This counts
  // every character between DigitsBegin and SuffixBegin (apart from the
  // period), so it is an upper bound when an exponent is present.
  unsigned NumDigits = SuffixBegin - DigitsBegin;
  if (saw_period) --NumDigits;

  // In a hexadecimal literal 'e' and 'E' are ordinary significand digits and
  // only 'p'/'P' introduces the exponent. Treating them as exponent markers
  // would cut the significand short (e.g. at the 'e' in 0x1e.8p0), so for
  // radix 16 a marker only counts if it is not itself a hex digit.
  const bool IsHex = radix == 16;
  auto StartsExponent = [IsHex](char C) {
    return IsExponentPart(C) && !(IsHex && isHexDigit(C));
  };

  // Initial scan of the exponent if it exists
  bool ExpOverflowOccurred = false;
  bool NegativeExponent = false;
  const char *ExponentBegin = nullptr;
  uint64_t Exponent = 0;
  int64_t BaseShift = 0;
  if (saw_exponent) {
    const char *Ptr = DigitsBegin;

    while (!StartsExponent(*Ptr)) ++Ptr;
    ExponentBegin = Ptr;
    ++Ptr;
    NegativeExponent = *Ptr == '-';
    if (NegativeExponent) ++Ptr;

    // The exponent itself is always written in decimal. If it has too many
    // characters to be sure it fits in 64 bits, give up on it and report
    // overflow.
    unsigned NumExpDigits = SuffixBegin - Ptr;
    if (alwaysFitsInto64Bits(radix, NumExpDigits)) {
      llvm::StringRef ExpStr(Ptr, NumExpDigits);
      llvm::APInt ExpInt(/*numBits=*/64, ExpStr, /*radix=*/10);
      Exponent = ExpInt.getZExtValue();
    } else {
      ExpOverflowOccurred = true;
    }

    if (NegativeExponent) BaseShift -= Exponent;
    else BaseShift += Exponent;
  }

  // Number of bits needed for decimal literal is
  //   ceil(NumDigits * log2(10))       Integral part
  // + Scale                            Fractional part
  // + ceil(Exponent * log2(10))        Exponent
  // --------------------------------------------------
  //   ceil((NumDigits + Exponent) * log2(10)) + Scale
  //
  // But for simplicity in handling integers, we can round up log2(10) to 4,
  // making:
  // 4 * (NumDigits + Exponent) + Scale
  //
  // Number of bits needed for hexadecimal literal is
  //   4 * NumDigits                    Integral part
  // + Scale                            Fractional part
  // + Exponent                         Exponent
  // --------------------------------------------------
  //   (4 * NumDigits) + Scale + Exponent
  uint64_t NumBitsNeeded;
  if (radix == 10)
    NumBitsNeeded = 4 * (NumDigits + Exponent) + Scale;
  else
    NumBitsNeeded = 4 * NumDigits + Exponent + Scale;

  if (NumBitsNeeded > std::numeric_limits<unsigned>::max())
    ExpOverflowOccurred = true;
  llvm::APInt Val(static_cast<unsigned>(NumBitsNeeded), 0, /*isSigned=*/false);

  bool FoundDecimal = false;

  // Read the significand as one integer, counting the digits that follow the
  // radix point so the value can be scaled back down afterwards.
  int64_t FractBaseShift = 0;
  const char *End = saw_exponent ? ExponentBegin : SuffixBegin;
  for (const char *Ptr = DigitsBegin; Ptr < End; ++Ptr) {
    if (*Ptr == '.') {
      FoundDecimal = true;
      continue;
    }

    // Normal reading of an integer
    unsigned C = llvm::hexDigitValue(*Ptr);
    assert(C < radix && "NumericLiteralParser ctor should have rejected this");

    Val *= radix;
    Val += C;

    if (FoundDecimal)
      // Keep track of how much we will need to adjust this value by from the
      // number of digits past the radix point.
      --FractBaseShift;
  }

  // For a radix of 16, we will be multiplying by 2 instead of 16.
  if (radix == 16) FractBaseShift *= 4;
  BaseShift += FractBaseShift;

  Val <<= Scale;

  // Apply the combined exponent / fraction adjustment one factor of Base at a
  // time; the downward loop stops early once the value has reached zero.
  uint64_t Base = (radix == 16) ? 2 : 10;
  if (BaseShift > 0) {
    for (int64_t i = 0; i < BaseShift; ++i) {
      Val *= Base;
    }
  } else if (BaseShift < 0) {
    for (int64_t i = BaseShift; i < 0 && !Val.isNullValue(); ++i)
      Val = Val.udiv(Base);
  }

  // Resize the result to StoreVal's width, recording whether bits were lost.
  bool IntOverflowOccurred = false;
  auto MaxVal = llvm::APInt::getMaxValue(StoreVal.getBitWidth());
  if (Val.getBitWidth() > StoreVal.getBitWidth()) {
    IntOverflowOccurred |= Val.ugt(MaxVal.zext(Val.getBitWidth()));
    StoreVal = Val.trunc(StoreVal.getBitWidth());
  } else if (Val.getBitWidth() < StoreVal.getBitWidth()) {
    IntOverflowOccurred |= Val.zext(MaxVal.getBitWidth()).ugt(MaxVal);
    StoreVal = Val.zext(StoreVal.getBitWidth());
  } else {
    StoreVal = Val;
  }

  return IntOverflowOccurred || ExpOverflowOccurred;
}
1217
1218
/// \verbatim
1219
///       user-defined-character-literal: [C++11 lex.ext]
1220
///         character-literal ud-suffix
1221
///       ud-suffix:
1222
///         identifier
1223
///       character-literal: [C++11 lex.ccon]
1224
///         ' c-char-sequence '
1225
///         u' c-char-sequence '
1226
///         U' c-char-sequence '
1227
///         L' c-char-sequence '
1228
///         u8' c-char-sequence ' [C++1z lex.ccon]
1229
///       c-char-sequence:
1230
///         c-char
1231
///         c-char-sequence c-char
1232
///       c-char:
1233
///         any member of the source character set except the single-quote ',
1234
///           backslash \, or new-line character
1235
///         escape-sequence
1236
///         universal-character-name
1237
///       escape-sequence:
1238
///         simple-escape-sequence
1239
///         octal-escape-sequence
1240
///         hexadecimal-escape-sequence
1241
///       simple-escape-sequence:
1242
///         one of \' \" \? \\ \a \b \f \n \r \t \v
1243
///       octal-escape-sequence:
1244
///         \ octal-digit
1245
///         \ octal-digit octal-digit
1246
///         \ octal-digit octal-digit octal-digit
1247
///       hexadecimal-escape-sequence:
1248
///         \x hexadecimal-digit
1249
///         hexadecimal-escape-sequence hexadecimal-digit
1250
///       universal-character-name: [C++11 lex.charset]
1251
///         \u hex-quad
1252
///         \U hex-quad hex-quad
1253
///       hex-quad:
1254
///         hex-digit hex-digit hex-digit hex-digit
1255
/// \endverbatim
1256
///
1257
CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
                                     SourceLocation Loc, Preprocessor &PP,
                                     tok::TokenKind kind) {
  // Parses the spelling [begin, end) of a character-literal token of the given
  // kind, reporting diagnostics at Loc through PP. Afterwards Value holds the
  // literal's value, IsMultiChar tells whether it contained more than one
  // character, and HadError is set if an error was diagnosed.

  // At this point we know that the character matches the regex "(L|u|U)?'.*'".
  HadError = false;

  Kind = kind;

  const char *TokBegin = begin;

  // Skip over wide character determinant. Every prefix is at least one
  // character long; the u8 prefix has a second one.
  if (Kind != tok::char_constant)
    ++begin;
  if (Kind == tok::utf8_char_constant)
    ++begin;

  // Skip over the entry quote.
  assert(begin[0] == '\'' && "Invalid token lexed");
  ++begin;

  // Remove an optional ud-suffix: walk back to the closing quote and remember
  // the suffix text and its offset within the token.
  if (end[-1] != '\'') {
    const char *UDSuffixEnd = end;
    do {
      --end;
    } while (end[-1] != '\'');
    // FIXME: Don't bother with this if !tok.hasUCN().
    expandUCNs(UDSuffixBuf, StringRef(end, UDSuffixEnd - end));
    UDSuffixOffset = end - TokBegin;
  }

  // Trim the ending quote.
  assert(end != begin && "Invalid token lexed");
  --end;

  // FIXME: The "Value" is an uint64_t so we can handle char literals of
  // up to 64-bits.
  // FIXME: This extensively assumes that 'char' is 8-bits.
  assert(PP.getTargetInfo().getCharWidth() == 8 &&
         "Assumes char is 8 bits");
  assert(PP.getTargetInfo().getIntWidth() <= 64 &&
         (PP.getTargetInfo().getIntWidth() & 7) == 0 &&
         "Assumes sizeof(int) on target is <= 64 and a multiple of char");
  assert(PP.getTargetInfo().getWCharWidth() <= 64 &&
         "Assumes sizeof(wchar) on target is <= 64");

  // One output slot per source byte is always enough. data() is used rather
  // than &front() so that an empty body never touches a nonexistent element.
  SmallVector<uint32_t, 4> codepoint_buffer;
  codepoint_buffer.resize(end - begin);
  uint32_t *const buffer_start = codepoint_buffer.data();
  uint32_t *buffer_begin = buffer_start;
  uint32_t *buffer_end = buffer_begin + codepoint_buffer.size();

  // Unicode escapes representing characters that cannot be correctly
  // represented in a single code unit are disallowed in character literals
  // by this implementation.
  uint32_t largest_character_for_kind;
  if (tok::wide_char_constant == Kind) {
    // Code points are stored as uint32_t, so any wchar_t of 32 bits or more
    // allows the full 32-bit range. Shifting by (32 - width) is only done for
    // narrower widths, where the shift count is well defined.
    const unsigned WCharWidth = PP.getTargetInfo().getWCharWidth();
    largest_character_for_kind =
        WCharWidth >= 32 ? 0xFFFFFFFFu : (0xFFFFFFFFu >> (32 - WCharWidth));
  } else if (tok::utf8_char_constant == Kind) {
    largest_character_for_kind = 0x7F;
  } else if (tok::utf16_char_constant == Kind) {
    largest_character_for_kind = 0xFFFF;
  } else if (tok::utf32_char_constant == Kind) {
    largest_character_for_kind = 0x10FFFF;
  } else {
    largest_character_for_kind = 0x7Fu;
  }

  while (begin != end) {
    // Is this a span of non-escape characters?
    if (begin[0] != '\\') {
      char const *start = begin;
      do {
        ++begin;
      } while (begin != end && *begin != '\\');

      // Remember where this span starts on both sides so that a failed
      // conversion can be redone byte by byte.
      char const *tmp_in_start = start;
      uint32_t *tmp_out_start = buffer_begin;
      llvm::ConversionResult res =
          llvm::ConvertUTF8toUTF32(reinterpret_cast<llvm::UTF8 const **>(&start),
                             reinterpret_cast<llvm::UTF8 const *>(begin),
                             &buffer_begin, buffer_end, llvm::strictConversion);
      if (res != llvm::conversionOK) {
        // If we see bad encoding for unprefixed character literals, warn and
        // simply copy the byte values, for compatibility with gcc and
        // older versions of clang.
        bool NoErrorOnBadEncoding = isAscii();
        unsigned Msg = diag::err_bad_character_encoding;
        if (NoErrorOnBadEncoding)
          Msg = diag::warn_bad_character_encoding;
        PP.Diag(Loc, Msg);
        if (NoErrorOnBadEncoding) {
          start = tmp_in_start;
          buffer_begin = tmp_out_start;
          for (; start != begin; ++start, ++buffer_begin)
            *buffer_begin = static_cast<uint8_t>(*start);
        } else {
          HadError = true;
        }
      } else {
        // Every decoded code point must fit in this kind of literal.
        for (; tmp_out_start < buffer_begin; ++tmp_out_start) {
          if (*tmp_out_start > largest_character_for_kind) {
            HadError = true;
            PP.Diag(Loc, diag::err_character_too_large);
          }
        }
      }

      continue;
    }
    // Is this a Universal Character Name escape?
    if (begin[1] == 'u' || begin[1] == 'U') {
      unsigned short UcnLen = 0;
      if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen,
                            FullSourceLoc(Loc, PP.getSourceManager()),
                            &PP.getDiagnostics(), PP.getLangOpts(), true)) {
        HadError = true;
      } else if (*buffer_begin > largest_character_for_kind) {
        HadError = true;
        PP.Diag(Loc, diag::err_character_too_large);
      }

      ++buffer_begin;
      continue;
    }
    // Any other escape sequence.
    unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
    uint64_t result =
      ProcessCharEscape(TokBegin, begin, end, HadError,
                        FullSourceLoc(Loc,PP.getSourceManager()),
                        CharWidth, &PP.getDiagnostics(), PP.getLangOpts());
    *buffer_begin++ = result;
  }

  unsigned NumCharsSoFar = buffer_begin - buffer_start;

  // More than one character: a warning for unprefixed literals, an error for
  // every other kind.
  if (NumCharsSoFar > 1) {
    if (isAscii() && NumCharsSoFar == 4)
      PP.Diag(Loc, diag::warn_four_char_character_literal);
    else if (isAscii())
      PP.Diag(Loc, diag::warn_multichar_character_literal);
    else {
      PP.Diag(Loc, diag::err_multichar_character_literal) << (isWide() ? 0 : 1);
      HadError = true;
    }
    IsMultiChar = true;
  } else {
    IsMultiChar = false;
  }

  llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), 0);

  // Narrow character literals act as though their value is concatenated
  // in this implementation, but warn on overflow.
  bool multi_char_too_long = false;
  if (isAscii() && isMultiChar()) {
    LitVal = 0;
    for (size_t i = 0; i < NumCharsSoFar; ++i) {
      // check for enough leading zeros to shift into
      multi_char_too_long |= (LitVal.countLeadingZeros() < 8);
      LitVal <<= 8;
      LitVal = LitVal + (codepoint_buffer[i] & 0xFF);
    }
  } else if (NumCharsSoFar > 0) {
    // otherwise just take the last character
    LitVal = buffer_begin[-1];
  }

  if (!HadError && multi_char_too_long) {
    PP.Diag(Loc, diag::warn_char_constant_too_large);
  }

  // Transfer the value from APInt to uint64_t
  Value = LitVal.getZExtValue();

  // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")
  // if 'char' is signed for this target (C99 6.4.4.4p10).  Note that multiple
  // character constants are not sign extended in this implementation:
  // '\xFF\xFF' = 65535 and '\x0\xFF' = 255, which matches GCC.
  if (isAscii() && NumCharsSoFar == 1 && (Value & 128) &&
      PP.getLangOpts().CharIsSigned)
    Value = static_cast<signed char>(Value);
}
1439
1440
/// \verbatim
1441
///       string-literal: [C++0x lex.string]
1442
///         encoding-prefix " [s-char-sequence] "
1443
///         encoding-prefix R raw-string
1444
///       encoding-prefix:
1445
///         u8
1446
///         u
1447
///         U
1448
///         L
1449
///       s-char-sequence:
1450
///         s-char
1451
///         s-char-sequence s-char
1452
///       s-char:
1453
///         any member of the source character set except the double-quote ",
1454
///           backslash \, or new-line character
1455
///         escape-sequence
1456
///         universal-character-name
1457
///       raw-string:
1458
///         " d-char-sequence ( r-char-sequence ) d-char-sequence "
1459
///       r-char-sequence:
1460
///         r-char
1461
///         r-char-sequence r-char
1462
///       r-char:
1463
///         any member of the source character set, except a right parenthesis )
1464
///           followed by the initial d-char-sequence (which may be empty)
1465
///           followed by a double quote ".
1466
///       d-char-sequence:
1467
///         d-char
1468
///         d-char-sequence d-char
1469
///       d-char:
1470
///         any member of the basic source character set except:
1471
///           space, the left parenthesis (, the right parenthesis ),
1472
///           the backslash \, and the control characters representing horizontal
1473
///           tab, vertical tab, form feed, and newline.
1474
///       escape-sequence: [C++0x lex.ccon]
1475
///         simple-escape-sequence
1476
///         octal-escape-sequence
1477
///         hexadecimal-escape-sequence
1478
///       simple-escape-sequence:
1479
///         one of \' \" \? \\ \a \b \f \n \r \t \v
1480
///       octal-escape-sequence:
1481
///         \ octal-digit
1482
///         \ octal-digit octal-digit
1483
///         \ octal-digit octal-digit octal-digit
1484
///       hexadecimal-escape-sequence:
1485
///         \x hexadecimal-digit
1486
///         hexadecimal-escape-sequence hexadecimal-digit
1487
///       universal-character-name:
1488
///         \u hex-quad
1489
///         \U hex-quad hex-quad
1490
///       hex-quad:
1491
///         hex-digit hex-digit hex-digit hex-digit
1492
/// \endverbatim
1493
///
1494
StringLiteralParser::StringLiteralParser(ArrayRef<Token> StringToks,
                                         Preprocessor &PP, bool Complain)
    : SM(PP.getSourceManager()),
      Features(PP.getLangOpts()),
      Target(PP.getTargetInfo()),
      // With Complain unset there is no diagnostics engine to report to.
      Diags(Complain ? &PP.getDiagnostics() : nullptr),
      MaxTokenLength(0),
      SizeBound(0),
      CharByteWidth(0),
      Kind(tok::unknown),
      ResultPtr(ResultBuf.data()),
      hadError(false),
      Pascal(false) {
  // All of the actual parsing is done by init().
  init(StringToks);
}
1503
1504
5.60M
/// Parse and concatenate the string-literal tokens in \p StringToks.
///
/// The work is done in two passes:
///  1. Walk the tokens to compute MaxTokenLength, an upper bound on the
///     result size (SizeBound) and the resulting literal Kind, diagnosing
///     unsupported concatenations of differently-prefixed literals.
///  2. Spell each token, strip its ud-suffix, prefix and quotes, and append
///     its decoded contents to ResultBuf through ResultPtr.
///
/// On any failure hadError is set.  Tokens whose spelling does not look like
/// a string literal (for example because the file changed after a PCH was
/// built) are reported through DiagnoseLexingError() and parsing stops.
///
/// \param StringToks the adjacent string-literal tokens to concatenate.
void StringLiteralParser::init(ArrayRef<Token> StringToks){
  // The literal token may have come from an invalid source location (e.g. due
  // to a PCH error), in which case the token length will be 0.
  if (StringToks.empty() || StringToks[0].getLength() < 2)
    return DiagnoseLexingError(SourceLocation());

  // Scan all of the string portions, remember the max individual token length,
  // computing a bound on the concatenated string length, and see whether any
  // piece is a wide-string.  If any of the string portions is a wide-string
  // literal, the result is a wide-string literal [C99 6.4.5p4].
  assert(!StringToks.empty() && "expected at least one token");
  MaxTokenLength = StringToks[0].getLength();
  assert(StringToks[0].getLength() >= 2 && "literal token is invalid!");
  SizeBound = StringToks[0].getLength()-2;  // -2 for "".
  Kind = StringToks[0].getKind();

  hadError = false;

  // Implement Translation Phase #6: concatenation of string literals
  // (C99 5.1.1.2p1).  The common case is only one string fragment.
  for (unsigned i = 1; i != StringToks.size(); ++i) {
    if (StringToks[i].getLength() < 2)
      return DiagnoseLexingError(StringToks[i].getLocation());

    // The string could be shorter than this if it needs cleaning, but this is a
    // reasonable bound, which is all we need.
    assert(StringToks[i].getLength() >= 2 && "literal token is invalid!");
    SizeBound += StringToks[i].getLength()-2;  // -2 for "".

    // Remember maximum string piece length.
    if (StringToks[i].getLength() > MaxTokenLength)
      MaxTokenLength = StringToks[i].getLength();

    // Remember if we see any wide or utf-8/16/32 strings.
    // Also check for illegal concatenations.
    if (StringToks[i].isNot(Kind) && StringToks[i].isNot(tok::string_literal)) {
      if (isAscii()) {
        // An unprefixed literal so far: adopt the prefixed token's kind.
        Kind = StringToks[i].getKind();
      } else {
        // Two different prefixed kinds cannot be concatenated.
        if (Diags)
          Diags->Report(StringToks[i].getLocation(),
                        diag::err_unsupported_string_concat);
        hadError = true;
      }
    }
  }

  // Include space for the null terminator.
  ++SizeBound;

  // TODO: K&R warning: "traditional C rejects string constant concatenation"

  // Get the width in bytes of char/wchar_t/char16_t/char32_t
  CharByteWidth = getCharWidth(Kind, Target);
  assert((CharByteWidth & 7) == 0 && "Assumes character size is byte multiple");
  CharByteWidth /= 8;

  // The output buffer size needs to be large enough to hold wide characters.
  // This is a worst-case assumption which basically corresponds to L"" "long".
  SizeBound *= CharByteWidth;

  // Size the temporary buffer to hold the result string data.
  ResultBuf.resize(SizeBound);

  // Likewise, but for each string piece.
  SmallString<512> TokenBuf;
  TokenBuf.resize(MaxTokenLength);

  // Loop over all the strings, getting their spelling, and expanding them to
  // wide strings as appropriate.
  ResultPtr = &ResultBuf[0];   // Next byte to fill in.

  Pascal = false;

  // Location of the first token that carried a ud-suffix; used when
  // diagnosing a later token with a different suffix.
  SourceLocation UDSuffixTokLoc;

  for (unsigned i = 0, e = StringToks.size(); i != e; ++i) {
    const char *ThisTokBuf = &TokenBuf[0];
    // Get the spelling of the token, which eliminates trigraphs, etc.  We know
    // that ThisTokBuf points to a buffer that is big enough for the whole token
    // and 'spelled' tokens can only shrink.
    bool StringInvalid = false;
    unsigned ThisTokLen =
      Lexer::getSpelling(StringToks[i], ThisTokBuf, SM, Features,
                         &StringInvalid);
    if (StringInvalid)
      return DiagnoseLexingError(StringToks[i].getLocation());

    // A string literal is spelled with at least its two quotes.  Anything
    // shorter cannot be one (the file may have changed after a PCH was
    // built); fail gracefully instead of indexing before the buffer below.
    if (ThisTokLen < 2)
      return DiagnoseLexingError(StringToks[i].getLocation());

    const char *ThisTokBegin = ThisTokBuf;
    const char *ThisTokEnd = ThisTokBuf+ThisTokLen;

    // Remove an optional ud-suffix.
    if (ThisTokEnd[-1] != '"') {
      const char *UDSuffixEnd = ThisTokEnd;
      // Walk back to just past the closing quote, never past the start of
      // the spelling.
      do {
        --ThisTokEnd;
      } while (ThisTokEnd != ThisTokBegin && ThisTokEnd[-1] != '"');

      // The spelling contains no quote at all: this is not a string literal.
      if (ThisTokEnd == ThisTokBegin)
        return DiagnoseLexingError(StringToks[i].getLocation());

      StringRef UDSuffix(ThisTokEnd, UDSuffixEnd - ThisTokEnd);

      if (UDSuffixBuf.empty()) {
        // First ud-suffix seen: record it and where it came from.
        if (StringToks[i].hasUCN())
          expandUCNs(UDSuffixBuf, UDSuffix);
        else
          UDSuffixBuf.assign(UDSuffix);
        UDSuffixToken = i;
        UDSuffixOffset = ThisTokEnd - ThisTokBuf;
        UDSuffixTokLoc = StringToks[i].getLocation();
      } else {
        SmallString<32> ExpandedUDSuffix;
        if (StringToks[i].hasUCN()) {
          expandUCNs(ExpandedUDSuffix, UDSuffix);
          UDSuffix = ExpandedUDSuffix;
        }

        // C++11 [lex.ext]p8: At the end of phase 6, if a string literal is the
        // result of a concatenation involving at least one user-defined-string-
        // literal, all the participating user-defined-string-literals shall
        // have the same ud-suffix.
        if (UDSuffixBuf != UDSuffix) {
          if (Diags) {
            SourceLocation TokLoc = StringToks[i].getLocation();
            Diags->Report(TokLoc, diag::err_string_concat_mixed_suffix)
              << UDSuffixBuf << UDSuffix
              << SourceRange(UDSuffixTokLoc, UDSuffixTokLoc)
              << SourceRange(TokLoc, TokLoc);
          }
          hadError = true;
        }
      }
    }

    // Strip the end quote.
    --ThisTokEnd;

    // TODO: Input character set mapping support.

    // Skip marker for wide or unicode strings.
    if (ThisTokBuf[0] == 'L' || ThisTokBuf[0] == 'u' || ThisTokBuf[0] == 'U') {
      ++ThisTokBuf;
      // Skip 8 of u8 marker for utf8 strings.
      if (ThisTokBuf[0] == '8')
        ++ThisTokBuf;
    }

    // Check for raw string
    if (ThisTokBuf[0] == 'R') {
      if (ThisTokBuf[1] != '"') {
        // The file may have come from PCH and then changed after loading the
        // PCH; Fail gracefully.
        return DiagnoseLexingError(StringToks[i].getLocation());
      }
      ThisTokBuf += 2; // skip R"

      // C++11 [lex.string]p2: A `d-char-sequence` shall consist of at most 16
      // characters.
      constexpr unsigned MaxRawStrDelimLen = 16;

      const char *Prefix = ThisTokBuf;
      while (static_cast<unsigned>(ThisTokBuf - Prefix) < MaxRawStrDelimLen &&
             ThisTokBuf[0] != '(')
        ++ThisTokBuf;
      if (ThisTokBuf[0] != '(')
        return DiagnoseLexingError(StringToks[i].getLocation());
      ++ThisTokBuf; // skip '('

      // Remove same number of characters from the end
      ThisTokEnd -= ThisTokBuf - Prefix;
      if (ThisTokEnd < ThisTokBuf)
        return DiagnoseLexingError(StringToks[i].getLocation());

      // C++14 [lex.string]p4: A source-file new-line in a raw string literal
      // results in a new-line in the resulting execution string-literal.
      StringRef RemainingTokenSpan(ThisTokBuf, ThisTokEnd - ThisTokBuf);
      while (!RemainingTokenSpan.empty()) {
        // Split the string literal on \r\n boundaries.
        size_t CRLFPos = RemainingTokenSpan.find("\r\n");
        StringRef BeforeCRLF = RemainingTokenSpan.substr(0, CRLFPos);
        StringRef AfterCRLF = RemainingTokenSpan.substr(CRLFPos);

        // Copy everything before the \r\n sequence into the string literal.
        if (CopyStringFragment(StringToks[i], ThisTokBegin, BeforeCRLF))
          hadError = true;

        // Point into the \n inside the \r\n sequence and operate on the
        // remaining portion of the literal.
        RemainingTokenSpan = AfterCRLF.substr(1);
      }
    } else {
      if (ThisTokBuf[0] != '"') {
        // The file may have come from PCH and then changed after loading the
        // PCH; Fail gracefully.
        return DiagnoseLexingError(StringToks[i].getLocation());
      }
      ++ThisTokBuf; // skip "

      // The opening and closing quotes must be two distinct characters.  If
      // the spelling held only one quote the cursor is now past the end;
      // bail out (as the raw-string path does) rather than run off the buffer.
      if (ThisTokEnd < ThisTokBuf)
        return DiagnoseLexingError(StringToks[i].getLocation());

      // Check if this is a pascal string
      if (Features.PascalStrings && ThisTokBuf + 1 != ThisTokEnd &&
          ThisTokBuf[0] == '\\' && ThisTokBuf[1] == 'p') {

        // If the \p sequence is found in the first token, we have a pascal string
        // Otherwise, if we already have a pascal string, ignore the first \p
        if (i == 0) {
          // Skip only the backslash: the 'p' is copied into the result and
          // later overwritten with the length.
          ++ThisTokBuf;
          Pascal = true;
        } else if (Pascal)
          ThisTokBuf += 2;
      }

      while (ThisTokBuf != ThisTokEnd) {
        // Is this a span of non-escape characters?
        if (ThisTokBuf[0] != '\\') {
          const char *InStart = ThisTokBuf;
          do {
            ++ThisTokBuf;
          } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');

          // Copy the character span over.
          if (CopyStringFragment(StringToks[i], ThisTokBegin,
                                 StringRef(InStart, ThisTokBuf - InStart)))
            hadError = true;
          continue;
        }
        // Is this a Universal Character Name escape?
        if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') {
          EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
                          ResultPtr, hadError,
                          FullSourceLoc(StringToks[i].getLocation(), SM),
                          CharByteWidth, Diags, Features);
          continue;
        }
        // Otherwise, this is a non-UCN escape character.  Process it.
        unsigned ResultChar =
          ProcessCharEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError,
                            FullSourceLoc(StringToks[i].getLocation(), SM),
                            CharByteWidth*8, Diags, Features);

        // Store the escaped character at the literal's character width.
        if (CharByteWidth == 4) {
          // FIXME: Make the type of the result buffer correct instead of
          // using reinterpret_cast.
          llvm::UTF32 *ResultWidePtr = reinterpret_cast<llvm::UTF32*>(ResultPtr);
          *ResultWidePtr = ResultChar;
          ResultPtr += 4;
        } else if (CharByteWidth == 2) {
          // FIXME: Make the type of the result buffer correct instead of
          // using reinterpret_cast.
          llvm::UTF16 *ResultWidePtr = reinterpret_cast<llvm::UTF16*>(ResultPtr);
          *ResultWidePtr = ResultChar & 0xFFFF;
          ResultPtr += 2;
        } else {
          assert(CharByteWidth == 1 && "Unexpected char width");
          *ResultPtr++ = ResultChar & 0xFF;
        }
      }
    }
  }

  if (Pascal) {
    // Overwrite the leading placeholder character with the string length.
    if (CharByteWidth == 4) {
      // FIXME: Make the type of the result buffer correct instead of
      // using reinterpret_cast.
      llvm::UTF32 *ResultWidePtr = reinterpret_cast<llvm::UTF32*>(ResultBuf.data());
      ResultWidePtr[0] = GetNumStringChars() - 1;
    } else if (CharByteWidth == 2) {
      // FIXME: Make the type of the result buffer correct instead of
      // using reinterpret_cast.
      llvm::UTF16 *ResultWidePtr = reinterpret_cast<llvm::UTF16*>(ResultBuf.data());
      ResultWidePtr[0] = GetNumStringChars() - 1;
    } else {
      assert(CharByteWidth == 1 && "Unexpected char width");
      ResultBuf[0] = GetNumStringChars() - 1;
    }

    // Verify that pascal strings aren't too large.
    if (GetStringLength() > 256) {
      if (Diags)
        Diags->Report(StringToks.front().getLocation(),
                      diag::err_pascal_string_too_long)
          << SourceRange(StringToks.front().getLocation(),
                         StringToks.back().getLocation());
      hadError = true;
      return;
    }
  } else if (Diags) {
    // Complain if this string literal has too many characters.
    unsigned MaxChars = Features.CPlusPlus? 65536 : Features.C99 ? 4095 : 509;

    if (GetNumStringChars() > MaxChars)
      Diags->Report(StringToks.front().getLocation(),
                    diag::ext_string_too_long)
        << GetNumStringChars() << MaxChars
        << (Features.CPlusPlus ? 2 : Features.C99 ? 1 : 0)
        << SourceRange(StringToks.front().getLocation(),
                       StringToks.back().getLocation());
  }
}
1800
1801
212
static const char *resyncUTF8(const char *Err, const char *End) {
1802
212
  if (Err == End)
1803
0
    return End;
1804
212
  End = Err + std::min<unsigned>(llvm::getNumBytesForUTF8(*Err), End-Err);
1805
239
  while (++Err != End && 
(*Err & 0xC0) == 0x80186
)
1806
27
    ;
1807
212
  return Err;
1808
212
}
1809
1810
/// This function copies from Fragment, which is a sequence of bytes
1811
/// within Tok's contents (which begin at TokBegin) into ResultPtr.
1812
/// Performs widening for multi-byte characters.
1813
bool StringLiteralParser::CopyStringFragment(const Token &Tok,
1814
                                             const char *TokBegin,
1815
5.70M
                                             StringRef Fragment) {
1816
5.70M
  const llvm::UTF8 *ErrorPtrTmp;
1817
5.70M
  if (ConvertUTF8toWide(CharByteWidth, Fragment, ResultPtr, ErrorPtrTmp))
1818
5.70M
    return false;
1819
1820
  // If we see bad encoding for unprefixed string literals, warn and
1821
  // simply copy the byte values, for compatibility with gcc and older
1822
  // versions of clang.
1823
37
  bool NoErrorOnBadEncoding = isAscii();
1824
37
  if (NoErrorOnBadEncoding) {
1825
20
    memcpy(ResultPtr, Fragment.data(), Fragment.size());
1826
20
    ResultPtr += Fragment.size();
1827
20
  }
1828
1829
37
  if (Diags) {
1830
34
    const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
1831
1832
34
    FullSourceLoc SourceLoc(Tok.getLocation(), SM);
1833
34
    const DiagnosticBuilder &Builder =
1834
34
      Diag(Diags, Features, SourceLoc, TokBegin,
1835
34
           ErrorPtr, resyncUTF8(ErrorPtr, Fragment.end()),
1836
34
           NoErrorOnBadEncoding ? 
diag::warn_bad_string_encoding20
1837
34
                                : 
diag::err_bad_string_encoding14
);
1838
1839
34
    const char *NextStart = resyncUTF8(ErrorPtr, Fragment.end());
1840
34
    StringRef NextFragment(NextStart, Fragment.end()-NextStart);
1841
1842
    // Decode into a dummy buffer.
1843
34
    SmallString<512> Dummy;
1844
34
    Dummy.reserve(Fragment.size() * CharByteWidth);
1845
34
    char *Ptr = Dummy.data();
1846
1847
178
    while (!ConvertUTF8toWide(CharByteWidth, NextFragment, Ptr, ErrorPtrTmp)) {
1848
144
      const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
1849
144
      NextStart = resyncUTF8(ErrorPtr, Fragment.end());
1850
144
      Builder << MakeCharSourceRange(Features, SourceLoc, TokBegin,
1851
144
                                     ErrorPtr, NextStart);
1852
144
      NextFragment = StringRef(NextStart, Fragment.end()-NextStart);
1853
144
    }
1854
34
  }
1855
37
  return !NoErrorOnBadEncoding;
1856
5.70M
}
1857
1858
0
/// Mark the parse as failed and, when diagnostics are enabled, report
/// err_lexing_string at \p Loc.
void StringLiteralParser::DiagnoseLexingError(SourceLocation Loc) {
  hadError = true;

  // Diags is null when the parser was asked not to complain.
  if (!Diags)
    return;

  Diags->Report(Loc, diag::err_lexing_string);
}
1863
1864
/// getOffsetOfStringByte - This function returns the offset of the
1865
/// specified byte of the string data represented by Token.  This handles
1866
/// advancing over escape sequences in the string.
1867
unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
1868
40.7k
                                                    unsigned ByteNo) const {
1869
  // Get the spelling of the token.
1870
40.7k
  SmallString<32> SpellingBuffer;
1871
40.7k
  SpellingBuffer.resize(Tok.getLength());
1872
1873
40.7k
  bool StringInvalid = false;
1874
40.7k
  const char *SpellingPtr = &SpellingBuffer[0];
1875
40.7k
  unsigned TokLen = Lexer::getSpelling(Tok, SpellingPtr, SM, Features,
1876
40.7k
                                       &StringInvalid);
1877
40.7k
  if (StringInvalid)
1878
0
    return 0;
1879
1880
40.7k
  const char *SpellingStart = SpellingPtr;
1881
40.7k
  const char *SpellingEnd = SpellingPtr+TokLen;
1882
1883
  // Handle UTF-8 strings just like narrow strings.
1884
40.7k
  if (SpellingPtr[0] == 'u' && 
SpellingPtr[1] == '8'3
)
1885
3
    SpellingPtr += 2;
1886
1887
40.7k
  assert(SpellingPtr[0] != 'L' && SpellingPtr[0] != 'u' &&
1888
40.7k
         SpellingPtr[0] != 'U' && "Doesn't handle wide or utf strings yet");
1889
1890
  // For raw string literals, this is easy.
1891
40.7k
  if (SpellingPtr[0] == 'R') {
1892
6
    assert(SpellingPtr[1] == '"' && "Should be a raw string literal!");
1893
    // Skip 'R"'.
1894
0
    SpellingPtr += 2;
1895
35
    while (*SpellingPtr != '(') {
1896
29
      ++SpellingPtr;
1897
29
      assert(SpellingPtr < SpellingEnd && "Missing ( for raw string literal");
1898
29
    }
1899
    // Skip '('.
1900
6
    ++SpellingPtr;
1901
6
    return SpellingPtr - SpellingStart + ByteNo;
1902
6
  }
1903
1904
  // Skip over the leading quote
1905
40.7k
  assert(SpellingPtr[0] == '"' && "Should be a string literal!");
1906
0
  ++SpellingPtr;
1907
1908
  // Skip over bytes until we find the offset we're looking for.
1909
727k
  while (ByteNo) {
1910
686k
    assert(SpellingPtr < SpellingEnd && "Didn't find byte offset!");
1911
1912
    // Step over non-escapes simply.
1913
686k
    if (*SpellingPtr != '\\') {
1914
685k
      ++SpellingPtr;
1915
685k
      --ByteNo;
1916
685k
      continue;
1917
685k
    }
1918
1919
    // Otherwise, this is an escape character.  Advance over it.
1920
964
    bool HadError = false;
1921
964
    if (SpellingPtr[1] == 'u' || 
SpellingPtr[1] == 'U'961
) {
1922
6
      const char *EscapePtr = SpellingPtr;
1923
6
      unsigned Len = MeasureUCNEscape(SpellingStart, SpellingPtr, SpellingEnd,
1924
6
                                      1, Features, HadError);
1925
6
      if (Len > ByteNo) {
1926
        // ByteNo is somewhere within the escape sequence.
1927
6
        SpellingPtr = EscapePtr;
1928
6
        break;
1929
6
      }
1930
0
      ByteNo -= Len;
1931
958
    } else {
1932
958
      ProcessCharEscape(SpellingStart, SpellingPtr, SpellingEnd, HadError,
1933
958
                        FullSourceLoc(Tok.getLocation(), SM),
1934
958
                        CharByteWidth*8, Diags, Features);
1935
958
      --ByteNo;
1936
958
    }
1937
958
    assert(!HadError && "This method isn't valid on erroneous strings");
1938
958
  }
1939
1940
40.7k
  return SpellingPtr-SpellingStart;
1941
40.7k
}
1942
1943
/// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved
1944
/// suffixes as ud-suffixes, because the diagnostic experience is better if we
1945
/// treat it as an invalid suffix.
1946
bool StringLiteralParser::isValidUDSuffix(const LangOptions &LangOpts,
1947
546
                                          StringRef Suffix) {
1948
546
  return NumericLiteralParser::isValidUDSuffix(LangOpts, Suffix) ||
1949
546
         
Suffix == "sv"82
;
1950
546
}