Coverage Report

Created: 2020-02-18 08:44

/Users/buildslave/jenkins/workspace/coverage/llvm-project/clang/lib/Format/FormatTokenLexer.cpp
Line
Count
Source (jump to first uncovered line)
1
//===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
///
9
/// \file
10
/// This file implements FormatTokenLexer, which tokenizes a source file
11
/// into a FormatToken stream suitable for ClangFormat.
12
///
13
//===----------------------------------------------------------------------===//
14
15
#include "FormatTokenLexer.h"
16
#include "FormatToken.h"
17
#include "clang/Basic/SourceLocation.h"
18
#include "clang/Basic/SourceManager.h"
19
#include "clang/Format/Format.h"
20
#include "llvm/Support/Regex.h"
21
22
namespace clang {
23
namespace format {
24
25
FormatTokenLexer::FormatTokenLexer(const SourceManager &SourceMgr, FileID ID,
26
                                   unsigned Column, const FormatStyle &Style,
27
                                   encoding::Encoding Encoding)
28
    : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}),
29
      Column(Column), TrailingWhitespace(0), SourceMgr(SourceMgr), ID(ID),
30
      Style(Style), IdentTable(getFormattingLangOpts(Style)),
31
      Keywords(IdentTable), Encoding(Encoding), FirstInLineIndex(0),
32
      FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin),
33
32.7k
      MacroBlockEndRegex(Style.MacroBlockEnd) {
34
32.7k
  Lex.reset(new Lexer(ID, SourceMgr.getBuffer(ID), SourceMgr,
35
32.7k
                      getFormattingLangOpts(Style)));
36
32.7k
  Lex->SetKeepWhitespaceMode(true);
37
32.7k
38
32.7k
  for (const std::string &ForEachMacro : Style.ForEachMacros)
39
98.3k
    Macros.insert({&IdentTable.get(ForEachMacro), TT_ForEachMacro});
40
32.7k
  for (const std::string &StatementMacro : Style.StatementMacros)
41
65.6k
    Macros.insert({&IdentTable.get(StatementMacro), TT_StatementMacro});
42
32.7k
  for (const std::string &TypenameMacro : Style.TypenameMacros)
43
252
    Macros.insert({&IdentTable.get(TypenameMacro), TT_TypenameMacro});
44
32.7k
  for (const std::string &NamespaceMacro : Style.NamespaceMacros)
45
159
    Macros.insert({&IdentTable.get(NamespaceMacro), TT_NamespaceMacro});
46
32.7k
}
47
48
32.7k
ArrayRef<FormatToken *> FormatTokenLexer::lex() {
49
32.7k
  assert(Tokens.empty());
50
32.7k
  assert(FirstInLineIndex == 0);
51
563k
  do {
52
563k
    Tokens.push_back(getNextToken());
53
563k
    if (Style.Language == FormatStyle::LK_JavaScript) {
54
33.5k
      tryParseJSRegexLiteral();
55
33.5k
      handleTemplateStrings();
56
33.5k
    }
57
563k
    if (Style.Language == FormatStyle::LK_TextProto)
58
6.72k
      tryParsePythonComment();
59
563k
    tryMergePreviousTokens();
60
563k
    if (Style.isCSharp())
61
2.36k
      // This needs to come after tokens have been merged so that C#
62
2.36k
      // string literals are correctly identified.
63
2.36k
      handleCSharpVerbatimAndInterpolatedStrings();
64
563k
    if (Tokens.back()->NewlinesBefore > 0 || 
Tokens.back()->IsMultiline507k
)
65
56.4k
      FirstInLineIndex = Tokens.size() - 1;
66
563k
  } while (Tokens.back()->Tok.isNot(tok::eof));
67
32.7k
  return Tokens;
68
32.7k
}
69
70
563k
void FormatTokenLexer::tryMergePreviousTokens() {
71
563k
  if (tryMerge_TMacro())
72
15
    return;
73
563k
  if (tryMergeConflictMarkers())
74
36
    return;
75
563k
  if (tryMergeLessLess())
76
982
    return;
77
562k
78
562k
  if (Style.isCSharp()) {
79
2.36k
    if (tryMergeCSharpAttributeAndTarget())
80
2
      return;
81
2.36k
    if (tryMergeCSharpKeywordVariables())
82
2
      return;
83
2.36k
    if (tryMergeCSharpStringLiteral())
84
37
      return;
85
2.32k
    if (tryMergeCSharpDoubleQuestion())
86
0
      return;
87
2.32k
    if (tryMergeCSharpNullConditionals())
88
20
      return;
89
2.30k
    if (tryTransformCSharpForEach())
90
4
      return;
91
2.30k
    static const tok::TokenKind JSRightArrow[] = {tok::equal, tok::greater};
92
2.30k
    if (tryMergeTokens(JSRightArrow, TT_JsFatArrow))
93
4
      return;
94
562k
  }
95
562k
96
562k
  if (tryMergeNSStringLiteral())
97
222
    return;
98
562k
99
562k
  if (Style.Language == FormatStyle::LK_JavaScript) {
100
33.5k
    static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
101
33.5k
    static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal,
102
33.5k
                                                   tok::equal};
103
33.5k
    static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
104
33.5k
                                                  tok::greaterequal};
105
33.5k
    static const tok::TokenKind JSRightArrow[] = {tok::equal, tok::greater};
106
33.5k
    static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star};
107
33.5k
    static const tok::TokenKind JSExponentiationEqual[] = {tok::star,
108
33.5k
                                                           tok::starequal};
109
33.5k
    static const tok::TokenKind JSNullPropagatingOperator[] = {tok::question,
110
33.5k
                                                               tok::period};
111
33.5k
    static const tok::TokenKind JSNullishOperator[] = {tok::question,
112
33.5k
                                                       tok::question};
113
33.5k
114
33.5k
    // FIXME: Investigate what token type gives the correct operator priority.
115
33.5k
    if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
116
12
      return;
117
33.5k
    if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
118
12
      return;
119
33.5k
    if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
120
10
      return;
121
33.5k
    if (tryMergeTokens(JSRightArrow, TT_JsFatArrow))
122
148
      return;
123
33.3k
    if (tryMergeTokens(JSExponentiation, TT_JsExponentiation))
124
4
      return;
125
33.3k
    if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) {
126
4
      Tokens.back()->Tok.setKind(tok::starequal);
127
4
      return;
128
4
    }
129
33.3k
    if (tryMergeTokens(JSNullishOperator, TT_JsNullishCoalescingOperator)) {
130
16
      // Treat like the "||" operator (as opposed to the ternary ?).
131
16
      Tokens.back()->Tok.setKind(tok::pipepipe);
132
16
      return;
133
16
    }
134
33.3k
    if (tryMergeTokens(JSNullPropagatingOperator,
135
33.3k
                       TT_JsNullPropagatingOperator)) {
136
16
      // Treat like a regular "." access.
137
16
      Tokens.back()->Tok.setKind(tok::period);
138
16
      return;
139
16
    }
140
33.3k
    if (tryMergeJSPrivateIdentifier())
141
32
      return;
142
562k
  }
143
562k
144
562k
  if (Style.Language == FormatStyle::LK_Java) {
145
4.35k
    static const tok::TokenKind JavaRightLogicalShiftAssign[] = {
146
4.35k
        tok::greater, tok::greater, tok::greaterequal};
147
4.35k
    if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator))
148
2
      return;
149
4.35k
  }
150
562k
}
151
152
562k
bool FormatTokenLexer::tryMergeNSStringLiteral() {
153
562k
  if (Tokens.size() < 2)
154
32.7k
    return false;
155
529k
  auto &At = *(Tokens.end() - 2);
156
529k
  auto &String = *(Tokens.end() - 1);
157
529k
  if (!At->is(tok::at) || 
!String->is(tok::string_literal)1.26k
)
158
529k
    return false;
159
222
  At->Tok.setKind(tok::string_literal);
160
222
  At->TokenText = StringRef(At->TokenText.begin(),
161
222
                            String->TokenText.end() - At->TokenText.begin());
162
222
  At->ColumnWidth += String->ColumnWidth;
163
222
  At->Type = TT_ObjCStringLiteral;
164
222
  Tokens.erase(Tokens.end() - 1);
165
222
  return true;
166
222
}
167
168
33.3k
bool FormatTokenLexer::tryMergeJSPrivateIdentifier() {
169
33.3k
  // Merges #idenfier into a single identifier with the text #identifier
170
33.3k
  // but the token tok::identifier.
171
33.3k
  if (Tokens.size() < 2)
172
2.57k
    return false;
173
30.7k
  auto &Hash = *(Tokens.end() - 2);
174
30.7k
  auto &Identifier = *(Tokens.end() - 1);
175
30.7k
  if (!Hash->is(tok::hash) || 
!Identifier->is(tok::identifier)36
)
176
30.7k
    return false;
177
32
  Hash->Tok.setKind(tok::identifier);
178
32
  Hash->TokenText =
179
32
      StringRef(Hash->TokenText.begin(),
180
32
                Identifier->TokenText.end() - Hash->TokenText.begin());
181
32
  Hash->ColumnWidth += Identifier->ColumnWidth;
182
32
  Hash->Type = TT_JsPrivateIdentifier;
183
32
  Tokens.erase(Tokens.end() - 1);
184
32
  return true;
185
32
}
186
187
// Search for verbatim or interpolated string literals @"ABC" or
188
// $"aaaaa{abc}aaaaa" i and mark the token as TT_CSharpStringLiteral, and to
189
// prevent splitting of @, $ and ".
190
// Merging of multiline verbatim strings with embedded '"' is handled in
191
// handleCSharpVerbatimAndInterpolatedStrings with lower-level lexing.
192
2.36k
bool FormatTokenLexer::tryMergeCSharpStringLiteral() {
193
2.36k
  if (Tokens.size() < 2)
194
181
    return false;
195
2.18k
196
2.18k
  // Interpolated strings could contain { } with " characters inside.
197
2.18k
  // $"{x ?? "null"}"
198
2.18k
  // should not be split into $"{x ?? ", null, "}" but should treated as a
199
2.18k
  // single string-literal.
200
2.18k
  //
201
2.18k
  // We opt not to try and format expressions inside {} within a C#
202
2.18k
  // interpolated string. Formatting expressions within an interpolated string
203
2.18k
  // would require similar work as that done for JavaScript template strings
204
2.18k
  // in `handleTemplateStrings()`.
205
2.18k
  auto &CSharpInterpolatedString = *(Tokens.end() - 2);
206
2.18k
  if (CSharpInterpolatedString->Type == TT_CSharpStringLiteral &&
207
2.18k
      
(37
CSharpInterpolatedString->TokenText.startswith(R"($")")37
||
208
37
       
CSharpInterpolatedString->TokenText.startswith(R"($@")")12
)) {
209
31
    int UnmatchedOpeningBraceCount = 0;
210
31
211
31
    auto TokenTextSize = CSharpInterpolatedString->TokenText.size();
212
831
    for (size_t Index = 0; Index < TokenTextSize; 
++Index800
) {
213
800
      char C = CSharpInterpolatedString->TokenText[Index];
214
800
      if (C == '{') {
215
43
        // "{{"  inside an interpolated string is an escaped '{' so skip it.
216
43
        if (Index + 1 < TokenTextSize &&
217
43
            CSharpInterpolatedString->TokenText[Index + 1] == '{') {
218
6
          ++Index;
219
6
          continue;
220
6
        }
221
37
        ++UnmatchedOpeningBraceCount;
222
757
      } else if (C == '}') {
223
37
        // "}}"  inside an interpolated string is an escaped '}' so skip it.
224
37
        if (Index + 1 < TokenTextSize &&
225
37
            CSharpInterpolatedString->TokenText[Index + 1] == '}') {
226
6
          ++Index;
227
6
          continue;
228
6
        }
229
31
        --UnmatchedOpeningBraceCount;
230
31
      }
231
800
    }
232
31
233
31
    if (UnmatchedOpeningBraceCount > 0) {
234
6
      auto &NextToken = *(Tokens.end() - 1);
235
6
      CSharpInterpolatedString->TokenText =
236
6
          StringRef(CSharpInterpolatedString->TokenText.begin(),
237
6
                    NextToken->TokenText.end() -
238
6
                        CSharpInterpolatedString->TokenText.begin());
239
6
      CSharpInterpolatedString->ColumnWidth += NextToken->ColumnWidth;
240
6
      Tokens.erase(Tokens.end() - 1);
241
6
      return true;
242
6
    }
243
2.17k
  }
244
2.17k
245
2.17k
  // Look for @"aaaaaa" or $"aaaaaa".
246
2.17k
  auto &String = *(Tokens.end() - 1);
247
2.17k
  if (!String->is(tok::string_literal))
248
2.11k
    return false;
249
63
250
63
  auto &At = *(Tokens.end() - 2);
251
63
  if (!(At->is(tok::at) || 
At->TokenText == "$"51
))
252
32
    return false;
253
31
254
31
  if (Tokens.size() > 2 && 
At->is(tok::at)29
) {
255
12
    auto &Dollar = *(Tokens.end() - 3);
256
12
    if (Dollar->TokenText == "$") {
257
6
      // This looks like $@"aaaaa" so we need to combine all 3 tokens.
258
6
      Dollar->Tok.setKind(tok::string_literal);
259
6
      Dollar->TokenText =
260
6
          StringRef(Dollar->TokenText.begin(),
261
6
                    String->TokenText.end() - Dollar->TokenText.begin());
262
6
      Dollar->ColumnWidth += (At->ColumnWidth + String->ColumnWidth);
263
6
      Dollar->Type = TT_CSharpStringLiteral;
264
6
      Tokens.erase(Tokens.end() - 2);
265
6
      Tokens.erase(Tokens.end() - 1);
266
6
      return true;
267
6
    }
268
25
  }
269
25
270
25
  // Convert back into just a string_literal.
271
25
  At->Tok.setKind(tok::string_literal);
272
25
  At->TokenText = StringRef(At->TokenText.begin(),
273
25
                            String->TokenText.end() - At->TokenText.begin());
274
25
  At->ColumnWidth += String->ColumnWidth;
275
25
  At->Type = TT_CSharpStringLiteral;
276
25
  Tokens.erase(Tokens.end() - 1);
277
25
  return true;
278
25
}
279
280
// Valid C# attribute targets:
281
// https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/concepts/attributes/#attribute-targets
282
const llvm::StringSet<> FormatTokenLexer::CSharpAttributeTargets = {
283
    "assembly", "module",   "field",  "event", "method",
284
    "param",    "property", "return", "type",
285
};
286
287
2.36k
bool FormatTokenLexer::tryMergeCSharpAttributeAndTarget() {
288
2.36k
  // Treat '[assembly:' and '[field:' as tokens in their own right.
289
2.36k
  if (Tokens.size() < 3)
290
366
    return false;
291
2.00k
292
2.00k
  auto &SquareBracket = *(Tokens.end() - 3);
293
2.00k
  auto &Target = *(Tokens.end() - 2);
294
2.00k
  auto &Colon = *(Tokens.end() - 1);
295
2.00k
296
2.00k
  if (!SquareBracket->Tok.is(tok::l_square))
297
1.92k
    return false;
298
76
299
76
  if (CSharpAttributeTargets.find(Target->TokenText) ==
300
76
      CSharpAttributeTargets.end())
301
74
    return false;
302
2
303
2
  if (!Colon->Tok.is(tok::colon))
304
0
    return false;
305
2
306
2
  SquareBracket->TokenText =
307
2
      StringRef(SquareBracket->TokenText.begin(),
308
2
                Colon->TokenText.end() - SquareBracket->TokenText.begin());
309
2
  SquareBracket->ColumnWidth += (Target->ColumnWidth + Colon->ColumnWidth);
310
2
  Tokens.erase(Tokens.end() - 2);
311
2
  Tokens.erase(Tokens.end() - 1);
312
2
  return true;
313
2
}
314
315
2.32k
bool FormatTokenLexer::tryMergeCSharpDoubleQuestion() {
316
2.32k
  if (Tokens.size() < 2)
317
181
    return false;
318
2.14k
  auto &FirstQuestion = *(Tokens.end() - 2);
319
2.14k
  auto &SecondQuestion = *(Tokens.end() - 1);
320
2.14k
  if (!FirstQuestion->is(tok::question) || 
!SecondQuestion->is(tok::question)2
)
321
2.14k
    return false;
322
0
  FirstQuestion->Tok.setKind(tok::question);
323
0
  FirstQuestion->TokenText = StringRef(FirstQuestion->TokenText.begin(),
324
0
                                       SecondQuestion->TokenText.end() -
325
0
                                           FirstQuestion->TokenText.begin());
326
0
  FirstQuestion->ColumnWidth += SecondQuestion->ColumnWidth;
327
0
  FirstQuestion->Type = TT_CSharpNullCoalescing;
328
0
  Tokens.erase(Tokens.end() - 1);
329
0
  return true;
330
0
}
331
332
2.36k
bool FormatTokenLexer::tryMergeCSharpKeywordVariables() {
333
2.36k
  if (Tokens.size() < 2)
334
181
    return false;
335
2.18k
  auto &At = *(Tokens.end() - 2);
336
2.18k
  auto &Keyword = *(Tokens.end() - 1);
337
2.18k
  if (!At->is(tok::at))
338
2.16k
    return false;
339
16
  if (!Keywords.isCSharpKeyword(*Keyword))
340
14
    return false;
341
2
342
2
  At->Tok.setKind(tok::identifier);
343
2
  At->TokenText = StringRef(At->TokenText.begin(),
344
2
                            Keyword->TokenText.end() - At->TokenText.begin());
345
2
  At->ColumnWidth += Keyword->ColumnWidth;
346
2
  At->Type = Keyword->Type;
347
2
  Tokens.erase(Tokens.end() - 1);
348
2
  return true;
349
2
}
350
351
// In C# merge the Identifier and the ? together e.g. arg?.
352
2.32k
bool FormatTokenLexer::tryMergeCSharpNullConditionals() {
353
2.32k
  if (Tokens.size() < 2)
354
181
    return false;
355
2.14k
  auto &Identifier = *(Tokens.end() - 2);
356
2.14k
  auto &Question = *(Tokens.end() - 1);
357
2.14k
  if (!Identifier->isOneOf(tok::r_square, tok::identifier) ||
358
2.14k
      
!Question->is(tok::question)727
)
359
2.12k
    return false;
360
20
  Identifier->TokenText =
361
20
      StringRef(Identifier->TokenText.begin(),
362
20
                Question->TokenText.end() - Identifier->TokenText.begin());
363
20
  Identifier->ColumnWidth += Question->ColumnWidth;
364
20
  Tokens.erase(Tokens.end() - 1);
365
20
  return true;
366
20
}
367
368
// In C# transform identifier foreach into kw_foreach
369
2.30k
bool FormatTokenLexer::tryTransformCSharpForEach() {
370
2.30k
  if (Tokens.size() < 1)
371
0
    return false;
372
2.30k
  auto &Identifier = *(Tokens.end() - 1);
373
2.30k
  if (!Identifier->is(tok::identifier))
374
1.64k
    return false;
375
656
  if (Identifier->TokenText != "foreach")
376
652
    return false;
377
4
378
4
  Identifier->Type = TT_ForEachMacro;
379
4
  Identifier->Tok.setKind(tok::kw_for);
380
4
  return true;
381
4
}
382
383
563k
bool FormatTokenLexer::tryMergeLessLess() {
384
563k
  // Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
385
563k
  if (Tokens.size() < 3)
386
65.3k
    return false;
387
498k
388
498k
  bool FourthTokenIsLess = false;
389
498k
  if (Tokens.size() > 3)
390
466k
    FourthTokenIsLess = (Tokens.end() - 4)[0]->is(tok::less);
391
498k
392
498k
  auto First = Tokens.end() - 3;
393
498k
  if (First[2]->is(tok::less) || 
First[1]->isNot(tok::less)489k
||
394
498k
      
First[0]->isNot(tok::less)8.98k
||
FourthTokenIsLess1.05k
)
395
497k
    return false;
396
982
397
982
  // Only merge if there currently is no whitespace between the two "<".
398
982
  if (First[1]->WhitespaceRange.getBegin() !=
399
982
      First[1]->WhitespaceRange.getEnd())
400
0
    return false;
401
982
402
982
  First[0]->Tok.setKind(tok::lessless);
403
982
  First[0]->TokenText = "<<";
404
982
  First[0]->ColumnWidth += 1;
405
982
  Tokens.erase(Tokens.end() - 2);
406
982
  return true;
407
982
}
408
409
bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,
410
274k
                                      TokenType NewType) {
411
274k
  if (Tokens.size() < Kinds.size())
412
23.8k
    return false;
413
250k
414
250k
  SmallVectorImpl<FormatToken *>::const_iterator First =
415
250k
      Tokens.end() - Kinds.size();
416
250k
  if (!First[0]->is(Kinds[0]))
417
247k
    return false;
418
2.48k
  unsigned AddLength = 0;
419
2.77k
  for (unsigned i = 1; i < Kinds.size(); 
++i290
) {
420
2.54k
    if (!First[i]->is(Kinds[i]) || First[i]->WhitespaceRange.getBegin() !=
421
302
                                       First[i]->WhitespaceRange.getEnd())
422
2.25k
      return false;
423
290
    AddLength += First[i]->TokenText.size();
424
290
  }
425
2.48k
  Tokens.resize(Tokens.size() - Kinds.size() + 1);
426
228
  First[0]->TokenText = StringRef(First[0]->TokenText.data(),
427
228
                                  First[0]->TokenText.size() + AddLength);
428
228
  First[0]->ColumnWidth += AddLength;
429
228
  First[0]->Type = NewType;
430
228
  return true;
431
2.48k
}
432
433
// Returns \c true if \p Tok can only be followed by an operand in JavaScript.
434
336
bool FormatTokenLexer::precedesOperand(FormatToken *Tok) {
435
336
  // NB: This is not entirely correct, as an r_paren can introduce an operand
436
336
  // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough
437
336
  // corner case to not matter in practice, though.
438
336
  return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
439
336
                      tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
440
336
                      tok::colon, tok::question, tok::tilde) ||
441
336
         Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
442
292
                      tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void,
443
292
                      tok::kw_typeof, Keywords.kw_instanceof, Keywords.kw_in) ||
444
336
         
Tok->isBinaryOperator()288
;
445
336
}
446
447
340
bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) {
448
340
  if (!Prev)
449
4
    return true;
450
336
451
336
  // Regex literals can only follow after prefix unary operators, not after
452
336
  // postfix unary operators. If the '++' is followed by a non-operand
453
336
  // introducing token, the slash here is the operand and not the start of a
454
336
  // regex.
455
336
  // `!` is an unary prefix operator, but also a post-fix operator that casts
456
336
  // away nullability, so the same check applies.
457
336
  if (Prev->isOneOf(tok::plusplus, tok::minusminus, tok::exclaim))
458
20
    return (Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3]));
459
316
460
316
  // The previous token must introduce an operand location where regex
461
316
  // literals can occur.
462
316
  if (!precedesOperand(Prev))
463
24
    return false;
464
292
465
292
  return true;
466
292
}
467
468
// Tries to parse a JavaScript Regex literal starting at the current token,
469
// if that begins with a slash and is in a location where JavaScript allows
470
// regex literals. Changes the current token to a regex literal and updates
471
// its text if successful.
472
33.5k
void FormatTokenLexer::tryParseJSRegexLiteral() {
473
33.5k
  FormatToken *RegexToken = Tokens.back();
474
33.5k
  if (!RegexToken->isOneOf(tok::slash, tok::slashequal))
475
33.2k
    return;
476
340
477
340
  FormatToken *Prev = nullptr;
478
348
  for (auto I = Tokens.rbegin() + 1, E = Tokens.rend(); I != E; 
++I8
) {
479
344
    // NB: Because previous pointers are not initialized yet, this cannot use
480
344
    // Token.getPreviousNonComment.
481
344
    if ((*I)->isNot(tok::comment)) {
482
336
      Prev = *I;
483
336
      break;
484
336
    }
485
344
  }
486
340
487
340
  if (!canPrecedeRegexLiteral(Prev))
488
36
    return;
489
304
490
304
  // 'Manually' lex ahead in the current file buffer.
491
304
  const char *Offset = Lex->getBufferLocation();
492
304
  const char *RegexBegin = Offset - RegexToken->TokenText.size();
493
304
  StringRef Buffer = Lex->getBuffer();
494
304
  bool InCharacterClass = false;
495
304
  bool HaveClosingSlash = false;
496
1.91k
  for (; !HaveClosingSlash && 
Offset != Buffer.end()1.61k
;
++Offset1.61k
) {
497
1.61k
    // Regular expressions are terminated with a '/', which can only be
498
1.61k
    // escaped using '\' or a character class between '[' and ']'.
499
1.61k
    // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.
500
1.61k
    switch (*Offset) {
501
116
    case '\\':
502
116
      // Skip the escaped character.
503
116
      ++Offset;
504
116
      break;
505
40
    case '[':
506
40
      InCharacterClass = true;
507
40
      break;
508
40
    case ']':
509
40
      InCharacterClass = false;
510
40
      break;
511
320
    case '/':
512
320
      if (!InCharacterClass)
513
304
        HaveClosingSlash = true;
514
320
      break;
515
1.61k
    }
516
1.61k
  }
517
304
518
304
  RegexToken->Type = TT_RegexLiteral;
519
304
  // Treat regex literals like other string_literals.
520
304
  RegexToken->Tok.setKind(tok::string_literal);
521
304
  RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin);
522
304
  RegexToken->ColumnWidth = RegexToken->TokenText.size();
523
304
524
304
  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
525
304
}
526
527
2.36k
void FormatTokenLexer::handleCSharpVerbatimAndInterpolatedStrings() {
528
2.36k
  FormatToken *CSharpStringLiteral = Tokens.back();
529
2.36k
530
2.36k
  if (CSharpStringLiteral->Type != TT_CSharpStringLiteral)
531
2.32k
    return;
532
37
533
37
  // Deal with multiline strings.
534
37
  if (!(CSharpStringLiteral->TokenText.startswith(R"(@")") ||
535
37
        
CSharpStringLiteral->TokenText.startswith(R"($@")")31
))
536
25
    return;
537
12
538
12
  const char *StrBegin =
539
12
      Lex->getBufferLocation() - CSharpStringLiteral->TokenText.size();
540
12
  const char *Offset = StrBegin;
541
12
  if (CSharpStringLiteral->TokenText.startswith(R"(@")"))
542
6
    Offset += 2;
543
6
  else // CSharpStringLiteral->TokenText.startswith(R"($@")")
544
6
    Offset += 3;
545
12
546
12
  // Look for a terminating '"' in the current file buffer.
547
12
  // Make no effort to format code within an interpolated or verbatim string.
548
288
  for (; Offset != Lex->getBuffer().end(); 
++Offset276
) {
549
288
    if (Offset[0] == '"') {
550
22
      // "" within a verbatim string is an escaped double quote: skip it.
551
22
      if (Offset + 1 < Lex->getBuffer().end() && Offset[1] == '"')
552
10
        ++Offset;
553
12
      else
554
12
        break;
555
22
    }
556
288
  }
557
12
558
12
  // Make no attempt to format code properly if a verbatim string is
559
12
  // unterminated.
560
12
  if (Offset == Lex->getBuffer().end())
561
0
    return;
562
12
563
12
  StringRef LiteralText(StrBegin, Offset - StrBegin + 1);
564
12
  CSharpStringLiteral->TokenText = LiteralText;
565
12
566
12
  // Adjust width for potentially multiline string literals.
567
12
  size_t FirstBreak = LiteralText.find('\n');
568
12
  StringRef FirstLineText = FirstBreak == StringRef::npos
569
12
                                ? 
LiteralText10
570
12
                                : 
LiteralText.substr(0, FirstBreak)2
;
571
12
  CSharpStringLiteral->ColumnWidth = encoding::columnWidthWithTabs(
572
12
      FirstLineText, CSharpStringLiteral->OriginalColumn, Style.TabWidth,
573
12
      Encoding);
574
12
  size_t LastBreak = LiteralText.rfind('\n');
575
12
  if (LastBreak != StringRef::npos) {
576
2
    CSharpStringLiteral->IsMultiline = true;
577
2
    unsigned StartColumn = 0; // The template tail spans the entire line.
578
2
    CSharpStringLiteral->LastLineColumnWidth = encoding::columnWidthWithTabs(
579
2
        LiteralText.substr(LastBreak + 1, LiteralText.size()), StartColumn,
580
2
        Style.TabWidth, Encoding);
581
2
  }
582
12
583
12
  SourceLocation loc = Offset < Lex->getBuffer().end()
584
12
                           ? Lex->getSourceLocation(Offset + 1)
585
12
                           : 
SourceMgr.getLocForEndOfFile(ID)0
;
586
12
  resetLexer(SourceMgr.getFileOffset(loc));
587
12
}
588
589
33.5k
void FormatTokenLexer::handleTemplateStrings() {
590
33.5k
  FormatToken *BacktickToken = Tokens.back();
591
33.5k
592
33.5k
  if (BacktickToken->is(tok::l_brace)) {
593
1.85k
    StateStack.push(LexerState::NORMAL);
594
1.85k
    return;
595
1.85k
  }
596
31.6k
  if (BacktickToken->is(tok::r_brace)) {
597
1.95k
    if (StateStack.size() == 1)
598
4
      return;
599
1.94k
    StateStack.pop();
600
1.94k
    if (StateStack.top() != LexerState::TEMPLATE_STRING)
601
1.84k
      return;
602
29.7k
    // If back in TEMPLATE_STRING, fallthrough and continue parsing the
603
29.7k
  } else if (BacktickToken->is(tok::unknown) &&
604
29.7k
             
BacktickToken->TokenText == "`"148
) {
605
148
    StateStack.push(LexerState::TEMPLATE_STRING);
606
29.5k
  } else {
607
29.5k
    return; // Not actually a template
608
29.5k
  }
609
248
610
248
  // 'Manually' lex ahead in the current file buffer.
611
248
  const char *Offset = Lex->getBufferLocation();
612
248
  const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`"
613
1.90k
  for (; Offset != Lex->getBuffer().end(); 
++Offset1.65k
) {
614
1.90k
    if (Offset[0] == '`') {
615
148
      StateStack.pop();
616
148
      break;
617
148
    }
618
1.75k
    if (Offset[0] == '\\') {
619
8
      ++Offset; // Skip the escaped character.
620
1.74k
    } else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' &&
621
1.74k
               
Offset[1] == '{'100
) {
622
100
      // '${' introduces an expression interpolation in the template string.
623
100
      StateStack.push(LexerState::NORMAL);
624
100
      ++Offset;
625
100
      break;
626
100
    }
627
1.75k
  }
628
248
629
248
  StringRef LiteralText(TmplBegin, Offset - TmplBegin + 1);
630
248
  BacktickToken->Type = TT_TemplateString;
631
248
  BacktickToken->Tok.setKind(tok::string_literal);
632
248
  BacktickToken->TokenText = LiteralText;
633
248
634
248
  // Adjust width for potentially multiline string literals.
635
248
  size_t FirstBreak = LiteralText.find('\n');
636
248
  StringRef FirstLineText = FirstBreak == StringRef::npos
637
248
                                ? 
LiteralText212
638
248
                                : 
LiteralText.substr(0, FirstBreak)36
;
639
248
  BacktickToken->ColumnWidth = encoding::columnWidthWithTabs(
640
248
      FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding);
641
248
  size_t LastBreak = LiteralText.rfind('\n');
642
248
  if (LastBreak != StringRef::npos) {
643
36
    BacktickToken->IsMultiline = true;
644
36
    unsigned StartColumn = 0; // The template tail spans the entire line.
645
36
    BacktickToken->LastLineColumnWidth = encoding::columnWidthWithTabs(
646
36
        LiteralText.substr(LastBreak + 1, LiteralText.size()), StartColumn,
647
36
        Style.TabWidth, Encoding);
648
36
  }
649
248
650
248
  SourceLocation loc = Offset < Lex->getBuffer().end()
651
248
                           ? Lex->getSourceLocation(Offset + 1)
652
248
                           : 
SourceMgr.getLocForEndOfFile(ID)0
;
653
248
  resetLexer(SourceMgr.getFileOffset(loc));
654
248
}
655
656
6.72k
void FormatTokenLexer::tryParsePythonComment() {
657
6.72k
  FormatToken *HashToken = Tokens.back();
658
6.72k
  if (!HashToken->isOneOf(tok::hash, tok::hashhash))
659
6.66k
    return;
660
63
  // Turn the remainder of this line into a comment.
661
63
  const char *CommentBegin =
662
63
      Lex->getBufferLocation() - HashToken->TokenText.size(); // at "#"
663
63
  size_t From = CommentBegin - Lex->getBuffer().begin();
664
63
  size_t To = Lex->getBuffer().find_first_of('\n', From);
665
63
  if (To == StringRef::npos)
666
7
    To = Lex->getBuffer().size();
667
63
  size_t Len = To - From;
668
63
  HashToken->Type = TT_LineComment;
669
63
  HashToken->Tok.setKind(tok::comment);
670
63
  HashToken->TokenText = Lex->getBuffer().substr(From, Len);
671
63
  SourceLocation Loc = To < Lex->getBuffer().size()
672
63
                           ? 
Lex->getSourceLocation(CommentBegin + Len)56
673
63
                           : 
SourceMgr.getLocForEndOfFile(ID)7
;
674
63
  resetLexer(SourceMgr.getFileOffset(Loc));
675
63
}
676
677
563k
bool FormatTokenLexer::tryMerge_TMacro() {
678
563k
  if (Tokens.size() < 4)
679
97.2k
    return false;
680
466k
  FormatToken *Last = Tokens.back();
681
466k
  if (!Last->is(tok::r_paren))
682
424k
    return false;
683
41.6k
684
41.6k
  FormatToken *String = Tokens[Tokens.size() - 2];
685
41.6k
  if (!String->is(tok::string_literal) || 
String->IsMultiline592
)
686
41.1k
    return false;
687
559
688
559
  if (!Tokens[Tokens.size() - 3]->is(tok::l_paren))
689
272
    return false;
690
287
691
287
  FormatToken *Macro = Tokens[Tokens.size() - 4];
692
287
  if (Macro->TokenText != "_T")
693
272
    return false;
694
15
695
15
  const char *Start = Macro->TokenText.data();
696
15
  const char *End = Last->TokenText.data() + Last->TokenText.size();
697
15
  String->TokenText = StringRef(Start, End - Start);
698
15
  String->IsFirst = Macro->IsFirst;
699
15
  String->LastNewlineOffset = Macro->LastNewlineOffset;
700
15
  String->WhitespaceRange = Macro->WhitespaceRange;
701
15
  String->OriginalColumn = Macro->OriginalColumn;
702
15
  String->ColumnWidth = encoding::columnWidthWithTabs(
703
15
      String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding);
704
15
  String->NewlinesBefore = Macro->NewlinesBefore;
705
15
  String->HasUnescapedNewline = Macro->HasUnescapedNewline;
706
15
707
15
  Tokens.pop_back();
708
15
  Tokens.pop_back();
709
15
  Tokens.pop_back();
710
15
  Tokens.back() = String;
711
15
  return true;
712
15
}
713
714
563k
bool FormatTokenLexer::tryMergeConflictMarkers() {
715
563k
  if (Tokens.back()->NewlinesBefore == 0 && 
Tokens.back()->isNot(tok::eof)507k
)
716
477k
    return false;
717
85.7k
718
85.7k
  // Conflict lines look like:
719
85.7k
  // <marker> <text from the vcs>
720
85.7k
  // For example:
721
85.7k
  // >>>>>>> /file/in/file/system at revision 1234
722
85.7k
  //
723
85.7k
  // We merge all tokens in a line that starts with a conflict marker
724
85.7k
  // into a single token with a special token type that the unwrapped line
725
85.7k
  // parser will use to correctly rebuild the underlying code.
726
85.7k
727
85.7k
  FileID ID;
728
85.7k
  // Get the position of the first token in the line.
729
85.7k
  unsigned FirstInLineOffset;
730
85.7k
  std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc(
731
85.7k
      Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
732
85.7k
  StringRef Buffer = SourceMgr.getBuffer(ID)->getBuffer();
733
85.7k
  // Calculate the offset of the start of the current line.
734
85.7k
  auto LineOffset = Buffer.rfind('\n', FirstInLineOffset);
735
85.7k
  if (LineOffset == StringRef::npos) {
736
31.6k
    LineOffset = 0;
737
54.0k
  } else {
738
54.0k
    ++LineOffset;
739
54.0k
  }
740
85.7k
741
85.7k
  auto FirstSpace = Buffer.find_first_of(" \n", LineOffset);
742
85.7k
  StringRef LineStart;
743
85.7k
  if (FirstSpace == StringRef::npos) {
744
4.88k
    LineStart = Buffer.substr(LineOffset);
745
80.8k
  } else {
746
80.8k
    LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
747
80.8k
  }
748
85.7k
749
85.7k
  TokenType Type = TT_Unknown;
750
85.7k
  if (LineStart == "<<<<<<<" || 
LineStart == ">>>>"85.7k
) {
751
9
    Type = TT_ConflictStart;
752
85.7k
  } else if (LineStart == "|||||||" || 
LineStart == "======="85.6k
||
753
85.7k
             
LineStart == "===="85.6k
) {
754
18
    Type = TT_ConflictAlternative;
755
85.6k
  } else if (LineStart == ">>>>>>>" || 
LineStart == "<<<<"85.6k
) {
756
9
    Type = TT_ConflictEnd;
757
9
  }
758
85.7k
759
85.7k
  if (Type != TT_Unknown) {
760
36
    FormatToken *Next = Tokens.back();
761
36
762
36
    Tokens.resize(FirstInLineIndex + 1);
763
36
    // We do not need to build a complete token here, as we will skip it
764
36
    // during parsing anyway (as we must not touch whitespace around conflict
765
36
    // markers).
766
36
    Tokens.back()->Type = Type;
767
36
    Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
768
36
769
36
    Tokens.push_back(Next);
770
36
    return true;
771
36
  }
772
85.6k
773
85.6k
  return false;
774
85.6k
}
775
776
1.39k
/// Synthesizes the second '>' or '<' token of a split '>>' / '<<'.
///
/// The new token copies the lexer token and the token text of the current
/// FormatTok, is located on the last character of that lexer token, has an
/// empty whitespace range at that location, a column width of one, and sits
/// one column to the right of the current token. FormatTok is updated to
/// point at the new token.
///
/// \returns the newly allocated token (the new value of FormatTok).
FormatToken *FormatTokenLexer::getStashedToken() {
  // Snapshot what is needed from the first half before FormatTok is replaced.
  const Token FirstHalfTok = FormatTok->Tok;
  const StringRef FirstHalfText = FormatTok->TokenText;
  const unsigned FirstHalfColumn = FormatTok->OriginalColumn;

  const SourceLocation SecondHalfLoc =
      FirstHalfTok.getLocation().getLocWithOffset(FirstHalfTok.getLength() - 1);

  FormatToken *SecondHalf = new (Allocator.Allocate()) FormatToken;
  SecondHalf->Tok = FirstHalfTok;
  SecondHalf->Tok.setLocation(SecondHalfLoc);
  SecondHalf->WhitespaceRange = SourceRange(SecondHalfLoc, SecondHalfLoc);
  SecondHalf->TokenText = FirstHalfText;
  SecondHalf->ColumnWidth = 1;
  SecondHalf->OriginalColumn = FirstHalfColumn + 1;

  FormatTok = SecondHalf;
  return FormatTok;
}
794
795
563k
/// Produces the next non-whitespace token.
///
/// If a '>>' or '<<' was split on the previous call, the stashed second half
/// is returned. Otherwise a fresh token is lexed; the whitespace in front of
/// it is consumed and recorded (newline count, last newline offset, running
/// column), escaped newlines at the start of the token are treated as
/// whitespace, trailing whitespace of comments is trimmed, raw identifiers
/// are resolved through the identifier table, '>>' and '<<' are split, and
/// the column width as well as macro-related token types are computed.
///
/// \returns the token, which is also stored in FormatTok.
FormatToken *FormatTokenLexer::getNextToken() {
  // The second half of a previously split '>>' / '<<' is still pending.
  if (StateStack.top() == LexerState::TOKEN_STASHED) {
    StateStack.pop();
    return getStashedToken();
  }

  FormatTok = new (Allocator.Allocate()) FormatToken;
  readRawToken(*FormatTok);
  // Whitespace trimmed off the end of the previous comment counts as
  // whitespace in front of this token.
  const SourceLocation WhitespaceBegin =
      FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
  FormatTok->IsFirst = IsFirstToken;
  IsFirstToken = false;

  // Swallow whitespace tokens, recording what they contain, until a
  // significant token shows up.
  unsigned WhitespaceLength = TrailingWhitespace;
  while (FormatTok->Tok.is(tok::unknown)) {
    const StringRef Raw = FormatTok->TokenText;

    // Tells whether the newline following position Pos is escaped, i.e.
    // whether an odd number of backslashes ends at Pos.
    // FIXME: This is wrong. A '\' followed by a newline is always removed,
    // regardless of whether there is another '\' before it.
    // FIXME: Newlines can also be escaped by a '?' '?' '/' trigraph.
    auto IsNewlineEscaped = [&Raw](int Pos) {
      // A '\r' here is merely the first half of a '\r\n'; step over it.
      if (Pos >= 0 && Raw[Pos] == '\r')
        --Pos;
      unsigned NumBackslashes = 0;
      while (Pos >= 0 && Raw[Pos] == '\\') {
        --Pos;
        ++NumBackslashes;
      }
      return NumBackslashes % 2 == 1;
    };

    // FIXME: This miscounts tok:unknown tokens that are not just
    // whitespace, e.g. a '`' character.
    const int RawSize = Raw.size();
    for (int Idx = 0; Idx != RawSize; ++Idx) {
      switch (Raw[Idx]) {
      case '\n':
        ++FormatTok->NewlinesBefore;
        FormatTok->HasUnescapedNewline = !IsNewlineEscaped(Idx - 1);
        FormatTok->LastNewlineOffset = WhitespaceLength + Idx + 1;
        Column = 0;
        break;
      case '\r':
        FormatTok->LastNewlineOffset = WhitespaceLength + Idx + 1;
        Column = 0;
        break;
      case '\f':
      case '\v':
        Column = 0;
        break;
      case ' ':
        ++Column;
        break;
      case '\t': {
        // Advance by a full TabWidth minus the distance already travelled
        // past the previous multiple of TabWidth; a TabWidth of zero adds
        // nothing.
        const unsigned PastTabStop =
            Style.TabWidth ? Column % Style.TabWidth : 0;
        Column += Style.TabWidth - PastTabStop;
        break;
      }
      case '\\': {
        // A backslash is only whitespace when it is directly followed by a
        // line break.
        const bool FollowedByLineBreak =
            Idx + 1 != RawSize &&
            (Raw[Idx + 1] == '\r' || Raw[Idx + 1] == '\n');
        if (!FollowedByLineBreak)
          FormatTok->Type = TT_ImplicitStringLiteral;
        break;
      }
      default:
        FormatTok->Type = TT_ImplicitStringLiteral;
        break;
      }
      if (FormatTok->Type == TT_ImplicitStringLiteral)
        break;
    }

    // A non-whitespace character turned this into a real token; stop here.
    if (FormatTok->is(TT_ImplicitStringLiteral))
      break;

    WhitespaceLength += FormatTok->Tok.getLength();
    readRawToken(*FormatTok);
  }

  // JavaScript and Java do not allow escaping the end of a line with a
  // backslash. Backslashes are syntax errors in plain source but can occur in
  // comments. A single line comment ending in '\' would make the next line of
  // code lex as part of the comment and break formatting. So look for a
  // backslash directly followed by a line break inside such a comment, cut
  // the comment token off at the backslash, and restart the lexer behind it.
  const bool LanguageLacksLineContinuations =
      Style.Language == FormatStyle::LK_JavaScript ||
      Style.Language == FormatStyle::LK_Java;
  if (LanguageLacksLineContinuations && FormatTok->is(tok::comment) &&
      FormatTok->TokenText.startswith("//")) {
    const StringRef Comment = FormatTok->TokenText;
    for (size_t Pos = Comment.find('\\'); Pos != StringRef::npos;
         Pos = Comment.find('\\', Pos + 1)) {
      if (Pos + 1 >= Comment.size() || Comment[Pos + 1] != '\n')
        continue;

      // Step back from the lexer position to the start of the comment, then
      // forward to just behind the backslash.
      const char *RestartAt =
          Lex->getBufferLocation() - Comment.size() + Pos + 1;
      resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(RestartAt)));
      FormatTok->TokenText = Comment.substr(0, Pos + 1);
      FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
          FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth,
          Encoding);
      break;
    }
  }

  // In case the token starts with escaped newlines, we want to take them
  // into account as whitespace - this pattern is quite frequent in macro
  // definitions.
  // FIXME: Add a more explicit test.
  for (;;) {
    unsigned SkippedWhitespace = 0;
    if (FormatTok->TokenText.startswith("\\\r\n"))
      SkippedWhitespace = 3;
    else if (FormatTok->TokenText.startswith("\\\n"))
      SkippedWhitespace = 2;
    else
      break;

    ++FormatTok->NewlinesBefore;
    WhitespaceLength += SkippedWhitespace;
    FormatTok->LastNewlineOffset = SkippedWhitespace;
    Column = 0;
    FormatTok->TokenText = FormatTok->TokenText.substr(SkippedWhitespace);
  }

  FormatTok->WhitespaceRange = SourceRange(
      WhitespaceBegin, WhitespaceBegin.getLocWithOffset(WhitespaceLength));
  FormatTok->OriginalColumn = Column;

  TrailingWhitespace = 0;
  if (FormatTok->Tok.is(tok::comment)) {
    // Trim trailing blanks off the comment and remember how many were
    // removed so the next token can claim them as leading whitespace.
    // FIXME: Add the trimmed whitespace to Column.
    const size_t UntrimmedSize = FormatTok->TokenText.size();
    FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f");
    TrailingWhitespace = UntrimmedSize - FormatTok->TokenText.size();
  } else if (FormatTok->Tok.is(tok::raw_identifier)) {
    // Resolve the raw identifier to its keyword or identifier kind.
    IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
    FormatTok->Tok.setIdentifierInfo(&Info);
    FormatTok->Tok.setKind(Info.getTokenID());

    // For Java and JavaScript a few of the resolved keywords are turned back
    // into plain identifiers.
    const bool DemoteForJava =
        Style.Language == FormatStyle::LK_Java &&
        FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
                           tok::kw_operator);
    const bool DemoteForJavaScript =
        Style.Language == FormatStyle::LK_JavaScript &&
        FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_operator);
    if (DemoteForJava || DemoteForJavaScript) {
      FormatTok->Tok.setKind(tok::identifier);
      FormatTok->Tok.setIdentifierInfo(nullptr);
    }
  } else if (FormatTok->Tok.is(tok::greatergreater) ||
             FormatTok->Tok.is(tok::lessless)) {
    // Split '>>' / '<<' in two: this token keeps the first character and the
    // second one is handed out by the next call.
    const bool IsShiftRight = FormatTok->Tok.is(tok::greatergreater);
    FormatTok->Tok.setKind(IsShiftRight ? tok::greater : tok::less);
    FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
    ++Column;
    StateStack.push(LexerState::TOKEN_STASHED);
  }

  // From here on FormatTok is the next non-whitespace token.

  const StringRef FinalText = FormatTok->TokenText;
  const size_t FirstNewline = FinalText.find('\n');
  // ColumnWidth covers the first line only; with no newline in the token that
  // is the whole text.
  // FIXME: ColumnWidth actually depends on the start column, we need to
  // take this into account when the token is moved.
  FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
      FinalText.substr(0, FirstNewline), Column, Style.TabWidth, Encoding);
  if (FirstNewline == StringRef::npos) {
    Column += FormatTok->ColumnWidth;
  } else {
    FormatTok->IsMultiline = true;
    // The last line of the token always starts in column 0, so its width can
    // be precomputed even in the presence of tabs.
    FormatTok->LastLineColumnWidth = encoding::columnWidthWithTabs(
        FinalText.substr(FinalText.find_last_of('\n') + 1), 0, Style.TabWidth,
        Encoding);
    Column = FormatTok->LastLineColumnWidth;
  }

  if (Style.isCpp()) {
    auto MacroIt = Macros.find(FormatTok->Tok.getIdentifierInfo());
    // A name directly following `define` is being defined, not used, so it
    // does not get the configured macro type.
    const bool FollowsDefine =
        !Tokens.empty() && Tokens.back()->Tok.getIdentifierInfo() &&
        Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() ==
            tok::pp_define;
    if (!FollowsDefine && MacroIt != Macros.end()) {
      FormatTok->Type = MacroIt->second;
    } else if (FormatTok->is(tok::identifier)) {
      if (MacroBlockBeginRegex.match(FinalText))
        FormatTok->Type = TT_MacroBlockBegin;
      else if (MacroBlockEndRegex.match(FinalText))
        FormatTok->Type = TT_MacroBlockEnd;
    }
  }

  return FormatTok;
}
999
1000
817k
/// Lexes one raw token into \p Tok and applies formatting-specific fixups.
///
/// Besides filling in the lexer token and its text, this
///  - turns unknown tokens that start with '"' into (unterminated) string
///    literals, and for JavaScript also the unknown token `''`,
///  - turns character constants into string literals for JavaScript, Proto
///    and TextProto,
///  - tracks `clang-format on` / `clang-format off` comments and marks
///    \p Tok as finalized while formatting is disabled.
///
/// \param Tok the token to fill in.
void FormatTokenLexer::readRawToken(FormatToken &Tok) {
  Lex->LexFromRawLexer(Tok.Tok);
  const StringRef Text(SourceMgr.getCharacterData(Tok.Tok.getLocation()),
                       Tok.Tok.getLength());
  Tok.TokenText = Text;

  const bool IsJavaScript = Style.Language == FormatStyle::LK_JavaScript;

  // For formatting purposes an unterminated string literal is treated like a
  // regular one.
  if (Tok.is(tok::unknown)) {
    if (Text.startswith("\"")) {
      Tok.Tok.setKind(tok::string_literal);
      Tok.IsUnterminatedLiteral = true;
    } else if (IsJavaScript && Text == "''") {
      Tok.Tok.setKind(tok::string_literal);
    }
  }

  const bool CharConstantsAreStrings =
      IsJavaScript || Style.Language == FormatStyle::LK_Proto ||
      Style.Language == FormatStyle::LK_TextProto;
  if (CharConstantsAreStrings && Tok.is(tok::char_constant))
    Tok.Tok.setKind(tok::string_literal);

  // None of the fixups above produce or remove a comment token, so this can
  // be computed once for both checks below.
  const bool IsComment = Tok.is(tok::comment);

  // The "on" comment is checked before, and the "off" comment after, the
  // Finalized flag is assigned, so neither comment token is marked finalized
  // because of its own switch.
  if (IsComment &&
      (Text == "// clang-format on" || Text == "/* clang-format on */"))
    FormattingDisabled = false;

  Tok.Finalized = FormattingDisabled;

  if (IsComment &&
      (Text == "// clang-format off" || Text == "/* clang-format off */"))
    FormattingDisabled = true;
}
1035
1036
639
void FormatTokenLexer::resetLexer(unsigned Offset) {
1037
639
  StringRef Buffer = SourceMgr.getBufferData(ID);
1038
639
  Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID),
1039
639
                      getFormattingLangOpts(Style), Buffer.begin(),
1040
639
                      Buffer.begin() + Offset, Buffer.end()));
1041
639
  Lex->SetKeepWhitespaceMode(true);
1042
639
  TrailingWhitespace = 0;
1043
639
}
1044
1045
} // namespace format
1046
} // namespace clang