clang  19.0.0git
FormatTokenLexer.cpp
Go to the documentation of this file.
1 //===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// This file implements FormatTokenLexer, which tokenizes a source file
11 /// into a FormatToken stream suitable for ClangFormat.
12 ///
13 //===----------------------------------------------------------------------===//
14 
15 #include "FormatTokenLexer.h"
16 #include "FormatToken.h"
19 #include "clang/Format/Format.h"
20 #include "llvm/Support/Regex.h"
21 
22 namespace clang {
23 namespace format {
24 
// Parameter list, member-initializer list and body of a constructor.
// NOTE(review): this listing appears to have dropped some source lines (the
// embedded numbers 25, 27, 65 and 70 are absent), presumably including the
// line that names the function and the range expressions of two loops below.
// Confirm against the upstream file before relying on this text.
//
// Visible behavior: the members are initialized from the arguments and from
// Style, a Lexer over the buffer of file ID is created with whitespace-keeping
// mode switched on, and every macro name configured in Style is looked up in
// IdentTable and registered in Macros with the matching token type. Finally
// each entry of Style.TypeNames is added to TypeNames.
26  const SourceManager &SourceMgr, FileID ID, unsigned Column,
28  llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
29  IdentifierTable &IdentTable)
30  : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}),
31  Column(Column), TrailingWhitespace(0),
32  LangOpts(getFormattingLangOpts(Style)), SourceMgr(SourceMgr), ID(ID),
33  Style(Style), IdentTable(IdentTable), Keywords(IdentTable),
34  Encoding(Encoding), Allocator(Allocator), FirstInLineIndex(0),
35  FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin),
36  MacroBlockEndRegex(Style.MacroBlockEnd) {
37  Lex.reset(new Lexer(ID, SourceMgr.getBufferOrFake(ID), SourceMgr, LangOpts));
// Whitespace is kept by the raw lexer instead of being skipped.
38  Lex->SetKeepWhitespaceMode(true);
39 
// Each loop below maps the identifiers of one Style macro list to one token
// type in Macros.
40  for (const std::string &ForEachMacro : Style.ForEachMacros) {
41  auto Identifier = &IdentTable.get(ForEachMacro);
42  Macros.insert({Identifier, TT_ForEachMacro});
43  }
44  for (const std::string &IfMacro : Style.IfMacros) {
45  auto Identifier = &IdentTable.get(IfMacro);
46  Macros.insert({Identifier, TT_IfMacro});
47  }
48  for (const std::string &AttributeMacro : Style.AttributeMacros) {
49  auto Identifier = &IdentTable.get(AttributeMacro);
50  Macros.insert({Identifier, TT_AttributeMacro});
51  }
52  for (const std::string &StatementMacro : Style.StatementMacros) {
53  auto Identifier = &IdentTable.get(StatementMacro);
54  Macros.insert({Identifier, TT_StatementMacro});
55  }
56  for (const std::string &TypenameMacro : Style.TypenameMacros) {
57  auto Identifier = &IdentTable.get(TypenameMacro);
58  Macros.insert({Identifier, TT_TypenameMacro});
59  }
60  for (const std::string &NamespaceMacro : Style.NamespaceMacros) {
61  auto Identifier = &IdentTable.get(NamespaceMacro);
62  Macros.insert({Identifier, TT_NamespaceMacro});
63  }
// NOTE(review): the range expression of this loop (listing line 65) is not
// visible here; presumably a Style list of whitespace-sensitive macros.
64  for (const std::string &WhitespaceSensitiveMacro :
66  auto Identifier = &IdentTable.get(WhitespaceSensitiveMacro);
67  Macros.insert({Identifier, TT_UntouchableMacroFunc});
68  }
// NOTE(review): the range expression of this loop (listing line 70) is not
// visible here either; verify against upstream.
69  for (const std::string &StatementAttributeLikeMacro :
71  auto Identifier = &IdentTable.get(StatementAttributeLikeMacro);
72  Macros.insert({Identifier, TT_StatementAttributeLikeMacro});
73  }
74 
75  for (const auto &TypeName : Style.TypeNames)
76  TypeNames.insert(&IdentTable.get(TypeName));
77 }
78 
// Body of the token-producing loop.
// NOTE(review): the line carrying the function signature (listing line 79)
// and the line directly before the tryParsePythonComment() call (listing
// line 88) are absent from this listing; the latter is presumably a language
// check guarding that call. Confirm against the upstream file.
//
// Requires that no tokens have been produced yet. Repeatedly appends the next
// token to Tokens, applies the language-specific fix-ups and token merges to
// the tail of Tokens, and keeps FirstInLineIndex up to date, until the token
// at the back of Tokens is eof. Returns Tokens.
80  assert(Tokens.empty());
81  assert(FirstInLineIndex == 0);
82  do {
83  Tokens.push_back(getNextToken());
84  if (Style.isJavaScript()) {
85  tryParseJSRegexLiteral();
86  handleTemplateStrings();
87  }
89  tryParsePythonComment();
90  tryMergePreviousTokens();
91  if (Style.isCSharp()) {
92  // This needs to come after tokens have been merged so that C#
93  // string literals are correctly identified.
94  handleCSharpVerbatimAndInterpolatedStrings();
95  }
96  if (Style.isTableGen()) {
97  handleTableGenMultilineString();
98  handleTableGenNumericLikeIdentifier();
99  }
// A token that follows a newline, or that itself spans lines, becomes the
// new "first in line" index.
100  if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)
101  FirstInLineIndex = Tokens.size() - 1;
102  } while (Tokens.back()->isNot(tok::eof));
103  return Tokens;
104 }
105 
106 void FormatTokenLexer::tryMergePreviousTokens() {
107  if (tryMerge_TMacro())
108  return;
109  if (tryMergeConflictMarkers())
110  return;
111  if (tryMergeLessLess())
112  return;
113  if (tryMergeGreaterGreater())
114  return;
115  if (tryMergeForEach())
116  return;
117  if (Style.isCpp() && tryTransformTryUsageForC())
118  return;
119 
120  if (Style.isJavaScript() || Style.isCSharp()) {
121  static const tok::TokenKind NullishCoalescingOperator[] = {tok::question,
122  tok::question};
123  static const tok::TokenKind NullPropagatingOperator[] = {tok::question,
124  tok::period};
125  static const tok::TokenKind FatArrow[] = {tok::equal, tok::greater};
126 
127  if (tryMergeTokens(FatArrow, TT_FatArrow))
128  return;
129  if (tryMergeTokens(NullishCoalescingOperator, TT_NullCoalescingOperator)) {
130  // Treat like the "||" operator (as opposed to the ternary ?).
131  Tokens.back()->Tok.setKind(tok::pipepipe);
132  return;
133  }
134  if (tryMergeTokens(NullPropagatingOperator, TT_NullPropagatingOperator)) {
135  // Treat like a regular "." access.
136  Tokens.back()->Tok.setKind(tok::period);
137  return;
138  }
139  if (tryMergeNullishCoalescingEqual())
140  return;
141  }
142 
143  if (Style.isCSharp()) {
144  static const tok::TokenKind CSharpNullConditionalLSquare[] = {
145  tok::question, tok::l_square};
146 
147  if (tryMergeCSharpKeywordVariables())
148  return;
149  if (tryMergeCSharpStringLiteral())
150  return;
151  if (tryTransformCSharpForEach())
152  return;
153  if (tryMergeTokens(CSharpNullConditionalLSquare,
154  TT_CSharpNullConditionalLSquare)) {
155  // Treat like a regular "[" operator.
156  Tokens.back()->Tok.setKind(tok::l_square);
157  return;
158  }
159  }
160 
161  if (tryMergeNSStringLiteral())
162  return;
163 
164  if (Style.isJavaScript()) {
165  static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
166  static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal,
167  tok::equal};
168  static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
169  tok::greaterequal};
170  static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star};
171  static const tok::TokenKind JSExponentiationEqual[] = {tok::star,
172  tok::starequal};
173  static const tok::TokenKind JSPipePipeEqual[] = {tok::pipepipe, tok::equal};
174  static const tok::TokenKind JSAndAndEqual[] = {tok::ampamp, tok::equal};
175 
176  // FIXME: Investigate what token type gives the correct operator priority.
177  if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
178  return;
179  if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
180  return;
181  if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
182  return;
183  if (tryMergeTokens(JSExponentiation, TT_JsExponentiation))
184  return;
185  if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) {
186  Tokens.back()->Tok.setKind(tok::starequal);
187  return;
188  }
189  if (tryMergeTokens(JSAndAndEqual, TT_JsAndAndEqual) ||
190  tryMergeTokens(JSPipePipeEqual, TT_JsPipePipeEqual)) {
191  // Treat like the "=" assignment operator.
192  Tokens.back()->Tok.setKind(tok::equal);
193  return;
194  }
195  if (tryMergeJSPrivateIdentifier())
196  return;
197  }
198 
199  if (Style.Language == FormatStyle::LK_Java) {
200  static const tok::TokenKind JavaRightLogicalShiftAssign[] = {
201  tok::greater, tok::greater, tok::greaterequal};
202  if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator))
203  return;
204  }
205 
206  if (Style.isVerilog()) {
207  // Merge the number following a base like `'h?a0`.
208  if (Tokens.size() >= 3 && Tokens.end()[-3]->is(TT_VerilogNumberBase) &&
209  Tokens.end()[-2]->is(tok::numeric_constant) &&
210  Tokens.back()->isOneOf(tok::numeric_constant, tok::identifier,
211  tok::question) &&
212  tryMergeTokens(2, TT_Unknown)) {
213  return;
214  }
215  // Part select.
216  if (tryMergeTokensAny({{tok::minus, tok::colon}, {tok::plus, tok::colon}},
217  TT_BitFieldColon)) {
218  return;
219  }
220  // Xnor. The combined token is treated as a caret which can also be either a
221  // unary or binary operator. The actual type is determined in
222  // TokenAnnotator. We also check the token length so we know it is not
223  // already a merged token.
224  if (Tokens.back()->TokenText.size() == 1 &&
225  tryMergeTokensAny({{tok::caret, tok::tilde}, {tok::tilde, tok::caret}},
226  TT_BinaryOperator)) {
227  Tokens.back()->Tok.setKind(tok::caret);
228  return;
229  }
230  // Signed shift and distribution weight.
231  if (tryMergeTokens({tok::less, tok::less}, TT_BinaryOperator)) {
232  Tokens.back()->Tok.setKind(tok::lessless);
233  return;
234  }
235  if (tryMergeTokens({tok::greater, tok::greater}, TT_BinaryOperator)) {
236  Tokens.back()->Tok.setKind(tok::greatergreater);
237  return;
238  }
239  if (tryMergeTokensAny({{tok::lessless, tok::equal},
240  {tok::lessless, tok::lessequal},
241  {tok::greatergreater, tok::equal},
242  {tok::greatergreater, tok::greaterequal},
243  {tok::colon, tok::equal},
244  {tok::colon, tok::slash}},
245  TT_BinaryOperator)) {
246  Tokens.back()->ForcedPrecedence = prec::Assignment;
247  return;
248  }
249  // Exponentiation, signed shift, case equality, and wildcard equality.
250  if (tryMergeTokensAny({{tok::star, tok::star},
251  {tok::lessless, tok::less},
252  {tok::greatergreater, tok::greater},
253  {tok::exclaimequal, tok::equal},
254  {tok::exclaimequal, tok::question},
255  {tok::equalequal, tok::equal},
256  {tok::equalequal, tok::question}},
257  TT_BinaryOperator)) {
258  return;
259  }
260  // Module paths in specify blocks and the implication and boolean equality
261  // operators.
262  if (tryMergeTokensAny({{tok::plusequal, tok::greater},
263  {tok::plus, tok::star, tok::greater},
264  {tok::minusequal, tok::greater},
265  {tok::minus, tok::star, tok::greater},
266  {tok::less, tok::arrow},
267  {tok::equal, tok::greater},
268  {tok::star, tok::greater},
269  {tok::pipeequal, tok::greater},
270  {tok::pipe, tok::arrow},
271  {tok::hash, tok::minus, tok::hash},
272  {tok::hash, tok::equal, tok::hash}},
273  TT_BinaryOperator) ||
274  Tokens.back()->is(tok::arrow)) {
275  Tokens.back()->ForcedPrecedence = prec::Comma;
276  return;
277  }
278  }
279  if (Style.isTableGen()) {
280  // TableGen's Multi line string starts with [{
281  if (tryMergeTokens({tok::l_square, tok::l_brace},
282  TT_TableGenMultiLineString)) {
283  // Set again with finalizing. This must never be annotated as other types.
284  Tokens.back()->setFinalizedType(TT_TableGenMultiLineString);
285  Tokens.back()->Tok.setKind(tok::string_literal);
286  return;
287  }
288  // TableGen's bang operator is the form !<name>.
289  // !cond is a special case with specific syntax.
290  if (tryMergeTokens({tok::exclaim, tok::identifier},
291  TT_TableGenBangOperator)) {
292  Tokens.back()->Tok.setKind(tok::identifier);
293  Tokens.back()->Tok.setIdentifierInfo(nullptr);
294  if (Tokens.back()->TokenText == "!cond")
295  Tokens.back()->setFinalizedType(TT_TableGenCondOperator);
296  else
297  Tokens.back()->setFinalizedType(TT_TableGenBangOperator);
298  return;
299  }
300  if (tryMergeTokens({tok::exclaim, tok::kw_if}, TT_TableGenBangOperator)) {
301  // Here, "! if" becomes "!if". That is, ! captures if even when the space
302  // exists. That is only one possibility in TableGen's syntax.
303  Tokens.back()->Tok.setKind(tok::identifier);
304  Tokens.back()->Tok.setIdentifierInfo(nullptr);
305  Tokens.back()->setFinalizedType(TT_TableGenBangOperator);
306  return;
307  }
308  // +, - with numbers are literals. Not unary operators.
309  if (tryMergeTokens({tok::plus, tok::numeric_constant}, TT_Unknown)) {
310  Tokens.back()->Tok.setKind(tok::numeric_constant);
311  return;
312  }
313  if (tryMergeTokens({tok::minus, tok::numeric_constant}, TT_Unknown)) {
314  Tokens.back()->Tok.setKind(tok::numeric_constant);
315  return;
316  }
317  }
318 }
319 
320 bool FormatTokenLexer::tryMergeNSStringLiteral() {
321  if (Tokens.size() < 2)
322  return false;
323  auto &At = *(Tokens.end() - 2);
324  auto &String = *(Tokens.end() - 1);
325  if (At->isNot(tok::at) || String->isNot(tok::string_literal))
326  return false;
327  At->Tok.setKind(tok::string_literal);
328  At->TokenText = StringRef(At->TokenText.begin(),
329  String->TokenText.end() - At->TokenText.begin());
330  At->ColumnWidth += String->ColumnWidth;
331  At->setType(TT_ObjCStringLiteral);
332  Tokens.erase(Tokens.end() - 1);
333  return true;
334 }
335 
336 bool FormatTokenLexer::tryMergeJSPrivateIdentifier() {
337  // Merges #idenfier into a single identifier with the text #identifier
338  // but the token tok::identifier.
339  if (Tokens.size() < 2)
340  return false;
341  auto &Hash = *(Tokens.end() - 2);
342  auto &Identifier = *(Tokens.end() - 1);
343  if (Hash->isNot(tok::hash) || Identifier->isNot(tok::identifier))
344  return false;
345  Hash->Tok.setKind(tok::identifier);
346  Hash->TokenText =
347  StringRef(Hash->TokenText.begin(),
348  Identifier->TokenText.end() - Hash->TokenText.begin());
349  Hash->ColumnWidth += Identifier->ColumnWidth;
350  Hash->setType(TT_JsPrivateIdentifier);
351  Tokens.erase(Tokens.end() - 1);
352  return true;
353 }
354 
355 // Search for verbatim or interpolated string literals @"ABC" or
356 // $"aaaaa{abc}aaaaa" i and mark the token as TT_CSharpStringLiteral, and to
357 // prevent splitting of @, $ and ".
358 // Merging of multiline verbatim strings with embedded '"' is handled in
359 // handleCSharpVerbatimAndInterpolatedStrings with lower-level lexing.
360 bool FormatTokenLexer::tryMergeCSharpStringLiteral() {
361  if (Tokens.size() < 2)
362  return false;
363 
364  // Look for @"aaaaaa" or $"aaaaaa".
365  const auto String = *(Tokens.end() - 1);
366  if (String->isNot(tok::string_literal))
367  return false;
368 
369  auto Prefix = *(Tokens.end() - 2);
370  if (Prefix->isNot(tok::at) && Prefix->TokenText != "$")
371  return false;
372 
373  if (Tokens.size() > 2) {
374  const auto Tok = *(Tokens.end() - 3);
375  if ((Tok->TokenText == "$" && Prefix->is(tok::at)) ||
376  (Tok->is(tok::at) && Prefix->TokenText == "$")) {
377  // This looks like $@"aaa" or @$"aaa" so we need to combine all 3 tokens.
378  Tok->ColumnWidth += Prefix->ColumnWidth;
379  Tokens.erase(Tokens.end() - 2);
380  Prefix = Tok;
381  }
382  }
383 
384  // Convert back into just a string_literal.
385  Prefix->Tok.setKind(tok::string_literal);
386  Prefix->TokenText =
387  StringRef(Prefix->TokenText.begin(),
388  String->TokenText.end() - Prefix->TokenText.begin());
389  Prefix->ColumnWidth += String->ColumnWidth;
390  Prefix->setType(TT_CSharpStringLiteral);
391  Tokens.erase(Tokens.end() - 1);
392  return true;
393 }
394 
395 // Valid C# attribute targets:
396 // https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/concepts/attributes/#attribute-targets
397 const llvm::StringSet<> FormatTokenLexer::CSharpAttributeTargets = {
398  "assembly", "module", "field", "event", "method",
399  "param", "property", "return", "type",
400 };
401 
402 bool FormatTokenLexer::tryMergeNullishCoalescingEqual() {
403  if (Tokens.size() < 2)
404  return false;
405  auto &NullishCoalescing = *(Tokens.end() - 2);
406  auto &Equal = *(Tokens.end() - 1);
407  if (NullishCoalescing->isNot(TT_NullCoalescingOperator) ||
408  Equal->isNot(tok::equal)) {
409  return false;
410  }
411  NullishCoalescing->Tok.setKind(tok::equal); // no '??=' in clang tokens.
412  NullishCoalescing->TokenText =
413  StringRef(NullishCoalescing->TokenText.begin(),
414  Equal->TokenText.end() - NullishCoalescing->TokenText.begin());
415  NullishCoalescing->ColumnWidth += Equal->ColumnWidth;
416  NullishCoalescing->setType(TT_NullCoalescingEqual);
417  Tokens.erase(Tokens.end() - 1);
418  return true;
419 }
420 
421 bool FormatTokenLexer::tryMergeCSharpKeywordVariables() {
422  if (Tokens.size() < 2)
423  return false;
424  const auto At = *(Tokens.end() - 2);
425  if (At->isNot(tok::at))
426  return false;
427  const auto Keyword = *(Tokens.end() - 1);
428  if (Keyword->TokenText == "$")
429  return false;
430  if (!Keywords.isCSharpKeyword(*Keyword))
431  return false;
432 
433  At->Tok.setKind(tok::identifier);
434  At->TokenText = StringRef(At->TokenText.begin(),
435  Keyword->TokenText.end() - At->TokenText.begin());
436  At->ColumnWidth += Keyword->ColumnWidth;
437  At->setType(Keyword->getType());
438  Tokens.erase(Tokens.end() - 1);
439  return true;
440 }
441 
442 // In C# transform identifier foreach into kw_foreach
443 bool FormatTokenLexer::tryTransformCSharpForEach() {
444  if (Tokens.size() < 1)
445  return false;
446  auto &Identifier = *(Tokens.end() - 1);
447  if (Identifier->isNot(tok::identifier))
448  return false;
449  if (Identifier->TokenText != "foreach")
450  return false;
451 
452  Identifier->setType(TT_ForEachMacro);
453  Identifier->Tok.setKind(tok::kw_for);
454  return true;
455 }
456 
457 bool FormatTokenLexer::tryMergeForEach() {
458  if (Tokens.size() < 2)
459  return false;
460  auto &For = *(Tokens.end() - 2);
461  auto &Each = *(Tokens.end() - 1);
462  if (For->isNot(tok::kw_for))
463  return false;
464  if (Each->isNot(tok::identifier))
465  return false;
466  if (Each->TokenText != "each")
467  return false;
468 
469  For->setType(TT_ForEachMacro);
470  For->Tok.setKind(tok::kw_for);
471 
472  For->TokenText = StringRef(For->TokenText.begin(),
473  Each->TokenText.end() - For->TokenText.begin());
474  For->ColumnWidth += Each->ColumnWidth;
475  Tokens.erase(Tokens.end() - 1);
476  return true;
477 }
478 
479 bool FormatTokenLexer::tryTransformTryUsageForC() {
480  if (Tokens.size() < 2)
481  return false;
482  auto &Try = *(Tokens.end() - 2);
483  if (Try->isNot(tok::kw_try))
484  return false;
485  auto &Next = *(Tokens.end() - 1);
486  if (Next->isOneOf(tok::l_brace, tok::colon, tok::hash, tok::comment))
487  return false;
488 
489  if (Tokens.size() > 2) {
490  auto &At = *(Tokens.end() - 3);
491  if (At->is(tok::at))
492  return false;
493  }
494 
495  Try->Tok.setKind(tok::identifier);
496  return true;
497 }
498 
499 bool FormatTokenLexer::tryMergeLessLess() {
500  // Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
501  if (Tokens.size() < 3)
502  return false;
503 
504  auto First = Tokens.end() - 3;
505  if (First[0]->isNot(tok::less) || First[1]->isNot(tok::less))
506  return false;
507 
508  // Only merge if there currently is no whitespace between the two "<".
509  if (First[1]->hasWhitespaceBefore())
510  return false;
511 
512  auto X = Tokens.size() > 3 ? First[-1] : nullptr;
513  if (X && X->is(tok::less))
514  return false;
515 
516  auto Y = First[2];
517  if ((!X || X->isNot(tok::kw_operator)) && Y->is(tok::less))
518  return false;
519 
520  First[0]->Tok.setKind(tok::lessless);
521  First[0]->TokenText = "<<";
522  First[0]->ColumnWidth += 1;
523  Tokens.erase(Tokens.end() - 2);
524  return true;
525 }
526 
527 bool FormatTokenLexer::tryMergeGreaterGreater() {
528  // Merge kw_operator,greater,greater into kw_operator,greatergreater.
529  if (Tokens.size() < 2)
530  return false;
531 
532  auto First = Tokens.end() - 2;
533  if (First[0]->isNot(tok::greater) || First[1]->isNot(tok::greater))
534  return false;
535 
536  // Only merge if there currently is no whitespace between the first two ">".
537  if (First[1]->hasWhitespaceBefore())
538  return false;
539 
540  auto Tok = Tokens.size() > 2 ? First[-1] : nullptr;
541  if (Tok && Tok->isNot(tok::kw_operator))
542  return false;
543 
544  First[0]->Tok.setKind(tok::greatergreater);
545  First[0]->TokenText = ">>";
546  First[0]->ColumnWidth += 1;
547  Tokens.erase(Tokens.end() - 1);
548  return true;
549 }
550 
551 bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,
552  TokenType NewType) {
553  if (Tokens.size() < Kinds.size())
554  return false;
555 
556  SmallVectorImpl<FormatToken *>::const_iterator First =
557  Tokens.end() - Kinds.size();
558  for (unsigned i = 0; i < Kinds.size(); ++i)
559  if (First[i]->isNot(Kinds[i]))
560  return false;
561 
562  return tryMergeTokens(Kinds.size(), NewType);
563 }
564 
565 bool FormatTokenLexer::tryMergeTokens(size_t Count, TokenType NewType) {
566  if (Tokens.size() < Count)
567  return false;
568 
569  SmallVectorImpl<FormatToken *>::const_iterator First = Tokens.end() - Count;
570  unsigned AddLength = 0;
571  for (size_t i = 1; i < Count; ++i) {
572  // If there is whitespace separating the token and the previous one,
573  // they should not be merged.
574  if (First[i]->hasWhitespaceBefore())
575  return false;
576  AddLength += First[i]->TokenText.size();
577  }
578 
579  Tokens.resize(Tokens.size() - Count + 1);
580  First[0]->TokenText = StringRef(First[0]->TokenText.data(),
581  First[0]->TokenText.size() + AddLength);
582  First[0]->ColumnWidth += AddLength;
583  First[0]->setType(NewType);
584  return true;
585 }
586 
587 bool FormatTokenLexer::tryMergeTokensAny(
588  ArrayRef<ArrayRef<tok::TokenKind>> Kinds, TokenType NewType) {
589  return llvm::any_of(Kinds, [this, NewType](ArrayRef<tok::TokenKind> Kinds) {
590  return tryMergeTokens(Kinds, NewType);
591  });
592 }
593 
594 // Returns \c true if \p Tok can only be followed by an operand in JavaScript.
595 bool FormatTokenLexer::precedesOperand(FormatToken *Tok) {
596  // NB: This is not entirely correct, as an r_paren can introduce an operand
597  // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough
598  // corner case to not matter in practice, though.
599  return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
600  tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
601  tok::colon, tok::question, tok::tilde) ||
602  Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
603  tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void,
604  tok::kw_typeof, Keywords.kw_instanceof, Keywords.kw_in) ||
605  Tok->isBinaryOperator();
606 }
607 
608 bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) {
609  if (!Prev)
610  return true;
611 
612  // Regex literals can only follow after prefix unary operators, not after
613  // postfix unary operators. If the '++' is followed by a non-operand
614  // introducing token, the slash here is the operand and not the start of a
615  // regex.
616  // `!` is an unary prefix operator, but also a post-fix operator that casts
617  // away nullability, so the same check applies.
618  if (Prev->isOneOf(tok::plusplus, tok::minusminus, tok::exclaim))
619  return Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3]);
620 
621  // The previous token must introduce an operand location where regex
622  // literals can occur.
623  if (!precedesOperand(Prev))
624  return false;
625 
626  return true;
627 }
628 
629 // Tries to parse a JavaScript Regex literal starting at the current token,
630 // if that begins with a slash and is in a location where JavaScript allows
631 // regex literals. Changes the current token to a regex literal and updates
632 // its text if successful.
633 void FormatTokenLexer::tryParseJSRegexLiteral() {
634  FormatToken *RegexToken = Tokens.back();
635  if (!RegexToken->isOneOf(tok::slash, tok::slashequal))
636  return;
637 
638  FormatToken *Prev = nullptr;
639  for (FormatToken *FT : llvm::drop_begin(llvm::reverse(Tokens))) {
640  // NB: Because previous pointers are not initialized yet, this cannot use
641  // Token.getPreviousNonComment.
642  if (FT->isNot(tok::comment)) {
643  Prev = FT;
644  break;
645  }
646  }
647 
648  if (!canPrecedeRegexLiteral(Prev))
649  return;
650 
651  // 'Manually' lex ahead in the current file buffer.
652  const char *Offset = Lex->getBufferLocation();
653  const char *RegexBegin = Offset - RegexToken->TokenText.size();
654  StringRef Buffer = Lex->getBuffer();
655  bool InCharacterClass = false;
656  bool HaveClosingSlash = false;
657  for (; !HaveClosingSlash && Offset != Buffer.end(); ++Offset) {
658  // Regular expressions are terminated with a '/', which can only be
659  // escaped using '\' or a character class between '[' and ']'.
660  // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.
661  switch (*Offset) {
662  case '\\':
663  // Skip the escaped character.
664  ++Offset;
665  break;
666  case '[':
667  InCharacterClass = true;
668  break;
669  case ']':
670  InCharacterClass = false;
671  break;
672  case '/':
673  if (!InCharacterClass)
674  HaveClosingSlash = true;
675  break;
676  }
677  }
678 
679  RegexToken->setType(TT_RegexLiteral);
680  // Treat regex literals like other string_literals.
681  RegexToken->Tok.setKind(tok::string_literal);
682  RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin);
683  RegexToken->ColumnWidth = RegexToken->TokenText.size();
684 
685  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
686 }
687 
// Scans the body of a C# string literal in [Begin, End) and returns a pointer
// to its terminating '"', or End if no terminator is found (or if an
// interpolated string contains an unmatched '}').
//
// No attempt is made to format code within an interpolated or verbatim
// string. Interpolated strings can contain { } with " characters inside:
//   $"{x ?? "null"}"
// must not be split into $"{x ?? ", null, "}" but is treated as one literal,
// so quotes are ignored while an opening brace is unmatched. Formatting the
// expressions inside {} would need work similar to what
// `handleTemplateStrings()` does for JavaScript template strings.
//
// \p Verbatim: backslash is not an escape character and "" is an escaped
// quote. \p Interpolated: braces are tracked, with {{ and }} being escapes.
static auto lexCSharpString(const char *Begin, const char *End, bool Verbatim,
                            bool Interpolated) {
  int OpenBraces = 0;
  const char *Cur = Begin;
  while (Cur < End) {
    const char Ch = *Cur;
    // True if the next character exists and equals the current one.
    const bool Doubled = Cur + 1 < End && Cur[1] == Ch;

    if (Ch == '\\') {
      // Outside verbatim strings the backslash escapes the next character.
      if (!Verbatim)
        ++Cur;
    } else if (Ch == '{') {
      if (Interpolated) {
        // {{ is an escaped brace; a single { opens an interpolation.
        if (Doubled)
          ++Cur;
        else
          ++OpenBraces;
      }
    } else if (Ch == '}') {
      if (Interpolated) {
        // }} is an escaped brace; a single } closes an interpolation.
        if (Doubled)
          ++Cur;
        else if (OpenBraces > 0)
          --OpenBraces;
        else
          return End;
      }
    } else if (Ch == '"' && OpenBraces == 0) {
      // "" within a verbatim string is an escaped double quote; any other
      // quote terminates the literal.
      if (!(Verbatim && Doubled))
        return Cur;
      ++Cur;
    }
    ++Cur;
  }

  return End;
}
746 
747 void FormatTokenLexer::handleCSharpVerbatimAndInterpolatedStrings() {
748  FormatToken *CSharpStringLiteral = Tokens.back();
749 
750  if (CSharpStringLiteral->isNot(TT_CSharpStringLiteral))
751  return;
752 
753  auto &TokenText = CSharpStringLiteral->TokenText;
754 
755  bool Verbatim = false;
756  bool Interpolated = false;
757  if (TokenText.starts_with(R"($@")") || TokenText.starts_with(R"(@$")")) {
758  Verbatim = true;
759  Interpolated = true;
760  } else if (TokenText.starts_with(R"(@")")) {
761  Verbatim = true;
762  } else if (TokenText.starts_with(R"($")")) {
763  Interpolated = true;
764  }
765 
766  // Deal with multiline strings.
767  if (!Verbatim && !Interpolated)
768  return;
769 
770  const char *StrBegin = Lex->getBufferLocation() - TokenText.size();
771  const char *Offset = StrBegin;
772  if (Verbatim && Interpolated)
773  Offset += 3;
774  else
775  Offset += 2;
776 
777  const auto End = Lex->getBuffer().end();
778  Offset = lexCSharpString(Offset, End, Verbatim, Interpolated);
779 
780  // Make no attempt to format code properly if a verbatim string is
781  // unterminated.
782  if (Offset >= End)
783  return;
784 
785  StringRef LiteralText(StrBegin, Offset - StrBegin + 1);
786  TokenText = LiteralText;
787 
788  // Adjust width for potentially multiline string literals.
789  size_t FirstBreak = LiteralText.find('\n');
790  StringRef FirstLineText = FirstBreak == StringRef::npos
791  ? LiteralText
792  : LiteralText.substr(0, FirstBreak);
793  CSharpStringLiteral->ColumnWidth = encoding::columnWidthWithTabs(
794  FirstLineText, CSharpStringLiteral->OriginalColumn, Style.TabWidth,
795  Encoding);
796  size_t LastBreak = LiteralText.rfind('\n');
797  if (LastBreak != StringRef::npos) {
798  CSharpStringLiteral->IsMultiline = true;
799  unsigned StartColumn = 0;
800  CSharpStringLiteral->LastLineColumnWidth =
801  encoding::columnWidthWithTabs(LiteralText.substr(LastBreak + 1),
802  StartColumn, Style.TabWidth, Encoding);
803  }
804 
805  assert(Offset < End);
806  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset + 1)));
807 }
808 
809 void FormatTokenLexer::handleTableGenMultilineString() {
810  FormatToken *MultiLineString = Tokens.back();
811  if (MultiLineString->isNot(TT_TableGenMultiLineString))
812  return;
813 
814  auto OpenOffset = Lex->getCurrentBufferOffset() - 2 /* "[{" */;
815  // "}]" is the end of multi line string.
816  auto CloseOffset = Lex->getBuffer().find("}]", OpenOffset);
817  if (CloseOffset == StringRef::npos)
818  return;
819  auto Text = Lex->getBuffer().substr(OpenOffset, CloseOffset - OpenOffset + 2);
820  MultiLineString->TokenText = Text;
821  resetLexer(SourceMgr.getFileOffset(
822  Lex->getSourceLocation(Lex->getBufferLocation() - 2 + Text.size())));
823  auto FirstLineText = Text;
824  auto FirstBreak = Text.find('\n');
825  // Set ColumnWidth and LastLineColumnWidth when it has multiple lines.
826  if (FirstBreak != StringRef::npos) {
827  MultiLineString->IsMultiline = true;
828  FirstLineText = Text.substr(0, FirstBreak + 1);
829  // LastLineColumnWidth holds the width of the last line.
830  auto LastBreak = Text.rfind('\n');
831  MultiLineString->LastLineColumnWidth = encoding::columnWidthWithTabs(
832  Text.substr(LastBreak + 1), MultiLineString->OriginalColumn,
833  Style.TabWidth, Encoding);
834  }
835  // ColumnWidth holds only the width of the first line.
836  MultiLineString->ColumnWidth = encoding::columnWidthWithTabs(
837  FirstLineText, MultiLineString->OriginalColumn, Style.TabWidth, Encoding);
838 }
839 
840 void FormatTokenLexer::handleTableGenNumericLikeIdentifier() {
841  FormatToken *Tok = Tokens.back();
842  // TableGen identifiers can begin with digits. Such tokens are lexed as
843  // numeric_constant now.
844  if (Tok->isNot(tok::numeric_constant))
845  return;
846  StringRef Text = Tok->TokenText;
847  // The following check is based on llvm::TGLexer::LexToken.
848  // That lexes the token as a number if any of the following holds:
849  // 1. It starts with '+', '-'.
850  // 2. All the characters are digits.
851  // 3. The first non-digit character is 'b', and the next is '0' or '1'.
852  // 4. The first non-digit character is 'x', and the next is a hex digit.
853  // Note that in the case 3 and 4, if the next character does not exists in
854  // this token, the token is an identifier.
855  if (Text.size() < 1 || Text[0] == '+' || Text[0] == '-')
856  return;
857  const auto NonDigitPos = Text.find_if([](char C) { return !isdigit(C); });
858  // All the characters are digits
859  if (NonDigitPos == StringRef::npos)
860  return;
861  char FirstNonDigit = Text[NonDigitPos];
862  if (NonDigitPos < Text.size() - 1) {
863  char TheNext = Text[NonDigitPos + 1];
864  // Regarded as a binary number.
865  if (FirstNonDigit == 'b' && (TheNext == '0' || TheNext == '1'))
866  return;
867  // Regarded as hex number.
868  if (FirstNonDigit == 'x' && isxdigit(TheNext))
869  return;
870  }
871  if (isalpha(FirstNonDigit) || FirstNonDigit == '_') {
872  // This is actually an identifier in TableGen.
873  Tok->Tok.setKind(tok::identifier);
874  Tok->Tok.setIdentifierInfo(nullptr);
875  }
876 }
877 
878 void FormatTokenLexer::handleTemplateStrings() {
879  FormatToken *BacktickToken = Tokens.back();
880 
881  if (BacktickToken->is(tok::l_brace)) {
882  StateStack.push(LexerState::NORMAL);
883  return;
884  }
885  if (BacktickToken->is(tok::r_brace)) {
886  if (StateStack.size() == 1)
887  return;
888  StateStack.pop();
889  if (StateStack.top() != LexerState::TEMPLATE_STRING)
890  return;
891  // If back in TEMPLATE_STRING, fallthrough and continue parsing the
892  } else if (BacktickToken->is(tok::unknown) &&
893  BacktickToken->TokenText == "`") {
894  StateStack.push(LexerState::TEMPLATE_STRING);
895  } else {
896  return; // Not actually a template
897  }
898 
899  // 'Manually' lex ahead in the current file buffer.
900  const char *Offset = Lex->getBufferLocation();
901  const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`"
902  for (; Offset != Lex->getBuffer().end(); ++Offset) {
903  if (Offset[0] == '`') {
904  StateStack.pop();
905  ++Offset;
906  break;
907  }
908  if (Offset[0] == '\\') {
909  ++Offset; // Skip the escaped character.
910  } else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' &&
911  Offset[1] == '{') {
912  // '${' introduces an expression interpolation in the template string.
913  StateStack.push(LexerState::NORMAL);
914  Offset += 2;
915  break;
916  }
917  }
918 
919  StringRef LiteralText(TmplBegin, Offset - TmplBegin);
920  BacktickToken->setType(TT_TemplateString);
921  BacktickToken->Tok.setKind(tok::string_literal);
922  BacktickToken->TokenText = LiteralText;
923 
924  // Adjust width for potentially multiline string literals.
925  size_t FirstBreak = LiteralText.find('\n');
926  StringRef FirstLineText = FirstBreak == StringRef::npos
927  ? LiteralText
928  : LiteralText.substr(0, FirstBreak);
929  BacktickToken->ColumnWidth = encoding::columnWidthWithTabs(
930  FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding);
931  size_t LastBreak = LiteralText.rfind('\n');
932  if (LastBreak != StringRef::npos) {
933  BacktickToken->IsMultiline = true;
934  unsigned StartColumn = 0; // The template tail spans the entire line.
935  BacktickToken->LastLineColumnWidth =
936  encoding::columnWidthWithTabs(LiteralText.substr(LastBreak + 1),
937  StartColumn, Style.TabWidth, Encoding);
938  }
939 
940  SourceLocation loc = Lex->getSourceLocation(Offset);
941  resetLexer(SourceMgr.getFileOffset(loc));
942 }
943 
944 void FormatTokenLexer::tryParsePythonComment() {
945  FormatToken *HashToken = Tokens.back();
946  if (!HashToken->isOneOf(tok::hash, tok::hashhash))
947  return;
948  // Turn the remainder of this line into a comment.
949  const char *CommentBegin =
950  Lex->getBufferLocation() - HashToken->TokenText.size(); // at "#"
951  size_t From = CommentBegin - Lex->getBuffer().begin();
952  size_t To = Lex->getBuffer().find_first_of('\n', From);
953  if (To == StringRef::npos)
954  To = Lex->getBuffer().size();
955  size_t Len = To - From;
956  HashToken->setType(TT_LineComment);
957  HashToken->Tok.setKind(tok::comment);
958  HashToken->TokenText = Lex->getBuffer().substr(From, Len);
959  SourceLocation Loc = To < Lex->getBuffer().size()
960  ? Lex->getSourceLocation(CommentBegin + Len)
961  : SourceMgr.getLocForEndOfFile(ID);
962  resetLexer(SourceMgr.getFileOffset(Loc));
963 }
964 
965 bool FormatTokenLexer::tryMerge_TMacro() {
966  if (Tokens.size() < 4)
967  return false;
968  FormatToken *Last = Tokens.back();
969  if (Last->isNot(tok::r_paren))
970  return false;
971 
972  FormatToken *String = Tokens[Tokens.size() - 2];
973  if (String->isNot(tok::string_literal) || String->IsMultiline)
974  return false;
975 
976  if (Tokens[Tokens.size() - 3]->isNot(tok::l_paren))
977  return false;
978 
979  FormatToken *Macro = Tokens[Tokens.size() - 4];
980  if (Macro->TokenText != "_T")
981  return false;
982 
983  const char *Start = Macro->TokenText.data();
984  const char *End = Last->TokenText.data() + Last->TokenText.size();
985  String->TokenText = StringRef(Start, End - Start);
986  String->IsFirst = Macro->IsFirst;
987  String->LastNewlineOffset = Macro->LastNewlineOffset;
988  String->WhitespaceRange = Macro->WhitespaceRange;
989  String->OriginalColumn = Macro->OriginalColumn;
990  String->ColumnWidth = encoding::columnWidthWithTabs(
991  String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding);
992  String->NewlinesBefore = Macro->NewlinesBefore;
993  String->HasUnescapedNewline = Macro->HasUnescapedNewline;
994 
995  Tokens.pop_back();
996  Tokens.pop_back();
997  Tokens.pop_back();
998  Tokens.back() = String;
999  if (FirstInLineIndex >= Tokens.size())
1000  FirstInLineIndex = Tokens.size() - 1;
1001  return true;
1002 }
1003 
1004 bool FormatTokenLexer::tryMergeConflictMarkers() {
1005  if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(tok::eof))
1006  return false;
1007 
1008  // Conflict lines look like:
1009  // <marker> <text from the vcs>
1010  // For example:
1011  // >>>>>>> /file/in/file/system at revision 1234
1012  //
1013  // We merge all tokens in a line that starts with a conflict marker
1014  // into a single token with a special token type that the unwrapped line
1015  // parser will use to correctly rebuild the underlying code.
1016 
1017  FileID ID;
1018  // Get the position of the first token in the line.
1019  unsigned FirstInLineOffset;
1020  std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc(
1021  Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
1022  StringRef Buffer = SourceMgr.getBufferOrFake(ID).getBuffer();
1023  // Calculate the offset of the start of the current line.
1024  auto LineOffset = Buffer.rfind('\n', FirstInLineOffset);
1025  if (LineOffset == StringRef::npos)
1026  LineOffset = 0;
1027  else
1028  ++LineOffset;
1029 
1030  auto FirstSpace = Buffer.find_first_of(" \n", LineOffset);
1031  StringRef LineStart;
1032  if (FirstSpace == StringRef::npos)
1033  LineStart = Buffer.substr(LineOffset);
1034  else
1035  LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
1036 
1037  TokenType Type = TT_Unknown;
1038  if (LineStart == "<<<<<<<" || LineStart == ">>>>") {
1039  Type = TT_ConflictStart;
1040  } else if (LineStart == "|||||||" || LineStart == "=======" ||
1041  LineStart == "====") {
1042  Type = TT_ConflictAlternative;
1043  } else if (LineStart == ">>>>>>>" || LineStart == "<<<<") {
1044  Type = TT_ConflictEnd;
1045  }
1046 
1047  if (Type != TT_Unknown) {
1048  FormatToken *Next = Tokens.back();
1049 
1050  Tokens.resize(FirstInLineIndex + 1);
1051  // We do not need to build a complete token here, as we will skip it
1052  // during parsing anyway (as we must not touch whitespace around conflict
1053  // markers).
1054  Tokens.back()->setType(Type);
1055  Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
1056 
1057  Tokens.push_back(Next);
1058  return true;
1059  }
1060 
1061  return false;
1062 }
1063 
1064 FormatToken *FormatTokenLexer::getStashedToken() {
1065  // Create a synthesized second '>' or '<' token.
1066  Token Tok = FormatTok->Tok;
1067  StringRef TokenText = FormatTok->TokenText;
1068 
1069  unsigned OriginalColumn = FormatTok->OriginalColumn;
1070  FormatTok = new (Allocator.Allocate()) FormatToken;
1071  FormatTok->Tok = Tok;
1072  SourceLocation TokLocation =
1073  FormatTok->Tok.getLocation().getLocWithOffset(Tok.getLength() - 1);
1074  FormatTok->Tok.setLocation(TokLocation);
1075  FormatTok->WhitespaceRange = SourceRange(TokLocation, TokLocation);
1076  FormatTok->TokenText = TokenText;
1077  FormatTok->ColumnWidth = 1;
1078  FormatTok->OriginalColumn = OriginalColumn + 1;
1079 
1080  return FormatTok;
1081 }
1082 
1083 /// Truncate the current token to the new length and make the lexer continue
1084 /// from the end of the truncated token. Used for other languages that have
1085 /// different token boundaries, like JavaScript in which a comment ends at a
1086 /// line break regardless of whether the line break follows a backslash. Also
1087 /// used to set the lexer to the end of whitespace if the lexer regards
1088 /// whitespace and an unrecognized symbol as one token.
1089 void FormatTokenLexer::truncateToken(size_t NewLen) {
1090  assert(NewLen <= FormatTok->TokenText.size());
1091  resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(
1092  Lex->getBufferLocation() - FormatTok->TokenText.size() + NewLen)));
1093  FormatTok->TokenText = FormatTok->TokenText.substr(0, NewLen);
1095  FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth,
1096  Encoding);
1097  FormatTok->Tok.setLength(NewLen);
1098 }
1099 
1100 /// Count the length of leading whitespace in a token.
1101 static size_t countLeadingWhitespace(StringRef Text) {
1102  // Basically counting the length matched by this regex.
1103  // "^([\n\r\f\v \t]|(\\\\|\\?\\?/)[\n\r])+"
1104  // Directly using the regex turned out to be slow. With the regex
1105  // version formatting all files in this directory took about 1.25
1106  // seconds. This version took about 0.5 seconds.
1107  const unsigned char *const Begin = Text.bytes_begin();
1108  const unsigned char *const End = Text.bytes_end();
1109  const unsigned char *Cur = Begin;
1110  while (Cur < End) {
1111  if (isspace(Cur[0])) {
1112  ++Cur;
1113  } else if (Cur[0] == '\\' && (Cur[1] == '\n' || Cur[1] == '\r')) {
1114  // A '\' followed by a newline always escapes the newline, regardless
1115  // of whether there is another '\' before it.
1116  // The source has a null byte at the end. So the end of the entire input
1117  // isn't reached yet. Also the lexer doesn't break apart an escaped
1118  // newline.
1119  assert(End - Cur >= 2);
1120  Cur += 2;
1121  } else if (Cur[0] == '?' && Cur[1] == '?' && Cur[2] == '/' &&
1122  (Cur[3] == '\n' || Cur[3] == '\r')) {
1123  // Newlines can also be escaped by a '?' '?' '/' trigraph. By the way, the
1124  // characters are quoted individually in this comment because if we write
1125  // them together some compilers warn that we have a trigraph in the code.
1126  assert(End - Cur >= 4);
1127  Cur += 4;
1128  } else {
1129  break;
1130  }
1131  }
1132  return Cur - Begin;
1133 }
1134 
1135 FormatToken *FormatTokenLexer::getNextToken() {
1136  if (StateStack.top() == LexerState::TOKEN_STASHED) {
1137  StateStack.pop();
1138  return getStashedToken();
1139  }
1140 
1141  FormatTok = new (Allocator.Allocate()) FormatToken;
1142  readRawToken(*FormatTok);
1143  SourceLocation WhitespaceStart =
1144  FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
1145  FormatTok->IsFirst = IsFirstToken;
1146  IsFirstToken = false;
1147 
1148  // Consume and record whitespace until we find a significant token.
1149  // Some tok::unknown tokens are not just whitespace, e.g. whitespace
1150  // followed by a symbol such as backtick. Those symbols may be
1151  // significant in other languages.
1152  unsigned WhitespaceLength = TrailingWhitespace;
1153  while (FormatTok->isNot(tok::eof)) {
1154  auto LeadingWhitespace = countLeadingWhitespace(FormatTok->TokenText);
1155  if (LeadingWhitespace == 0)
1156  break;
1157  if (LeadingWhitespace < FormatTok->TokenText.size())
1158  truncateToken(LeadingWhitespace);
1159  StringRef Text = FormatTok->TokenText;
1160  bool InEscape = false;
1161  for (int i = 0, e = Text.size(); i != e; ++i) {
1162  switch (Text[i]) {
1163  case '\r':
1164  // If this is a CRLF sequence, break here and the LF will be handled on
1165  // the next loop iteration. Otherwise, this is a single Mac CR, treat it
1166  // the same as a single LF.
1167  if (i + 1 < e && Text[i + 1] == '\n')
1168  break;
1169  [[fallthrough]];
1170  case '\n':
1171  ++FormatTok->NewlinesBefore;
1172  if (!InEscape)
1173  FormatTok->HasUnescapedNewline = true;
1174  else
1175  InEscape = false;
1176  FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
1177  Column = 0;
1178  break;
1179  case '\f':
1180  case '\v':
1181  Column = 0;
1182  break;
1183  case ' ':
1184  ++Column;
1185  break;
1186  case '\t':
1187  Column +=
1188  Style.TabWidth - (Style.TabWidth ? Column % Style.TabWidth : 0);
1189  break;
1190  case '\\':
1191  case '?':
1192  case '/':
1193  // The text was entirely whitespace when this loop was entered. Thus
1194  // this has to be an escape sequence.
1195  assert(Text.substr(i, 2) == "\\\r" || Text.substr(i, 2) == "\\\n" ||
1196  Text.substr(i, 4) == "\?\?/\r" ||
1197  Text.substr(i, 4) == "\?\?/\n" ||
1198  (i >= 1 && (Text.substr(i - 1, 4) == "\?\?/\r" ||
1199  Text.substr(i - 1, 4) == "\?\?/\n")) ||
1200  (i >= 2 && (Text.substr(i - 2, 4) == "\?\?/\r" ||
1201  Text.substr(i - 2, 4) == "\?\?/\n")));
1202  InEscape = true;
1203  break;
1204  default:
1205  // This shouldn't happen.
1206  assert(false);
1207  break;
1208  }
1209  }
1210  WhitespaceLength += Text.size();
1211  readRawToken(*FormatTok);
1212  }
1213 
1214  if (FormatTok->is(tok::unknown))
1215  FormatTok->setType(TT_ImplicitStringLiteral);
1216 
1217  // JavaScript and Java do not allow to escape the end of the line with a
1218  // backslash. Backslashes are syntax errors in plain source, but can occur in
1219  // comments. When a single line comment ends with a \, it'll cause the next
1220  // line of code to be lexed as a comment, breaking formatting. The code below
1221  // finds comments that contain a backslash followed by a line break, truncates
1222  // the comment token at the backslash, and resets the lexer to restart behind
1223  // the backslash.
1224  if ((Style.isJavaScript() || Style.Language == FormatStyle::LK_Java) &&
1225  FormatTok->is(tok::comment) && FormatTok->TokenText.starts_with("//")) {
1226  size_t BackslashPos = FormatTok->TokenText.find('\\');
1227  while (BackslashPos != StringRef::npos) {
1228  if (BackslashPos + 1 < FormatTok->TokenText.size() &&
1229  FormatTok->TokenText[BackslashPos + 1] == '\n') {
1230  truncateToken(BackslashPos + 1);
1231  break;
1232  }
1233  BackslashPos = FormatTok->TokenText.find('\\', BackslashPos + 1);
1234  }
1235  }
1236 
1237  if (Style.isVerilog()) {
1238  static const llvm::Regex NumberBase("^s?[bdho]", llvm::Regex::IgnoreCase);
1239  SmallVector<StringRef, 1> Matches;
1240  // Verilog uses the backtick instead of the hash for preprocessor stuff.
1241  // And it uses the hash for delays and parameter lists. In order to continue
1242  // using `tok::hash` in other places, the backtick gets marked as the hash
1243  // here. And in order to tell the backtick and hash apart for
1244  // Verilog-specific stuff, the hash becomes an identifier.
1245  if (FormatTok->is(tok::numeric_constant)) {
1246  // In Verilog the quote is not part of a number.
1247  auto Quote = FormatTok->TokenText.find('\'');
1248  if (Quote != StringRef::npos)
1249  truncateToken(Quote);
1250  } else if (FormatTok->isOneOf(tok::hash, tok::hashhash)) {
1251  FormatTok->Tok.setKind(tok::raw_identifier);
1252  } else if (FormatTok->is(tok::raw_identifier)) {
1253  if (FormatTok->TokenText == "`") {
1254  FormatTok->Tok.setIdentifierInfo(nullptr);
1255  FormatTok->Tok.setKind(tok::hash);
1256  } else if (FormatTok->TokenText == "``") {
1257  FormatTok->Tok.setIdentifierInfo(nullptr);
1258  FormatTok->Tok.setKind(tok::hashhash);
1259  } else if (Tokens.size() > 0 &&
1260  Tokens.back()->is(Keywords.kw_apostrophe) &&
1261  NumberBase.match(FormatTok->TokenText, &Matches)) {
1262  // In Verilog in a based number literal like `'b10`, there may be
1263  // whitespace between `'b` and `10`. Therefore we handle the base and
1264  // the rest of the number literal as two tokens. But if there is no
1265  // space in the input code, we need to manually separate the two parts.
1266  truncateToken(Matches[0].size());
1267  FormatTok->setFinalizedType(TT_VerilogNumberBase);
1268  }
1269  }
1270  }
1271 
1272  FormatTok->WhitespaceRange = SourceRange(
1273  WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));
1274 
1275  FormatTok->OriginalColumn = Column;
1276 
1277  TrailingWhitespace = 0;
1278  if (FormatTok->is(tok::comment)) {
1279  // FIXME: Add the trimmed whitespace to Column.
1280  StringRef UntrimmedText = FormatTok->TokenText;
1281  FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f");
1282  TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size();
1283  } else if (FormatTok->is(tok::raw_identifier)) {
1284  IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
1285  FormatTok->Tok.setIdentifierInfo(&Info);
1286  FormatTok->Tok.setKind(Info.getTokenID());
1287  if (Style.Language == FormatStyle::LK_Java &&
1288  FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
1289  tok::kw_operator)) {
1290  FormatTok->Tok.setKind(tok::identifier);
1291  FormatTok->Tok.setIdentifierInfo(nullptr);
1292  } else if (Style.isJavaScript() &&
1293  FormatTok->isOneOf(tok::kw_struct, tok::kw_union,
1294  tok::kw_operator)) {
1295  FormatTok->Tok.setKind(tok::identifier);
1296  FormatTok->Tok.setIdentifierInfo(nullptr);
1297  } else if (Style.isTableGen() && !Keywords.isTableGenKeyword(*FormatTok)) {
1298  FormatTok->Tok.setKind(tok::identifier);
1299  FormatTok->Tok.setIdentifierInfo(nullptr);
1300  }
1301  } else if (FormatTok->is(tok::greatergreater)) {
1302  FormatTok->Tok.setKind(tok::greater);
1303  FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
1304  ++Column;
1305  StateStack.push(LexerState::TOKEN_STASHED);
1306  } else if (FormatTok->is(tok::lessless)) {
1307  FormatTok->Tok.setKind(tok::less);
1308  FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
1309  ++Column;
1310  StateStack.push(LexerState::TOKEN_STASHED);
1311  }
1312 
1313  if (Style.isVerilog() && Tokens.size() > 0 &&
1314  Tokens.back()->is(TT_VerilogNumberBase) &&
1315  FormatTok->Tok.isOneOf(tok::identifier, tok::question)) {
1316  // Mark the number following a base like `'h?a0` as a number.
1317  FormatTok->Tok.setKind(tok::numeric_constant);
1318  }
1319 
1320  // Now FormatTok is the next non-whitespace token.
1321 
1322  StringRef Text = FormatTok->TokenText;
1323  size_t FirstNewlinePos = Text.find('\n');
1324  if (FirstNewlinePos == StringRef::npos) {
1325  // FIXME: ColumnWidth actually depends on the start column, we need to
1326  // take this into account when the token is moved.
1327  FormatTok->ColumnWidth =
1328  encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding);
1329  Column += FormatTok->ColumnWidth;
1330  } else {
1331  FormatTok->IsMultiline = true;
1332  // FIXME: ColumnWidth actually depends on the start column, we need to
1333  // take this into account when the token is moved.
1335  Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding);
1336 
1337  // The last line of the token always starts in column 0.
1338  // Thus, the length can be precomputed even in the presence of tabs.
1340  Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth, Encoding);
1341  Column = FormatTok->LastLineColumnWidth;
1342  }
1343 
1344  if (Style.isCpp()) {
1345  auto *Identifier = FormatTok->Tok.getIdentifierInfo();
1346  auto it = Macros.find(Identifier);
1347  if (!(Tokens.size() > 0 && Tokens.back()->Tok.getIdentifierInfo() &&
1348  Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() ==
1349  tok::pp_define) &&
1350  it != Macros.end()) {
1351  FormatTok->setType(it->second);
1352  if (it->second == TT_IfMacro) {
1353  // The lexer token currently has type tok::kw_unknown. However, for this
1354  // substitution to be treated correctly in the TokenAnnotator, faking
1355  // the tok value seems to be needed. Not sure if there's a more elegant
1356  // way.
1357  FormatTok->Tok.setKind(tok::kw_if);
1358  }
1359  } else if (FormatTok->is(tok::identifier)) {
1360  if (MacroBlockBeginRegex.match(Text))
1361  FormatTok->setType(TT_MacroBlockBegin);
1362  else if (MacroBlockEndRegex.match(Text))
1363  FormatTok->setType(TT_MacroBlockEnd);
1364  else if (TypeNames.contains(Identifier))
1365  FormatTok->setFinalizedType(TT_TypeName);
1366  }
1367  }
1368 
1369  return FormatTok;
1370 }
1371 
1372 bool FormatTokenLexer::readRawTokenVerilogSpecific(Token &Tok) {
1373  // In Verilog the quote is not a character literal.
1374  //
1375  // Make the backtick and double backtick identifiers to match against them
1376  // more easily.
1377  //
1378  // In Verilog an escaped identifier starts with backslash and ends with
1379  // whitespace. Unless that whitespace is an escaped newline. A backslash can
1380  // also begin an escaped newline outside of an escaped identifier. We check
1381  // for that outside of the Regex since we can't use negative lookhead
1382  // assertions. Simply changing the '*' to '+' breaks stuff as the escaped
1383  // identifier may have a length of 0 according to Section A.9.3.
1384  // FIXME: If there is an escaped newline in the middle of an escaped
1385  // identifier, allow for pasting the two lines together, But escaped
1386  // identifiers usually occur only in generated code anyway.
1387  static const llvm::Regex VerilogToken(R"re(^('|``?|\\‍(\\)re"
1388  "(\r?\n|\r)|[^[:space:]])*)");
1389 
1390  SmallVector<StringRef, 4> Matches;
1391  const char *Start = Lex->getBufferLocation();
1392  if (!VerilogToken.match(StringRef(Start, Lex->getBuffer().end() - Start),
1393  &Matches)) {
1394  return false;
1395  }
1396  // There is a null byte at the end of the buffer, so we don't have to check
1397  // Start[1] is within the buffer.
1398  if (Start[0] == '\\' && (Start[1] == '\r' || Start[1] == '\n'))
1399  return false;
1400  size_t Len = Matches[0].size();
1401 
1402  // The kind has to be an identifier so we can match it against those defined
1403  // in Keywords. The kind has to be set before the length because the setLength
1404  // function checks that the kind is not an annotation.
1405  Tok.setKind(tok::raw_identifier);
1406  Tok.setLength(Len);
1407  Tok.setLocation(Lex->getSourceLocation(Start, Len));
1408  Tok.setRawIdentifierData(Start);
1409  Lex->seek(Lex->getCurrentBufferOffset() + Len, /*IsAtStartofline=*/false);
1410  return true;
1411 }
1412 
1413 void FormatTokenLexer::readRawToken(FormatToken &Tok) {
1414  // For Verilog, first see if there is a special token, and fall back to the
1415  // normal lexer if there isn't one.
1416  if (!Style.isVerilog() || !readRawTokenVerilogSpecific(Tok.Tok))
1417  Lex->LexFromRawLexer(Tok.Tok);
1418  Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()),
1419  Tok.Tok.getLength());
1420  // For formatting, treat unterminated string literals like normal string
1421  // literals.
1422  if (Tok.is(tok::unknown)) {
1423  if (Tok.TokenText.starts_with("\"")) {
1424  Tok.Tok.setKind(tok::string_literal);
1425  Tok.IsUnterminatedLiteral = true;
1426  } else if (Style.isJavaScript() && Tok.TokenText == "''") {
1427  Tok.Tok.setKind(tok::string_literal);
1428  }
1429  }
1430 
1431  if ((Style.isJavaScript() || Style.isProto()) && Tok.is(tok::char_constant))
1432  Tok.Tok.setKind(tok::string_literal);
1433 
1434  if (Tok.is(tok::comment) && isClangFormatOn(Tok.TokenText))
1435  FormattingDisabled = false;
1436 
1437  Tok.Finalized = FormattingDisabled;
1438 
1439  if (Tok.is(tok::comment) && isClangFormatOff(Tok.TokenText))
1440  FormattingDisabled = true;
1441 }
1442 
1443 void FormatTokenLexer::resetLexer(unsigned Offset) {
1444  StringRef Buffer = SourceMgr.getBufferData(ID);
1445  Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID), LangOpts,
1446  Buffer.begin(), Buffer.begin() + Offset, Buffer.end()));
1447  Lex->SetKeepWhitespaceMode(true);
1448  TrailingWhitespace = 0;
1449 }
1450 
1451 } // namespace format
1452 } // namespace clang
MatchType Type
static char ID
Definition: Arena.cpp:183
This file contains FormatTokenLexer, which tokenizes a source file into a token stream suitable for C...
This file contains the declaration of the FormatToken, a wrapper around Token with additional informa...
StringRef Text
Definition: Format.cpp:2977
unsigned Offset
Definition: Format.cpp:2978
StringRef Identifier
Definition: Format.cpp:2984
Various functions to configurably format source code.
#define X(type, name)
Definition: Value.h:143
SourceLocation Loc
Definition: SemaObjC.cpp:755
Defines the clang::SourceLocation class and associated facilities.
Defines the SourceManager interface.
SourceLocation End
SourceLocation Begin
An opaque identifier used by SourceManager which refers to a source file (MemoryBuffer) along with it...
Implements an efficient mapping from strings to IdentifierInfo nodes.
IdentifierInfo & get(StringRef Name)
Return the identifier token info for the specified named identifier.
Lexer - This provides a simple interface that turns a text buffer into a stream of tokens.
Definition: Lexer.h:78
SourceLocation getLocWithOffset(IntTy Offset) const
Return a source location with the specified offset from this SourceLocation.
This class handles loading and caching of source files into memory.
unsigned getFileOffset(SourceLocation SpellingLoc) const
Returns the offset from the start of the file that the specified SourceLocation represents.
StringRef getBufferData(FileID FID, bool *Invalid=nullptr) const
Return a StringRef to the source buffer data for the specified FileID.
SourceLocation getLocForEndOfFile(FileID FID) const
Return the source location corresponding to the last byte of the specified file.
const char * getCharacterData(SourceLocation SL, bool *Invalid=nullptr) const
Return a pointer to the start of the specified location in the appropriate spelling MemoryBuffer.
llvm::MemoryBufferRef getBufferOrFake(FileID FID, SourceLocation Loc=SourceLocation()) const
Return the buffer for the specified FileID.
SourceLocation getLocForStartOfFile(FileID FID) const
Return the source location corresponding to the first byte of the specified file.
std::pair< FileID, unsigned > getDecomposedLoc(SourceLocation Loc) const
Decompose the specified location into a raw FileID + Offset pair.
SourceLocation getLocation() const
Return a source location identifier for the specified offset in the current file.
Definition: Token.h:132
void setLength(unsigned Len)
Definition: Token.h:141
void setKind(tok::TokenKind K)
Definition: Token.h:95
IdentifierInfo * getIdentifierInfo() const
Definition: Token.h:187
void setLocation(SourceLocation L)
Definition: Token.h:140
bool isOneOf(tok::TokenKind K1, tok::TokenKind K2) const
Definition: Token.h:101
void setIdentifierInfo(IdentifierInfo *II)
Definition: Token.h:196
FormatTokenLexer(const SourceManager &SourceMgr, FileID ID, unsigned Column, const FormatStyle &Style, encoding::Encoding Encoding, llvm::SpecificBumpPtrAllocator< FormatToken > &Allocator, IdentifierTable &IdentTable)
ArrayRef< FormatToken * > lex()
unsigned columnWidthWithTabs(StringRef Text, unsigned StartColumn, unsigned TabWidth, Encoding Encoding)
Returns the number of columns required to display the Text, starting from the StartColumn on a termin...
Definition: Encoding.h:60
static auto lexCSharpString(const char *Begin, const char *End, bool Verbatim, bool Interpolated)
static size_t countLeadingWhitespace(StringRef Text)
Count the length of leading whitespace in a token.
bool isClangFormatOff(StringRef Comment)
Definition: Format.cpp:4138
LangOptions getFormattingLangOpts(const FormatStyle &Style=getLLVMStyle())
Returns the LangOpts that the formatter expects you to set.
Definition: Format.cpp:3841
bool isClangFormatOn(StringRef Comment)
Definition: Format.cpp:4134
TokenType
Determines the semantic type of a syntactic token, e.g.
Definition: FormatToken.h:205
TokenKind
Provides a simple uniform namespace for tokens from all C languages.
Definition: TokenKinds.h:25
The JSON file list parser is used to communicate input to InstallAPI.
#define true
Definition: stdbool.h:25
bool isTableGenKeyword(const FormatToken &Tok) const
Definition: FormatToken.h:1899
bool isCSharpKeyword(const FormatToken &Tok) const
Returns true if Tok is a C# keyword, returns false if it is a anything else.
Definition: FormatToken.h:1697
The FormatStyle is used to configure the formatting to follow specific guidelines.
Definition: Format.h:55
bool isTableGen() const
Definition: Format.h:3188
@ LK_Java
Should be used for Java.
Definition: Format.h:3160
@ LK_TextProto
Should be used for Protocol Buffer messages in text format (https://developers.google....
Definition: Format.h:3174
std::vector< std::string > AttributeMacros
This option is renamed to BreakTemplateDeclarations.
Definition: Format.h:1174
std::string MacroBlockBegin
A regular expression matching macros that start a block.
Definition: Format.h:3237
LanguageKind Language
Language, this format style is targeted at.
Definition: Format.h:3192
unsigned TabWidth
The number of columns used for tab stops.
Definition: Format.h:4854
std::vector< std::string > StatementAttributeLikeMacros
Macros which are ignored in front of a statement, as if they were an attribute.
Definition: Format.h:4785
std::vector< std::string > IfMacros
A vector of macros that should be interpreted as conditionals instead of as function calls.
Definition: Format.h:2662
std::vector< std::string > ForEachMacros
A vector of macros that should be interpreted as foreach loops instead of as function calls.
Definition: Format.h:2639
std::vector< std::string > TypeNames
A vector of non-keyword identifiers that should be interpreted as type names.
Definition: Format.h:4864
bool isCSharp() const
Definition: Format.h:3181
std::vector< std::string > WhitespaceSensitiveMacros
A vector of macros which are whitespace-sensitive and should not be touched.
Definition: Format.h:4937
bool isProto() const
Definition: Format.h:3185
bool isVerilog() const
Definition: Format.h:3184
bool isJavaScript() const
Definition: Format.h:3183
std::vector< std::string > NamespaceMacros
A vector of macros which are used to open namespace blocks.
Definition: Format.h:3346
std::vector< std::string > StatementMacros
A vector of macros that should be interpreted as complete statements.
Definition: Format.h:4796
std::string MacroBlockEnd
A regular expression matching macros that end a block.
Definition: Format.h:3241
std::vector< std::string > TypenameMacros
A vector of macros that should be interpreted as type declarations instead of as function calls.
Definition: Format.h:4881
unsigned OriginalColumn
The original 0-based column of this token, including expanded tabs.
Definition: FormatToken.h:500
bool isNot(T Kind) const
Definition: FormatToken.h:621
StringRef TokenText
The raw text of the token.
Definition: FormatToken.h:310
unsigned LastNewlineOffset
The offset just past the last '\n' in this token's leading whitespace (relative to WhiteSpaceStart).
Definition: FormatToken.h:469
unsigned IsMultiline
Whether the token text contains newlines (escaped or not).
Definition: FormatToken.h:326
unsigned NewlinesBefore
The number of newlines immediately before the Token.
Definition: FormatToken.h:459
unsigned HasUnescapedNewline
Whether there is at least one unescaped newline before the Token.
Definition: FormatToken.h:323
unsigned ColumnWidth
The width of the non-whitespace parts of the token (or its first line for multi-line tokens) in colum...
Definition: FormatToken.h:474
void setType(TokenType T)
Definition: FormatToken.h:424
bool is(tok::TokenKind Kind) const
Definition: FormatToken.h:602
unsigned LastLineColumnWidth
Contains the width in columns of the last line of a multi-line token.
Definition: FormatToken.h:478
bool isOneOf(A K1, B K2) const
Definition: FormatToken.h:614
unsigned IsFirst
Indicates that this is the first token of the file.
Definition: FormatToken.h:329
SourceRange WhitespaceRange
The range of the whitespace immediately preceding the Token.
Definition: FormatToken.h:319
void setFinalizedType(TokenType T)
Sets the type and also the finalized flag.
Definition: FormatToken.h:438