clang  19.0.0git
UnwrappedLineParser.h
Go to the documentation of this file.
1 //===--- UnwrappedLineParser.h - Format C++ code ----------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// This file contains the declaration of the UnwrappedLineParser,
11 /// which turns a stream of tokens into UnwrappedLines.
12 ///
13 //===----------------------------------------------------------------------===//
14 
15 #ifndef LLVM_CLANG_LIB_FORMAT_UNWRAPPEDLINEPARSER_H
16 #define LLVM_CLANG_LIB_FORMAT_UNWRAPPEDLINEPARSER_H
17 
18 #include "Macros.h"
19 #include <stack>
20 
21 namespace clang {
22 namespace format {
23 
24 struct UnwrappedLineNode;
25 
26 /// An unwrapped line is a sequence of \c Token that we would like to
27 /// put on a single line if there were no column limit.
28 ///
29 /// This is used as a main interface between the \c UnwrappedLineParser and the
30 /// \c UnwrappedLineFormatter. The key property is that changing the formatting
31 /// within an unwrapped line does not affect any other unwrapped lines.
32 struct UnwrappedLine {
33  UnwrappedLine() = default;
34 
35  /// The \c Tokens comprising this \c UnwrappedLine.
36  std::list<UnwrappedLineNode> Tokens;
37 
38  /// The indent level of the \c UnwrappedLine.
39  unsigned Level = 0;
40 
41  /// The \c PPBranchLevel (adjusted for header guards) if this line is a
42  /// \c InMacroBody line, and 0 otherwise.
43  unsigned PPLevel = 0;
44 
45  /// Whether this \c UnwrappedLine is part of a preprocessor directive.
46  bool InPPDirective = false;
47  /// Whether this \c UnwrappedLine is part of a pragma directive.
48  bool InPragmaDirective = false;
49  /// Whether this \c UnwrappedLine is part of a macro body.
50  bool InMacroBody = false;
51 
52  bool MustBeDeclaration = false; ///< Whether this line must be a declaration.
53 
54  /// Whether the parser has seen \c decltype(auto) in this line.
55  bool SeenDecltypeAuto = false;
56 
57  /// \c True if this line should be indented by ContinuationIndent in
58  /// addition to the normal indentation level.
59  bool IsContinuation = false;
60 
61  /// If this \c UnwrappedLine closes a block in a sequence of lines,
62  /// \c MatchingOpeningBlockLineIndex stores the index of the corresponding
63  /// opening line. Otherwise, \c MatchingOpeningBlockLineIndex must be
64  /// \c kInvalidIndex.
66 
67  /// If this \c UnwrappedLine opens a block, stores the index of the
68  /// line with the corresponding closing brace.
70 
71  static const size_t kInvalidIndex = -1; // -1 converts to the largest size_t value.
72 
73  unsigned FirstStartColumn = 0;
74 };
75 
76 /// Interface for users of the UnwrappedLineParser to receive the parsed lines.
77 /// Parsing a single snippet of code can lead to multiple runs, where each
78 /// run is a coherent view of the file.
79 ///
80 /// For example, different runs are generated:
81 /// - for different combinations of #if blocks
82 /// - when macros are involved, for the expanded code and the as-written code
83 ///
84 /// Some tokens will only be visible in a subset of the runs.
85 /// For each run, \c UnwrappedLineParser will call \c consumeUnwrappedLine
86 /// for each parsed unwrapped line, and then \c finishRun to indicate
87 /// that the set of unwrapped lines before is one coherent view of the
88 /// code snippet to be formatted.
90 public:
92  virtual void consumeUnwrappedLine(const UnwrappedLine &Line) = 0;
93  virtual void finishRun() = 0;
94 };
95 
96 class FormatTokenSource;
97 
99 public:
100  UnwrappedLineParser(SourceManager &SourceMgr, const FormatStyle &Style,
101  const AdditionalKeywords &Keywords,
102  unsigned FirstStartColumn, ArrayRef<FormatToken *> Tokens,
103  UnwrappedLineConsumer &Callback,
104  llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
105  IdentifierTable &IdentTable);
106 
107  void parse();
108 
109 private:
110  enum class IfStmtKind {
111  NotIf, // Not an if statement.
112  IfOnly, // An if statement without the else clause.
113  IfElse, // An if statement followed by else but not else if.
114  IfElseIf // An if statement followed by else if.
115  };
116 
117  void reset();
118  void parseFile();
119  bool precededByCommentOrPPDirective() const;
120  bool parseLevel(const FormatToken *OpeningBrace = nullptr,
121  IfStmtKind *IfKind = nullptr,
122  FormatToken **IfLeftBrace = nullptr);
123  bool mightFitOnOneLine(UnwrappedLine &Line,
124  const FormatToken *OpeningBrace = nullptr) const;
125  FormatToken *parseBlock(bool MustBeDeclaration = false,
126  unsigned AddLevels = 1u, bool MunchSemi = true,
127  bool KeepBraces = true, IfStmtKind *IfKind = nullptr,
128  bool UnindentWhitesmithsBraces = false);
129  void parseChildBlock();
130  void parsePPDirective();
131  void parsePPDefine();
132  void parsePPIf(bool IfDef);
133  void parsePPElse();
134  void parsePPEndIf();
135  void parsePPPragma();
136  void parsePPUnknown();
137  void readTokenWithJavaScriptASI();
138  void parseStructuralElement(const FormatToken *OpeningBrace = nullptr,
139  IfStmtKind *IfKind = nullptr,
140  FormatToken **IfLeftBrace = nullptr,
141  bool *HasDoWhile = nullptr,
142  bool *HasLabel = nullptr);
143  bool tryToParseBracedList();
144  bool parseBracedList(bool IsAngleBracket = false, bool IsEnum = false);
145  bool parseParens(TokenType AmpAmpTokenType = TT_Unknown);
146  void parseSquare(bool LambdaIntroducer = false);
147  void keepAncestorBraces();
148  void parseUnbracedBody(bool CheckEOF = false);
149  void handleAttributes();
150  bool handleCppAttributes();
151  bool isBlockBegin(const FormatToken &Tok) const;
152  FormatToken *parseIfThenElse(IfStmtKind *IfKind, bool KeepBraces = false,
153  bool IsVerilogAssert = false);
154  void parseTryCatch();
155  void parseLoopBody(bool KeepBraces, bool WrapRightBrace);
156  void parseForOrWhileLoop(bool HasParens = true);
157  void parseDoWhile();
158  void parseLabel(bool LeftAlignLabel = false);
159  void parseCaseLabel();
160  void parseSwitch(bool IsExpr);
161  void parseNamespace();
162  bool parseModuleImport();
163  void parseNew();
164  void parseAccessSpecifier();
165  bool parseEnum();
166  bool parseStructLike();
167  bool parseRequires();
168  void parseRequiresClause(FormatToken *RequiresToken);
169  void parseRequiresExpression(FormatToken *RequiresToken);
170  void parseConstraintExpression();
171  void parseJavaEnumBody();
172  // Parses a record (aka class) as a top level element. If ParseAsExpr is true,
173  // parses the record as a child block, i.e. if the class declaration is an
174  // expression.
175  void parseRecord(bool ParseAsExpr = false);
176  void parseObjCLightweightGenerics();
177  void parseObjCMethod();
178  void parseObjCProtocolList();
179  void parseObjCUntilAtEnd();
180  void parseObjCInterfaceOrImplementation();
181  bool parseObjCProtocol();
182  void parseJavaScriptEs6ImportExport();
183  void parseStatementMacro();
184  void parseCSharpAttribute();
185  // Parse a C# generic type constraint: `where T : IComparable<T>`.
186  // See:
187  // https://docs.microsoft.com/en-us/dotnet/csharp/language-reference/keywords/where-generic-type-constraint
188  void parseCSharpGenericTypeConstraint();
189  bool tryToParseLambda();
190  bool tryToParseChildBlock();
191  bool tryToParseLambdaIntroducer();
192  bool tryToParsePropertyAccessor();
193  void tryToParseJSFunction();
194  bool tryToParseSimpleAttribute();
195  void parseVerilogHierarchyIdentifier();
196  void parseVerilogSensitivityList();
197  // Returns the number of levels of indentation in addition to the normal 1
198  // level for a block, used for indenting case labels.
199  unsigned parseVerilogHierarchyHeader();
200  void parseVerilogTable();
201  void parseVerilogCaseLabel();
202  std::optional<llvm::SmallVector<llvm::SmallVector<FormatToken *, 8>, 1>>
203  parseMacroCall();
204 
205  // Used by addUnwrappedLine to denote whether to keep or remove a level
206  // when resetting the line state.
207  enum class LineLevel { Remove, Keep };
208 
209  void addUnwrappedLine(LineLevel AdjustLevel = LineLevel::Remove);
210  bool eof() const;
211  // LevelDifference is the difference of levels after and before the current
212  // token. For example:
213  // - if the token is '{' and opens a block, LevelDifference is 1.
214  // - if the token is '}' and closes a block, LevelDifference is -1.
215  void nextToken(int LevelDifference = 0);
216  void readToken(int LevelDifference = 0);
217 
218  // Decides which comment tokens should be added to the current line and which
219  // should be added as comments before the next token.
220  //
221  // Comments specifies the sequence of comment tokens to analyze. They get
222  // either pushed to the current line or added to the comments before the next
223  // token.
224  //
225  // NextTok specifies the next token. A null pointer NextTok is supported, and
226  // signifies either the absence of a next token, or that the next token
227  // shouldn't be taken into account for the analysis.
228  void distributeComments(const SmallVectorImpl<FormatToken *> &Comments,
229  const FormatToken *NextTok);
230 
231  // Adds the comment preceding the next token to unwrapped lines.
232  void flushComments(bool NewlineBeforeNext);
233  void pushToken(FormatToken *Tok);
234  void calculateBraceTypes(bool ExpectClassBody = false);
235  void setPreviousRBraceType(TokenType Type);
236 
237  // Marks a conditional compilation edge (for example, an '#if', '#ifdef',
238  // '#else' or merge conflict marker). If 'Unreachable' is true, assumes
239  // this branch either cannot be taken (for example '#if false'), or should
240  // not be taken in this round.
241  void conditionalCompilationCondition(bool Unreachable);
242  void conditionalCompilationStart(bool Unreachable);
243  void conditionalCompilationAlternative();
244  void conditionalCompilationEnd();
245 
246  bool isOnNewLine(const FormatToken &FormatTok);
247 
248  // Returns whether there is a macro expansion in the line, i.e. a token that
249  // was expanded from a macro call.
250  bool containsExpansion(const UnwrappedLine &Line) const;
251 
252  // Compute hash of the current preprocessor branch.
253  // This is used to identify the different branches, and thus track if block
254  // open and close in the same branch.
255  size_t computePPHash() const;
256 
257  bool parsingPPDirective() const { return CurrentLines != &Lines; }
258 
259  // FIXME: We are constantly running into bugs where Line.Level is incorrectly
260  // decremented below 0. Introduce a method to subtract from Line.Level
261  // and use that everywhere in the Parser.
262  std::unique_ptr<UnwrappedLine> Line;
263 
264  // Lines that are created by macro expansion.
265  // When formatting code containing macro calls, we first format the expanded
266  // lines to set the token types correctly. Afterwards, we format the
267  // reconstructed macro calls, re-using the token types determined in the first
268  // step.
269  // ExpandedLines will be reset every time we create a new LineAndExpansion
270  // instance once a line containing macro calls has been parsed.
271  SmallVector<UnwrappedLine, 8> CurrentExpandedLines;
272 
273  // Maps from the first token of a top-level UnwrappedLine that contains
274  // a macro call to the replacement UnwrappedLines expanded from the macro
275  // call.
276  llvm::DenseMap<FormatToken *, SmallVector<UnwrappedLine, 8>> ExpandedLines;
277 
278  // Map from the macro identifier to a line containing the full unexpanded
279  // macro call.
280  llvm::DenseMap<FormatToken *, std::unique_ptr<UnwrappedLine>> Unexpanded;
281 
282  // For recursive macro expansions, trigger reconstruction only on the
283  // outermost expansion.
284  bool InExpansion = false;
285 
286  // Set while we reconstruct a macro call.
287  // For reconstruction, we feed the expanded lines into the reconstructor
288  // until it is finished.
289  std::optional<MacroCallReconstructor> Reconstruct;
290 
291  // Comments are sorted into unwrapped lines by whether they are in the same
292  // line as the previous token, or not. If not, they belong to the next token.
293  // Since the next token might already be in a new unwrapped line, we need to
294  // store the comments belonging to that token.
295  SmallVector<FormatToken *, 1> CommentsBeforeNextToken;
296  FormatToken *FormatTok = nullptr;
297  bool MustBreakBeforeNextToken;
298 
299  // The parsed lines. Only added to through \c CurrentLines.
301 
302  // Preprocessor directives are parsed out-of-order from other unwrapped lines.
303  // Thus, we need to keep a list of preprocessor directives to be reported
304  // after an unwrapped line that has been started was finished.
305  SmallVector<UnwrappedLine, 4> PreprocessorDirectives;
306 
307  // New unwrapped lines are added via CurrentLines.
308  // Usually points to \c &Lines. While parsing a preprocessor directive when
309  // there is an unfinished previous unwrapped line, will point to
310  // \c &PreprocessorDirectives.
311  SmallVectorImpl<UnwrappedLine> *CurrentLines;
312 
313  // We store for each line whether it must be a declaration depending on
314  // whether we are in a compound statement or not.
315  llvm::BitVector DeclarationScopeStack;
316 
317  const FormatStyle &Style;
318  bool IsCpp;
319  LangOptions LangOpts;
320  const AdditionalKeywords &Keywords;
321 
322  llvm::Regex CommentPragmasRegex;
323 
324  FormatTokenSource *Tokens;
325  UnwrappedLineConsumer &Callback;
326 
327  ArrayRef<FormatToken *> AllTokens;
328 
329  // Keeps a stack of the states of nested control statements (true if the
330  // statement contains more than some predefined number of nested statements).
331  SmallVector<bool, 8> NestedTooDeep;
332 
333  // Keeps a stack of the states of nested lambdas (true if the return type of
334  // the lambda is `decltype(auto)`).
335  SmallVector<bool, 4> NestedLambdas;
336 
337  // Whether the parser is parsing the body of a function whose return type is
338  // `decltype(auto)`.
339  bool IsDecltypeAutoFunction = false;
340 
341  // Represents preprocessor branch type, so we can find matching
342  // #if/#else/#endif directives.
343  enum PPBranchKind {
344  PP_Conditional, // Any #if, #ifdef, #ifndef, #elif, block outside #if 0
345  PP_Unreachable // #if 0 or a conditional preprocessor block inside #if 0
346  };
347 
348  struct PPBranch {
349  PPBranch(PPBranchKind Kind, size_t Line) : Kind(Kind), Line(Line) {}
350  PPBranchKind Kind;
351  size_t Line;
352  };
353 
354  // Keeps a stack of currently active preprocessor branching directives.
356 
357  // The \c UnwrappedLineParser re-parses the code for each combination
358  // of preprocessor branches that can be taken.
359  // To that end, we take the same branch (#if, #else, or one of the #elif
360  // branches) for each nesting level of preprocessor branches.
361  // \c PPBranchLevel stores the current nesting level of preprocessor
362  // branches during one pass over the code.
363  int PPBranchLevel;
364 
365  // Contains the current branch (#if, #else or one of the #elif branches)
366  // for each nesting level.
367  SmallVector<int, 8> PPLevelBranchIndex;
368 
369  // Contains the maximum number of branches at each nesting level.
370  SmallVector<int, 8> PPLevelBranchCount;
371 
372  // Contains the number of branches per nesting level we are currently
373  // in while parsing a preprocessor branch sequence.
374  // This is used to update PPLevelBranchCount at the end of a branch
375  // sequence.
376  std::stack<int> PPChainBranchIndex;
377 
378  // Include guard search state. Used to fixup preprocessor indent levels
379  // so that include guards do not participate in indentation.
380  enum IncludeGuardState {
381  IG_Inited, // Search started, looking for #ifndef.
382  IG_IfNdefed, // #ifndef found, IncludeGuardToken points to condition.
383  IG_Defined, // Matching #define found, checking other requirements.
384  IG_Found, // All requirements met, need to fix indents.
385  IG_Rejected, // Search failed or never started.
386  };
387 
388  // Current state of include guard search.
389  IncludeGuardState IncludeGuard;
390 
391  // Points to the #ifndef condition for a potential include guard. Null unless
392  // IncludeGuard == IG_IfNdefed.
393  FormatToken *IncludeGuardToken;
394 
395  // Contains the first start column where the source begins. This is zero for
396  // normal source code and may be nonzero when formatting a code fragment that
397  // does not start at the beginning of the file.
398  unsigned FirstStartColumn;
399 
400  MacroExpander Macros;
401 
402  friend class ScopedLineState;
404 };
405 
407  UnwrappedLineNode() : Tok(nullptr) {}
410  : Tok(Tok), Children(Children.begin(), Children.end()) {}
411 
414 };
415 
416 std::ostream &operator<<(std::ostream &Stream, const UnwrappedLine &Line);
417 
418 } // end namespace format
419 } // end namespace clang
420 
421 #endif
This file contains the main building blocks of macro support in clang-format.
Implements an efficient mapping from strings to IdentifierInfo nodes.
Keeps track of the various options that can be enabled, which controls the dialect of C or C++ that i...
Definition: LangOptions.h:482
This class handles loading and caching of source files into memory.
The base class of the type hierarchy.
Definition: Type.h:1813
Takes a set of macro definitions as strings and allows expanding calls to those macros.
Definition: Macros.h:80
Interface for users of the UnwrappedLineParser to receive the parsed lines.
virtual void consumeUnwrappedLine(const UnwrappedLine &Line)=0
UnwrappedLineParser(SourceManager &SourceMgr, const FormatStyle &Style, const AdditionalKeywords &Keywords, unsigned FirstStartColumn, ArrayRef< FormatToken * > Tokens, UnwrappedLineConsumer &Callback, llvm::SpecificBumpPtrAllocator< FormatToken > &Allocator, IdentifierTable &IdentTable)
std::ostream & operator<<(std::ostream &Stream, const UnwrappedLine &Line)
TokenType
Determines the semantic type of a syntactic token, e.g.
Definition: FormatToken.h:205
The JSON file list parser is used to communicate input to InstallAPI.
Represents a complete lambda introducer.
Definition: DeclSpec.h:2832
Encapsulates keywords that are context sensitive or for languages not properly supported by Clang's l...
Definition: FormatToken.h:995
The FormatStyle is used to configure the formatting to follow specific guidelines.
Definition: Format.h:55
A wrapper around a Token storing information about the whitespace characters preceding it.
Definition: FormatToken.h:290
SmallVector< UnwrappedLine, 0 > Children
UnwrappedLineNode(FormatToken *Tok, llvm::ArrayRef< UnwrappedLine > Children={})
An unwrapped line is a sequence of Token that we would like to put on a single line if there was no ...
unsigned PPLevel
The PPBranchLevel (adjusted for header guards) if this line is a InMacroBody line,...
bool InMacroBody
Whether it is part of a macro body.
std::list< UnwrappedLineNode > Tokens
The Tokens comprising this UnwrappedLine.
bool IsContinuation
True if this line should be indented by ContinuationIndent in addition to the normal indentation level.
unsigned Level
The indent level of the UnwrappedLine.
bool InPragmaDirective
Whether this UnwrappedLine is part of a pragma directive.
bool InPPDirective
Whether this UnwrappedLine is part of a preprocessor directive.
bool SeenDecltypeAuto
Whether the parser has seen decltype(auto) in this line.
size_t MatchingClosingBlockLineIndex
If this UnwrappedLine opens a block, stores the index of the line with the corresponding closing brac...
static const size_t kInvalidIndex
size_t MatchingOpeningBlockLineIndex
If this UnwrappedLine closes a block in a sequence of lines, MatchingOpeningBlockLineIndex stores the...