clang  20.0.0git
CommentLexer.h
Go to the documentation of this file.
1 //===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines the lexer for structured comments and the supporting token class.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #ifndef LLVM_CLANG_AST_COMMENTLEXER_H
14 #define LLVM_CLANG_AST_COMMENTLEXER_H
15 
16 #include "clang/Basic/Diagnostic.h"
18 #include "llvm/ADT/SmallString.h"
19 #include "llvm/ADT/StringRef.h"
20 #include "llvm/Support/Allocator.h"
21 #include "llvm/Support/raw_ostream.h"
22 
23 namespace clang {
24 namespace comments {
25 
26 class Lexer;
27 class TextTokenRetokenizer;
28 struct CommandInfo;
29 class CommandTraits;
30 
31 namespace tok {
32 enum TokenKind {
33  eof,
36  unknown_command, // Command that does not have an ID.
37  backslash_command, // Command with an ID, that used backslash marker.
38  at_command, // Command with an ID, that used 'at' marker.
44  html_start_tag, // <tag
45  html_ident, // attr
47  html_quoted_string, // "blah\"blah" or 'blah\'blah'
50  html_end_tag // </tag
51 };
52 } // end namespace tok
53 
54 /// Comment token.
55 class Token {
56  friend class Lexer;
57  friend class TextTokenRetokenizer;
58 
59  /// The location of the token.
61 
62  /// The actual kind of the token.
64 
65  /// Integer value associated with a token.
66  ///
67  /// If the token is a known command, contains command ID and TextPtr is
68  /// unused (command spelling can be found with CommandTraits). Otherwise,
69  /// contains the length of the string that starts at TextPtr.
70  unsigned IntVal;
71 
72  /// Length of the token spelling in comment. Can be 0 for synthesized
73  /// tokens.
74  unsigned Length;
75 
76  /// Contains text value associated with a token.
77  const char *TextPtr;
78 
79 public:
80  SourceLocation getLocation() const LLVM_READONLY { return Loc; }
81  void setLocation(SourceLocation SL) { Loc = SL; }
82 
83  SourceLocation getEndLocation() const LLVM_READONLY {
84  if (Length == 0 || Length == 1)
85  return Loc;
86  return Loc.getLocWithOffset(Length - 1);
87  }
88 
89  tok::TokenKind getKind() const LLVM_READONLY { return Kind; }
90  void setKind(tok::TokenKind K) { Kind = K; }
91 
92  bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; }
93  bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; }
94 
95  unsigned getLength() const LLVM_READONLY { return Length; }
96  void setLength(unsigned L) { Length = L; }
97 
98  StringRef getText() const LLVM_READONLY {
99  assert(is(tok::text));
100  return StringRef(TextPtr, IntVal);
101  }
102 
103  void setText(StringRef Text) {
104  assert(is(tok::text));
105  TextPtr = Text.data();
106  IntVal = Text.size();
107  }
108 
109  StringRef getUnknownCommandName() const LLVM_READONLY {
110  assert(is(tok::unknown_command));
111  return StringRef(TextPtr, IntVal);
112  }
113 
114  void setUnknownCommandName(StringRef Name) {
115  assert(is(tok::unknown_command));
116  TextPtr = Name.data();
117  IntVal = Name.size();
118  }
119 
120  unsigned getCommandID() const LLVM_READONLY {
122  return IntVal;
123  }
124 
125  void setCommandID(unsigned ID) {
127  IntVal = ID;
128  }
129 
130  unsigned getVerbatimBlockID() const LLVM_READONLY {
132  return IntVal;
133  }
134 
135  void setVerbatimBlockID(unsigned ID) {
137  IntVal = ID;
138  }
139 
140  StringRef getVerbatimBlockText() const LLVM_READONLY {
141  assert(is(tok::verbatim_block_line));
142  return StringRef(TextPtr, IntVal);
143  }
144 
145  void setVerbatimBlockText(StringRef Text) {
146  assert(is(tok::verbatim_block_line));
147  TextPtr = Text.data();
148  IntVal = Text.size();
149  }
150 
151  unsigned getVerbatimLineID() const LLVM_READONLY {
152  assert(is(tok::verbatim_line_name));
153  return IntVal;
154  }
155 
156  void setVerbatimLineID(unsigned ID) {
157  assert(is(tok::verbatim_line_name));
158  IntVal = ID;
159  }
160 
161  StringRef getVerbatimLineText() const LLVM_READONLY {
162  assert(is(tok::verbatim_line_text));
163  return StringRef(TextPtr, IntVal);
164  }
165 
166  void setVerbatimLineText(StringRef Text) {
167  assert(is(tok::verbatim_line_text));
168  TextPtr = Text.data();
169  IntVal = Text.size();
170  }
171 
172  StringRef getHTMLTagStartName() const LLVM_READONLY {
173  assert(is(tok::html_start_tag));
174  return StringRef(TextPtr, IntVal);
175  }
176 
177  void setHTMLTagStartName(StringRef Name) {
178  assert(is(tok::html_start_tag));
179  TextPtr = Name.data();
180  IntVal = Name.size();
181  }
182 
183  StringRef getHTMLIdent() const LLVM_READONLY {
184  assert(is(tok::html_ident));
185  return StringRef(TextPtr, IntVal);
186  }
187 
188  void setHTMLIdent(StringRef Name) {
189  assert(is(tok::html_ident));
190  TextPtr = Name.data();
191  IntVal = Name.size();
192  }
193 
194  StringRef getHTMLQuotedString() const LLVM_READONLY {
195  assert(is(tok::html_quoted_string));
196  return StringRef(TextPtr, IntVal);
197  }
198 
199  void setHTMLQuotedString(StringRef Str) {
200  assert(is(tok::html_quoted_string));
201  TextPtr = Str.data();
202  IntVal = Str.size();
203  }
204 
205  StringRef getHTMLTagEndName() const LLVM_READONLY {
206  assert(is(tok::html_end_tag));
207  return StringRef(TextPtr, IntVal);
208  }
209 
210  void setHTMLTagEndName(StringRef Name) {
211  assert(is(tok::html_end_tag));
212  TextPtr = Name.data();
213  IntVal = Name.size();
214  }
215 
216  void dump(const Lexer &L, const SourceManager &SM) const;
217 };
218 
219 /// Comment lexer.
220 class Lexer {
221 private:
222  Lexer(const Lexer &) = delete;
223  void operator=(const Lexer &) = delete;
224 
225  /// Allocator for strings that are semantic values of tokens and have to be
226  /// computed (for example, resolved decimal character references).
227  llvm::BumpPtrAllocator &Allocator;
228 
229  DiagnosticsEngine &Diags;
230 
231  const CommandTraits &Traits;
232 
233  const char *const BufferStart;
234  const char *const BufferEnd;
235 
236  const char *BufferPtr;
237 
238  /// One past end pointer for the current comment. For BCPL comments points
239  /// to newline or BufferEnd, for C comments points to star in '*/'.
240  const char *CommentEnd;
241 
242  SourceLocation FileLoc;
243 
244  /// If true, commands, HTML tags, etc. will be parsed and reported as
245  /// separate tokens inside the comment body. If false, the comment text will
246  /// be parsed into text and newline tokens.
247  bool ParseCommands;
248 
249  enum LexerCommentState : uint8_t {
250  LCS_BeforeComment,
251  LCS_InsideBCPLComment,
252  LCS_InsideCComment,
253  LCS_BetweenComments
254  };
255 
256  /// Low-level lexer state; tracks whether we are inside or outside of a comment.
257  LexerCommentState CommentState;
258 
259  enum LexerState : uint8_t {
260  /// Lexing normal comment text
261  LS_Normal,
262 
263  /// Finished lexing verbatim block beginning command, will lex first body
264  /// line.
265  LS_VerbatimBlockFirstLine,
266 
267  /// Lexing verbatim block body line-by-line, skipping line-starting
268  /// decorations.
269  LS_VerbatimBlockBody,
270 
271  /// Finished lexing verbatim line beginning command, will lex text (one
272  /// line).
273  LS_VerbatimLineText,
274 
275  /// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes.
276  LS_HTMLStartTag,
277 
278  /// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'.
279  LS_HTMLEndTag
280  };
281 
282  /// Current lexing mode.
283  LexerState State;
284 
285  /// If State is LS_VerbatimBlockFirstLine or LS_VerbatimBlockBody, contains
286  /// the name of the verbatim end command, including the command marker.
287  SmallString<16> VerbatimBlockEndCommandName;
288 
289  /// Given a character reference name (e.g., "lt"), return the character that
290  /// it stands for (e.g., "<").
291  StringRef resolveHTMLNamedCharacterReference(StringRef Name) const;
292 
293  /// Given a Unicode codepoint as base-10 integer, return the character.
294  StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const;
295 
296  /// Given a Unicode codepoint as base-16 integer, return the character.
297  StringRef resolveHTMLHexCharacterReference(StringRef Name) const;
298 
299  void formTokenWithChars(Token &Result, const char *TokEnd,
301 
302  void formTextToken(Token &Result, const char *TokEnd) {
303  StringRef Text(BufferPtr, TokEnd - BufferPtr);
304  formTokenWithChars(Result, TokEnd, tok::text);
305  Result.setText(Text);
306  }
307 
308  SourceLocation getSourceLocation(const char *Loc) const {
309  assert(Loc >= BufferStart && Loc <= BufferEnd &&
310  "Location out of range for this buffer!");
311 
312  const unsigned CharNo = Loc - BufferStart;
313  return FileLoc.getLocWithOffset(CharNo);
314  }
315 
316  DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID) {
317  return Diags.Report(Loc, DiagID);
318  }
319 
320  /// Eat string matching regexp \code \s*\* \endcode.
321  void skipLineStartingDecorations();
322 
323  /// Skip over pure text.
324  const char *skipTextToken();
325 
326  /// Lex comment text, including commands if ParseCommands is set to true.
327  void lexCommentText(Token &T);
328 
329  void setupAndLexVerbatimBlock(Token &T, const char *TextBegin, char Marker,
330  const CommandInfo *Info);
331 
332  void lexVerbatimBlockFirstLine(Token &T);
333 
334  void lexVerbatimBlockBody(Token &T);
335 
336  void setupAndLexVerbatimLine(Token &T, const char *TextBegin,
337  const CommandInfo *Info);
338 
339  void lexVerbatimLineText(Token &T);
340 
341  void lexHTMLCharacterReference(Token &T);
342 
343  void setupAndLexHTMLStartTag(Token &T);
344 
345  void lexHTMLStartTag(Token &T);
346 
347  void setupAndLexHTMLEndTag(Token &T);
348 
349  void lexHTMLEndTag(Token &T);
350 
351 public:
352  Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
353  const CommandTraits &Traits, SourceLocation FileLoc,
354  const char *BufferStart, const char *BufferEnd,
355  bool ParseCommands = true);
356 
357  void lex(Token &T);
358 
359  StringRef getSpelling(const Token &Tok, const SourceManager &SourceMgr) const;
360 };
361 
362 } // end namespace comments
363 } // end namespace clang
364 
365 #endif
366 
static char ID
Definition: Arena.cpp:183
#define SM(sm)
Definition: Cuda.cpp:83
Defines the Diagnostic-related interfaces.
enum clang::sema::@1659::IndirectLocalPathEntry::EntryKind Kind
StringRef Text
Definition: Format.cpp:3002
SourceLocation Loc
Definition: SemaObjC.cpp:759
Defines the SourceManager interface.
A little helper class used to produce diagnostics.
Definition: Diagnostic.h:1277
Concrete class used by the front-end to report problems and issues.
Definition: Diagnostic.h:193
DiagnosticBuilder Report(SourceLocation Loc, unsigned DiagID)
Issue the message to the client.
Definition: Diagnostic.h:1553
Encodes a location in the source.
SourceLocation getLocWithOffset(IntTy Offset) const
Return a source location with the specified offset from this SourceLocation.
This class handles loading and caching of source files into memory.
This class provides information about commands that can be used in comments.
StringRef getSpelling(const Token &Tok, const SourceManager &SourceMgr) const
Re-lexes a sequence of tok::text tokens.
Comment token.
Definition: CommentLexer.h:55
StringRef getHTMLQuotedString() const LLVM_READONLY
Definition: CommentLexer.h:194
bool isNot(tok::TokenKind K) const LLVM_READONLY
Definition: CommentLexer.h:93
SourceLocation getEndLocation() const LLVM_READONLY
Definition: CommentLexer.h:83
void setLocation(SourceLocation SL)
Definition: CommentLexer.h:81
void setVerbatimBlockText(StringRef Text)
Definition: CommentLexer.h:145
void setHTMLTagEndName(StringRef Name)
Definition: CommentLexer.h:210
unsigned getCommandID() const LLVM_READONLY
Definition: CommentLexer.h:120
void setVerbatimLineID(unsigned ID)
Definition: CommentLexer.h:156
void setHTMLTagStartName(StringRef Name)
Definition: CommentLexer.h:177
StringRef getUnknownCommandName() const LLVM_READONLY
Definition: CommentLexer.h:109
StringRef getText() const LLVM_READONLY
Definition: CommentLexer.h:98
void dump(const Lexer &L, const SourceManager &SM) const
StringRef getHTMLIdent() const LLVM_READONLY
Definition: CommentLexer.h:183
void setHTMLIdent(StringRef Name)
Definition: CommentLexer.h:188
StringRef getVerbatimBlockText() const LLVM_READONLY
Definition: CommentLexer.h:140
unsigned getVerbatimLineID() const LLVM_READONLY
Definition: CommentLexer.h:151
void setVerbatimLineText(StringRef Text)
Definition: CommentLexer.h:166
unsigned getVerbatimBlockID() const LLVM_READONLY
Definition: CommentLexer.h:130
void setLength(unsigned L)
Definition: CommentLexer.h:96
void setText(StringRef Text)
Definition: CommentLexer.h:103
bool is(tok::TokenKind K) const LLVM_READONLY
Definition: CommentLexer.h:92
void setUnknownCommandName(StringRef Name)
Definition: CommentLexer.h:114
void setCommandID(unsigned ID)
Definition: CommentLexer.h:125
void setHTMLQuotedString(StringRef Str)
Definition: CommentLexer.h:199
unsigned getLength() const LLVM_READONLY
Definition: CommentLexer.h:95
SourceLocation getLocation() const LLVM_READONLY
Definition: CommentLexer.h:80
StringRef getVerbatimLineText() const LLVM_READONLY
Definition: CommentLexer.h:161
tok::TokenKind getKind() const LLVM_READONLY
Definition: CommentLexer.h:89
void setVerbatimBlockID(unsigned ID)
Definition: CommentLexer.h:135
void setKind(tok::TokenKind K)
Definition: CommentLexer.h:90
StringRef getHTMLTagStartName() const LLVM_READONLY
Definition: CommentLexer.h:172
StringRef getHTMLTagEndName() const LLVM_READONLY
Definition: CommentLexer.h:205
The JSON file list parser is used to communicate input to InstallAPI.
const FunctionProtoType * T
Information about a single command.