clang  19.0.0git
Lexer.cpp
Go to the documentation of this file.
1 //===- Lexer.cpp - C Language Family Lexer --------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the Lexer and Token interfaces.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "clang/Lex/Lexer.h"
14 #include "UnicodeCharSets.h"
15 #include "clang/Basic/CharInfo.h"
16 #include "clang/Basic/Diagnostic.h"
18 #include "clang/Basic/LLVM.h"
22 #include "clang/Basic/TokenKinds.h"
26 #include "clang/Lex/Preprocessor.h"
28 #include "clang/Lex/Token.h"
29 #include "llvm/ADT/STLExtras.h"
30 #include "llvm/ADT/StringExtras.h"
31 #include "llvm/ADT/StringRef.h"
32 #include "llvm/ADT/StringSwitch.h"
33 #include "llvm/Support/Compiler.h"
34 #include "llvm/Support/ConvertUTF.h"
35 #include "llvm/Support/MathExtras.h"
36 #include "llvm/Support/MemoryBufferRef.h"
37 #include "llvm/Support/NativeFormatting.h"
38 #include "llvm/Support/Unicode.h"
39 #include "llvm/Support/UnicodeCharRanges.h"
40 #include <algorithm>
41 #include <cassert>
42 #include <cstddef>
43 #include <cstdint>
44 #include <cstring>
45 #include <optional>
46 #include <string>
47 #include <tuple>
48 #include <utility>
49 
50 #ifdef __SSE4_2__
51 #include <nmmintrin.h>
52 #endif
53 
54 using namespace clang;
55 
56 //===----------------------------------------------------------------------===//
57 // Token Class Implementation
58 //===----------------------------------------------------------------------===//
59 
60 /// isObjCAtKeyword - Return true if we have an ObjC keyword identifier.
62  if (isAnnotation())
63  return false;
64  if (const IdentifierInfo *II = getIdentifierInfo())
65  return II->getObjCKeywordID() == objcKey;
66  return false;
67 }
68 
69 /// getObjCKeywordID - Return the ObjC keyword kind.
71  if (isAnnotation())
72  return tok::objc_not_keyword;
73  const IdentifierInfo *specId = getIdentifierInfo();
74  return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword;
75 }
76 
77 /// Determine whether the token kind starts a simple-type-specifier.
78 bool Token::isSimpleTypeSpecifier(const LangOptions &LangOpts) const {
79  switch (getKind()) {
80  case tok::annot_typename:
81  case tok::annot_decltype:
82  case tok::annot_pack_indexing_type:
83  return true;
84 
85  case tok::kw_short:
86  case tok::kw_long:
87  case tok::kw___int64:
88  case tok::kw___int128:
89  case tok::kw_signed:
90  case tok::kw_unsigned:
91  case tok::kw_void:
92  case tok::kw_char:
93  case tok::kw_int:
94  case tok::kw_half:
95  case tok::kw_float:
96  case tok::kw_double:
97  case tok::kw___bf16:
98  case tok::kw__Float16:
99  case tok::kw___float128:
100  case tok::kw___ibm128:
101  case tok::kw_wchar_t:
102  case tok::kw_bool:
103  case tok::kw__Bool:
104  case tok::kw__Accum:
105  case tok::kw__Fract:
106  case tok::kw__Sat:
107 #define TRANSFORM_TYPE_TRAIT_DEF(_, Trait) case tok::kw___##Trait:
108 #include "clang/Basic/TransformTypeTraits.def"
109  case tok::kw___auto_type:
110  case tok::kw_char16_t:
111  case tok::kw_char32_t:
112  case tok::kw_typeof:
113  case tok::kw_decltype:
114  case tok::kw_char8_t:
115  return getIdentifierInfo()->isKeyword(LangOpts);
116 
117  default:
118  return false;
119  }
120 }
121 
122 //===----------------------------------------------------------------------===//
123 // Lexer Class Implementation
124 //===----------------------------------------------------------------------===//
125 
126 void Lexer::anchor() {}
127 
128 void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
129  const char *BufEnd) {
130  BufferStart = BufStart;
131  BufferPtr = BufPtr;
132  BufferEnd = BufEnd;
133 
134  assert(BufEnd[0] == 0 &&
135  "We assume that the input buffer has a null character at the end"
136  " to simplify lexing!");
137 
138  // Check whether we have a BOM in the beginning of the buffer. If yes - act
139  // accordingly. Right now we support only UTF-8 with and without BOM, so, just
140  // skip the UTF-8 BOM if it's present.
141  if (BufferStart == BufferPtr) {
142  // Determine the size of the BOM.
143  StringRef Buf(BufferStart, BufferEnd - BufferStart);
144  size_t BOMLength = llvm::StringSwitch<size_t>(Buf)
145  .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM
146  .Default(0);
147 
148  // Skip the BOM.
149  BufferPtr += BOMLength;
150  }
151 
152  Is_PragmaLexer = false;
153  CurrentConflictMarkerState = CMK_None;
154 
155  // Start of the file is a start of line.
156  IsAtStartOfLine = true;
157  IsAtPhysicalStartOfLine = true;
158 
159  HasLeadingSpace = false;
160  HasLeadingEmptyMacro = false;
161 
162  // We are not after parsing a #.
164 
165  // We are not after parsing #include.
166  ParsingFilename = false;
167 
168  // We are not in raw mode. Raw mode disables diagnostics and interpretation
169  // of tokens (e.g. identifiers, thus disabling macro expansion). It is used
170  // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block
171  // or otherwise skipping over tokens.
172  LexingRawMode = false;
173 
174  // Default to not keeping comments.
175  ExtendedTokenMode = 0;
176 
177  NewLinePtr = nullptr;
178 }
179 
180 /// Lexer constructor - Create a new lexer object for the specified buffer
181 /// with the specified preprocessor managing the lexing process. This lexer
182 /// assumes that the associated file buffer and Preprocessor objects will
183 /// outlive it, so it doesn't take ownership of either of them.
184 Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile,
185  Preprocessor &PP, bool IsFirstIncludeOfFile)
186  : PreprocessorLexer(&PP, FID),
187  FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
188  LangOpts(PP.getLangOpts()), LineComment(LangOpts.LineComment),
189  IsFirstTimeLexingFile(IsFirstIncludeOfFile) {
190  InitLexer(InputFile.getBufferStart(), InputFile.getBufferStart(),
191  InputFile.getBufferEnd());
192 
194 }
195 
196 /// Lexer constructor - Create a new raw lexer object. This object is only
197 /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text
198 /// range will outlive it, so it doesn't take ownership of it.
199 Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts,
200  const char *BufStart, const char *BufPtr, const char *BufEnd,
201  bool IsFirstIncludeOfFile)
202  : FileLoc(fileloc), LangOpts(langOpts), LineComment(LangOpts.LineComment),
203  IsFirstTimeLexingFile(IsFirstIncludeOfFile) {
204  InitLexer(BufStart, BufPtr, BufEnd);
205 
206  // We *are* in raw mode.
207  LexingRawMode = true;
208 }
209 
210 /// Lexer constructor - Create a new raw lexer object. This object is only
211 /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text
212 /// range will outlive it, so it doesn't take ownership of it.
213 Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &FromFile,
214  const SourceManager &SM, const LangOptions &langOpts,
215  bool IsFirstIncludeOfFile)
216  : Lexer(SM.getLocForStartOfFile(FID), langOpts, FromFile.getBufferStart(),
217  FromFile.getBufferStart(), FromFile.getBufferEnd(),
218  IsFirstIncludeOfFile) {}
219 
221  assert(PP && "Cannot reset token mode without a preprocessor");
222  if (LangOpts.TraditionalCPP)
223  SetKeepWhitespaceMode(true);
224  else
226 }
227 
228 /// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
229 /// _Pragma expansion. This has a variety of magic semantics that this method
230 /// sets up. It returns a new'd Lexer that must be delete'd when done.
231 ///
232 /// On entrance to this routine, TokStartLoc is a macro location which has a
233 /// spelling loc that indicates the bytes to be lexed for the token and an
234 /// expansion location that indicates where all lexed tokens should be
235 /// "expanded from".
236 ///
237 /// TODO: It would really be nice to make _Pragma just be a wrapper around a
238 /// normal lexer that remaps tokens as they fly by. This would require making
239 /// Preprocessor::Lex virtual. Given that, we could just dump in a magic lexer
240 /// interface that could handle this stuff. This would pull GetMappedTokenLoc
241 /// out of the critical path of the lexer!
242 ///
244  SourceLocation ExpansionLocStart,
245  SourceLocation ExpansionLocEnd,
246  unsigned TokLen, Preprocessor &PP) {
248 
249  // Create the lexer as if we were going to lex the file normally.
250  FileID SpellingFID = SM.getFileID(SpellingLoc);
251  llvm::MemoryBufferRef InputFile = SM.getBufferOrFake(SpellingFID);
252  Lexer *L = new Lexer(SpellingFID, InputFile, PP);
253 
254  // Now that the lexer is created, change the start/end locations so that we
255  // just lex the subsection of the file that we want. This is lexing from a
256  // scratch buffer.
257  const char *StrData = SM.getCharacterData(SpellingLoc);
258 
259  L->BufferPtr = StrData;
260  L->BufferEnd = StrData+TokLen;
261  assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!");
262 
263  // Set the SourceLocation with the remapping information. This ensures that
264  // GetMappedTokenLoc will remap the tokens as they are lexed.
265  L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID),
266  ExpansionLocStart,
267  ExpansionLocEnd, TokLen);
268 
269  // Ensure that the lexer thinks it is inside a directive, so that end \n will
270  // return an EOD token.
272 
273  // This lexer really is for _Pragma.
274  L->Is_PragmaLexer = true;
275  return L;
276 }
277 
278 void Lexer::seek(unsigned Offset, bool IsAtStartOfLine) {
279  this->IsAtPhysicalStartOfLine = IsAtStartOfLine;
280  this->IsAtStartOfLine = IsAtStartOfLine;
281  assert((BufferStart + Offset) <= BufferEnd);
282  BufferPtr = BufferStart + Offset;
283 }
284 
/// Escape \p Str in place so that it can sit between two \p Quote characters.
///
/// Every backslash and every occurrence of \p Quote gets a backslash in
/// front of it. Every line ending -- a lone '\n', a lone '\r', or one of the
/// two-character pairs "\r\n" / "\n\r" -- becomes the two characters '\\'
/// and 'n'.
///
/// \tparam T    character container providing size(), operator[], begin()
///              and insert(iterator, char).
/// \param Str   container to rewrite.
/// \param Quote quote character that needs escaping.
template <typename T> static void StringifyImpl(T &Str, char Quote) {
  using SizeTy = typename T::size_type;

  SizeTy Pos = 0;
  SizeTy End = Str.size(); // Kept in step with every insertion below.

  while (Pos < End) {
    const char Cur = Str[Pos];

    if (Cur == '\\' || Cur == Quote) {
      // Put a backslash in front and hop over both characters.
      Str.insert(Str.begin() + Pos, '\\');
      ++End;
      Pos += 2;
      continue;
    }

    if (Cur != '\n' && Cur != '\r') {
      ++Pos;
      continue;
    }

    // A line ending. Two *different* newline characters in a row count as a
    // single ending; two identical ones are two separate endings.
    const bool IsTwoCharEnding =
        Pos + 1 < End && (Str[Pos + 1] == '\n' || Str[Pos + 1] == '\r') &&
        Str[Pos + 1] != Cur;

    Str[Pos] = '\\';
    if (IsTwoCharEnding) {
      // Reuse the second character's slot; the length does not change.
      Str[Pos + 1] = 'n';
    } else {
      // One character turns into two, so make room for the 'n'.
      Str.insert(Str.begin() + Pos + 1, 'n');
      ++End;
    }
    Pos += 2;
  }
}
309 
310 std::string Lexer::Stringify(StringRef Str, bool Charify) {
311  std::string Result = std::string(Str);
312  char Quote = Charify ? '\'' : '"';
313  StringifyImpl(Result, Quote);
314  return Result;
315 }
316 
318 
319 //===----------------------------------------------------------------------===//
320 // Token Spelling
321 //===----------------------------------------------------------------------===//
322 
323 /// Slow case of getSpelling. Extract the characters comprising the
324 /// spelling of this token from the provided input buffer.
325 static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
326  const LangOptions &LangOpts, char *Spelling) {
327  assert(Tok.needsCleaning() && "getSpellingSlow called on simple token");
328 
329  size_t Length = 0;
330  const char *BufEnd = BufPtr + Tok.getLength();
331 
332  if (tok::isStringLiteral(Tok.getKind())) {
333  // Munch the encoding-prefix and opening double-quote.
334  while (BufPtr < BufEnd) {
335  auto CharAndSize = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts);
336  Spelling[Length++] = CharAndSize.Char;
337  BufPtr += CharAndSize.Size;
338 
339  if (Spelling[Length - 1] == '"')
340  break;
341  }
342 
343  // Raw string literals need special handling; trigraph expansion and line
344  // splicing do not occur within their d-char-sequence nor within their
345  // r-char-sequence.
346  if (Length >= 2 &&
347  Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') {
348  // Search backwards from the end of the token to find the matching closing
349  // quote.
350  const char *RawEnd = BufEnd;
351  do --RawEnd; while (*RawEnd != '"');
352  size_t RawLength = RawEnd - BufPtr + 1;
353 
354  // Everything between the quotes is included verbatim in the spelling.
355  memcpy(Spelling + Length, BufPtr, RawLength);
356  Length += RawLength;
357  BufPtr += RawLength;
358 
359  // The rest of the token is lexed normally.
360  }
361  }
362 
363  while (BufPtr < BufEnd) {
364  auto CharAndSize = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts);
365  Spelling[Length++] = CharAndSize.Char;
366  BufPtr += CharAndSize.Size;
367  }
368 
369  assert(Length < Tok.getLength() &&
370  "NeedsCleaning flag set on token that didn't need cleaning!");
371  return Length;
372 }
373 
374 /// getSpelling() - Return the 'spelling' of this token. The spelling of a
375 /// token are the characters used to represent the token in the source file
376 /// after trigraph expansion and escaped-newline folding. In particular, this
377 /// wants to get the true, uncanonicalized, spelling of things like digraphs,
378 /// UCNs, etc.
380  SmallVectorImpl<char> &buffer,
381  const SourceManager &SM,
382  const LangOptions &options,
383  bool *invalid) {
384  // Break down the source location.
385  std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc);
386 
387  // Try to load the file buffer.
388  bool invalidTemp = false;
389  StringRef file = SM.getBufferData(locInfo.first, &invalidTemp);
390  if (invalidTemp) {
391  if (invalid) *invalid = true;
392  return {};
393  }
394 
395  const char *tokenBegin = file.data() + locInfo.second;
396 
397  // Lex from the start of the given location.
398  Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options,
399  file.begin(), tokenBegin, file.end());
400  Token token;
401  lexer.LexFromRawLexer(token);
402 
403  unsigned length = token.getLength();
404 
405  // Common case: no need for cleaning.
406  if (!token.needsCleaning())
407  return StringRef(tokenBegin, length);
408 
409  // Hard case, we need to relex the characters into the string.
410  buffer.resize(length);
411  buffer.resize(getSpellingSlow(token, tokenBegin, options, buffer.data()));
412  return StringRef(buffer.data(), buffer.size());
413 }
414 
415 /// getSpelling() - Return the 'spelling' of this token. The spelling of a
416 /// token are the characters used to represent the token in the source file
417 /// after trigraph expansion and escaped-newline folding. In particular, this
418 /// wants to get the true, uncanonicalized, spelling of things like digraphs
419 /// UCNs, etc.
420 std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr,
421  const LangOptions &LangOpts, bool *Invalid) {
422  assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
423 
424  bool CharDataInvalid = false;
425  const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(),
426  &CharDataInvalid);
427  if (Invalid)
428  *Invalid = CharDataInvalid;
429  if (CharDataInvalid)
430  return {};
431 
432  // If this token contains nothing interesting, return it directly.
433  if (!Tok.needsCleaning())
434  return std::string(TokStart, TokStart + Tok.getLength());
435 
436  std::string Result;
437  Result.resize(Tok.getLength());
438  Result.resize(getSpellingSlow(Tok, TokStart, LangOpts, &*Result.begin()));
439  return Result;
440 }
441 
442 /// getSpelling - This method is used to get the spelling of a token into a
443 /// preallocated buffer, instead of as an std::string. The caller is required
444 /// to allocate enough space for the token, which is guaranteed to be at least
445 /// Tok.getLength() bytes long. The actual length of the token is returned.
446 ///
447 /// Note that this method may do two possible things: it may either fill in
448 /// the buffer specified with characters, or it may *change the input pointer*
449 /// to point to a constant buffer with the data already in it (avoiding a
450 /// copy). The caller is not allowed to modify the returned buffer pointer
451 /// if an internal buffer is returned.
452 unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
453  const SourceManager &SourceMgr,
454  const LangOptions &LangOpts, bool *Invalid) {
455  assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
456 
457  const char *TokStart = nullptr;
458  // NOTE: this has to be checked *before* testing for an IdentifierInfo.
459  if (Tok.is(tok::raw_identifier))
460  TokStart = Tok.getRawIdentifier().data();
461  else if (!Tok.hasUCN()) {
462  if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
463  // Just return the string from the identifier table, which is very quick.
464  Buffer = II->getNameStart();
465  return II->getLength();
466  }
467  }
468 
469  // NOTE: this can be checked even after testing for an IdentifierInfo.
470  if (Tok.isLiteral())
471  TokStart = Tok.getLiteralData();
472 
473  if (!TokStart) {
474  // Compute the start of the token in the input lexer buffer.
475  bool CharDataInvalid = false;
476  TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid);
477  if (Invalid)
478  *Invalid = CharDataInvalid;
479  if (CharDataInvalid) {
480  Buffer = "";
481  return 0;
482  }
483  }
484 
485  // If this token contains nothing interesting, return it directly.
486  if (!Tok.needsCleaning()) {
487  Buffer = TokStart;
488  return Tok.getLength();
489  }
490 
491  // Otherwise, hard case, relex the characters into the string.
492  return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer));
493 }
494 
495 /// MeasureTokenLength - Relex the token at the specified location and return
496 /// its length in bytes in the input file. If the token needs cleaning (e.g.
497 /// includes a trigraph or an escaped newline) then this count includes bytes
498 /// that are part of that.
500  const SourceManager &SM,
501  const LangOptions &LangOpts) {
502  Token TheTok;
503  if (getRawToken(Loc, TheTok, SM, LangOpts))
504  return 0;
505  return TheTok.getLength();
506 }
507 
508 /// Relex the token at the specified location.
509 /// \returns true if there was a failure, false on success.
511  const SourceManager &SM,
512  const LangOptions &LangOpts,
513  bool IgnoreWhiteSpace) {
514  // TODO: this could be special cased for common tokens like identifiers, ')',
515  // etc to make this faster, if it mattered. Just look at StrData[0] to handle
516  // all obviously single-char tokens. This could use
517  // Lexer::isObviouslySimpleCharacter for example to handle identifiers or
518  // something.
519 
520  // If this comes from a macro expansion, we really do want the macro name, not
521  // the token this macro expanded to.
522  Loc = SM.getExpansionLoc(Loc);
523  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
524  bool Invalid = false;
525  StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
526  if (Invalid)
527  return true;
528 
529  const char *StrData = Buffer.data()+LocInfo.second;
530 
531  if (!IgnoreWhiteSpace && isWhitespace(StrData[0]))
532  return true;
533 
534  // Create a lexer starting at the beginning of this token.
535  Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts,
536  Buffer.begin(), StrData, Buffer.end());
537  TheLexer.SetCommentRetentionState(true);
538  TheLexer.LexFromRawLexer(Result);
539  return false;
540 }
541 
542 /// Returns the pointer that points to the beginning of line that contains
543 /// the given offset, or null if the offset if invalid.
544 static const char *findBeginningOfLine(StringRef Buffer, unsigned Offset) {
545  const char *BufStart = Buffer.data();
546  if (Offset >= Buffer.size())
547  return nullptr;
548 
549  const char *LexStart = BufStart + Offset;
550  for (; LexStart != BufStart; --LexStart) {
551  if (isVerticalWhitespace(LexStart[0]) &&
552  !Lexer::isNewLineEscaped(BufStart, LexStart)) {
553  // LexStart should point at first character of logical line.
554  ++LexStart;
555  break;
556  }
557  }
558  return LexStart;
559 }
560 
562  const SourceManager &SM,
563  const LangOptions &LangOpts) {
564  assert(Loc.isFileID());
565  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
566  if (LocInfo.first.isInvalid())
567  return Loc;
568 
569  bool Invalid = false;
570  StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
571  if (Invalid)
572  return Loc;
573 
574  // Back up from the current location until we hit the beginning of a line
575  // (or the buffer). We'll relex from that point.
576  const char *StrData = Buffer.data() + LocInfo.second;
577  const char *LexStart = findBeginningOfLine(Buffer, LocInfo.second);
578  if (!LexStart || LexStart == StrData)
579  return Loc;
580 
581  // Create a lexer starting at the beginning of this token.
582  SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second);
583  Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart,
584  Buffer.end());
585  TheLexer.SetCommentRetentionState(true);
586 
587  // Lex tokens until we find the token that contains the source location.
588  Token TheTok;
589  do {
590  TheLexer.LexFromRawLexer(TheTok);
591 
592  if (TheLexer.getBufferLocation() > StrData) {
593  // Lexing this token has taken the lexer past the source location we're
594  // looking for. If the current token encompasses our source location,
595  // return the beginning of that token.
596  if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData)
597  return TheTok.getLocation();
598 
599  // We ended up skipping over the source location entirely, which means
600  // that it points into whitespace. We're done here.
601  break;
602  }
603  } while (TheTok.getKind() != tok::eof);
604 
605  // We've passed our source location; just return the original source location.
606  return Loc;
607 }
608 
610  const SourceManager &SM,
611  const LangOptions &LangOpts) {
612  if (Loc.isFileID())
613  return getBeginningOfFileToken(Loc, SM, LangOpts);
614 
615  if (!SM.isMacroArgExpansion(Loc))
616  return Loc;
617 
618  SourceLocation FileLoc = SM.getSpellingLoc(Loc);
619  SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts);
620  std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc);
621  std::pair<FileID, unsigned> BeginFileLocInfo =
622  SM.getDecomposedLoc(BeginFileLoc);
623  assert(FileLocInfo.first == BeginFileLocInfo.first &&
624  FileLocInfo.second >= BeginFileLocInfo.second);
625  return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);
626 }
627 
namespace {

/// How the preamble scan classifies the directive name that follows a '#'.
enum PreambleDirectiveKind {
  /// A recognized directive; the scan steps over it and carries on.
  PDK_Skipped = 0,
  /// Anything else; the scan stops at the '#'.
  PDK_Unknown = 1
};

} // namespace
636 
638  const LangOptions &LangOpts,
639  unsigned MaxLines) {
640  // Create a lexer starting at the beginning of the file. Note that we use a
641  // "fake" file source location at offset 1 so that the lexer will track our
642  // position within the file.
643  const SourceLocation::UIntTy StartOffset = 1;
644  SourceLocation FileLoc = SourceLocation::getFromRawEncoding(StartOffset);
645  Lexer TheLexer(FileLoc, LangOpts, Buffer.begin(), Buffer.begin(),
646  Buffer.end());
647  TheLexer.SetCommentRetentionState(true);
648 
649  bool InPreprocessorDirective = false;
650  Token TheTok;
651  SourceLocation ActiveCommentLoc;
652 
653  unsigned MaxLineOffset = 0;
654  if (MaxLines) {
655  const char *CurPtr = Buffer.begin();
656  unsigned CurLine = 0;
657  while (CurPtr != Buffer.end()) {
658  char ch = *CurPtr++;
659  if (ch == '\n') {
660  ++CurLine;
661  if (CurLine == MaxLines)
662  break;
663  }
664  }
665  if (CurPtr != Buffer.end())
666  MaxLineOffset = CurPtr - Buffer.begin();
667  }
668 
669  do {
670  TheLexer.LexFromRawLexer(TheTok);
671 
672  if (InPreprocessorDirective) {
673  // If we've hit the end of the file, we're done.
674  if (TheTok.getKind() == tok::eof) {
675  break;
676  }
677 
678  // If we haven't hit the end of the preprocessor directive, skip this
679  // token.
680  if (!TheTok.isAtStartOfLine())
681  continue;
682 
683  // We've passed the end of the preprocessor directive, and will look
684  // at this token again below.
685  InPreprocessorDirective = false;
686  }
687 
688  // Keep track of the # of lines in the preamble.
689  if (TheTok.isAtStartOfLine()) {
690  unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset;
691 
692  // If we were asked to limit the number of lines in the preamble,
693  // and we're about to exceed that limit, we're done.
694  if (MaxLineOffset && TokOffset >= MaxLineOffset)
695  break;
696  }
697 
698  // Comments are okay; skip over them.
699  if (TheTok.getKind() == tok::comment) {
700  if (ActiveCommentLoc.isInvalid())
701  ActiveCommentLoc = TheTok.getLocation();
702  continue;
703  }
704 
705  if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) {
706  // This is the start of a preprocessor directive.
707  Token HashTok = TheTok;
708  InPreprocessorDirective = true;
709  ActiveCommentLoc = SourceLocation();
710 
711  // Figure out which directive this is. Since we're lexing raw tokens,
712  // we don't have an identifier table available. Instead, just look at
713  // the raw identifier to recognize and categorize preprocessor directives.
714  TheLexer.LexFromRawLexer(TheTok);
715  if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) {
716  StringRef Keyword = TheTok.getRawIdentifier();
717  PreambleDirectiveKind PDK
718  = llvm::StringSwitch<PreambleDirectiveKind>(Keyword)
719  .Case("include", PDK_Skipped)
720  .Case("__include_macros", PDK_Skipped)
721  .Case("define", PDK_Skipped)
722  .Case("undef", PDK_Skipped)
723  .Case("line", PDK_Skipped)
724  .Case("error", PDK_Skipped)
725  .Case("pragma", PDK_Skipped)
726  .Case("import", PDK_Skipped)
727  .Case("include_next", PDK_Skipped)
728  .Case("warning", PDK_Skipped)
729  .Case("ident", PDK_Skipped)
730  .Case("sccs", PDK_Skipped)
731  .Case("assert", PDK_Skipped)
732  .Case("unassert", PDK_Skipped)
733  .Case("if", PDK_Skipped)
734  .Case("ifdef", PDK_Skipped)
735  .Case("ifndef", PDK_Skipped)
736  .Case("elif", PDK_Skipped)
737  .Case("elifdef", PDK_Skipped)
738  .Case("elifndef", PDK_Skipped)
739  .Case("else", PDK_Skipped)
740  .Case("endif", PDK_Skipped)
741  .Default(PDK_Unknown);
742 
743  switch (PDK) {
744  case PDK_Skipped:
745  continue;
746 
747  case PDK_Unknown:
748  // We don't know what this directive is; stop at the '#'.
749  break;
750  }
751  }
752 
753  // We only end up here if we didn't recognize the preprocessor
754  // directive or it was one that can't occur in the preamble at this
755  // point. Roll back the current token to the location of the '#'.
756  TheTok = HashTok;
757  } else if (TheTok.isAtStartOfLine() &&
758  TheTok.getKind() == tok::raw_identifier &&
759  TheTok.getRawIdentifier() == "module" &&
760  LangOpts.CPlusPlusModules) {
761  // The initial global module fragment introducer "module;" is part of
762  // the preamble, which runs up to the module declaration "module foo;".
763  Token ModuleTok = TheTok;
764  do {
765  TheLexer.LexFromRawLexer(TheTok);
766  } while (TheTok.getKind() == tok::comment);
767  if (TheTok.getKind() != tok::semi) {
768  // Not global module fragment, roll back.
769  TheTok = ModuleTok;
770  break;
771  }
772  continue;
773  }
774 
775  // We hit a token that we don't recognize as being in the
776  // "preprocessing only" part of the file, so we're no longer in
777  // the preamble.
778  break;
779  } while (true);
780 
782  if (ActiveCommentLoc.isValid())
783  End = ActiveCommentLoc; // don't truncate a decl comment.
784  else
785  End = TheTok.getLocation();
786 
787  return PreambleBounds(End.getRawEncoding() - FileLoc.getRawEncoding(),
788  TheTok.isAtStartOfLine());
789 }
790 
791 unsigned Lexer::getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo,
792  const SourceManager &SM,
793  const LangOptions &LangOpts) {
794  // Figure out how many physical characters away the specified expansion
795  // character is. This needs to take into consideration newlines and
796  // trigraphs.
797  bool Invalid = false;
798  const char *TokPtr = SM.getCharacterData(TokStart, &Invalid);
799 
800  // If they request the first char of the token, we're trivially done.
801  if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr)))
802  return 0;
803 
804  unsigned PhysOffset = 0;
805 
806  // The usual case is that tokens don't contain anything interesting. Skip
807  // over the uninteresting characters. If a token only consists of simple
808  // chars, this method is extremely fast.
809  while (Lexer::isObviouslySimpleCharacter(*TokPtr)) {
810  if (CharNo == 0)
811  return PhysOffset;
812  ++TokPtr;
813  --CharNo;
814  ++PhysOffset;
815  }
816 
817  // If we have a character that may be a trigraph or escaped newline, use a
818  // lexer to parse it correctly.
819  for (; CharNo; --CharNo) {
820  auto CharAndSize = Lexer::getCharAndSizeNoWarn(TokPtr, LangOpts);
821  TokPtr += CharAndSize.Size;
822  PhysOffset += CharAndSize.Size;
823  }
824 
825  // Final detail: if we end up on an escaped newline, we want to return the
826  // location of the actual byte of the token. For example foo<newline>bar
827  // advanced by 3 should return the location of b, not of \\. One compounding
828  // detail of this is that the escape may be made by a trigraph.
829  if (!Lexer::isObviouslySimpleCharacter(*TokPtr))
830  PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr;
831 
832  return PhysOffset;
833 }
834 
835 /// Computes the source location just past the end of the
836 /// token at this source location.
837 ///
838 /// This routine can be used to produce a source location that
839 /// points just past the end of the token referenced by \p Loc, and
840 /// is generally used when a diagnostic needs to point just after a
841 /// token where it expected something different than it received. If
842 /// the returned source location would not be meaningful (e.g., if
843 /// it points into a macro), this routine returns an invalid
844 /// source location.
845 ///
846 /// \param Offset an offset from the end of the token, where the source
847 /// location should refer to. The default offset (0) produces a source
848 /// location pointing just past the end of the token; an offset of 1 produces
849 /// a source location pointing to the last character in the token, etc.
851  const SourceManager &SM,
852  const LangOptions &LangOpts) {
853  if (Loc.isInvalid())
854  return {};
855 
856  if (Loc.isMacroID()) {
857  if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
858  return {}; // Points inside the macro expansion.
859  }
860 
861  unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
862  if (Len > Offset)
863  Len = Len - Offset;
864  else
865  return Loc;
866 
867  return Loc.getLocWithOffset(Len);
868 }
869 
870 /// Returns true if the given MacroID location points at the first
871 /// token of the macro expansion.
873  const SourceManager &SM,
874  const LangOptions &LangOpts,
875  SourceLocation *MacroBegin) {
876  assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");
877 
878  SourceLocation expansionLoc;
879  if (!SM.isAtStartOfImmediateMacroExpansion(loc, &expansionLoc))
880  return false;
881 
882  if (expansionLoc.isFileID()) {
883  // No other macro expansions, this is the first.
884  if (MacroBegin)
885  *MacroBegin = expansionLoc;
886  return true;
887  }
888 
889  return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts, MacroBegin);
890 }
891 
892 /// Returns true if the given MacroID location points at the last
893 /// token of the macro expansion.
895  const SourceManager &SM,
896  const LangOptions &LangOpts,
897  SourceLocation *MacroEnd) {
// MacroEnd may be null; it is written only on the path that returns true
// after reaching a file location.
898  assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");
899 
// The token length is measured at the spelling location; a zero length
// makes the query fail.
900  SourceLocation spellLoc = SM.getSpellingLoc(loc);
901  unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts);
902  if (tokLen == 0)
903  return false;
904 
// The end-of-expansion test is made on the location tokLen characters
// after loc, i.e. just past the token.
905  SourceLocation afterLoc = loc.getLocWithOffset(tokLen);
906  SourceLocation expansionLoc;
907  if (!SM.isAtEndOfImmediateMacroExpansion(afterLoc, &expansionLoc))
908  return false;
909 
910  if (expansionLoc.isFileID()) {
911  // No other macro expansions.
912  if (MacroEnd)
913  *MacroEnd = expansionLoc;
914  return true;
915  }
916 
// expansionLoc is itself not a file location, so repeat the check for it.
917  return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts, MacroEnd);
918 }
919 
921  const SourceManager &SM,
922  const LangOptions &LangOpts) {
// NOTE(review): the statements that define Begin and End, and the final
// return statement, are not present in this listing; presumably Begin/End
// are the two ends of Range -- confirm against the full file.
925  assert(Begin.isFileID() && End.isFileID());
// For a token range, replace End by the location just past its token
// (offset 0); give up if that location cannot be computed.
926  if (Range.isTokenRange()) {
927  End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts);
928  if (End.isInvalid())
929  return {};
930  }
931 
932  // Break down the source locations.
933  FileID FID;
934  unsigned BeginOffs;
935  std::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin);
936  if (FID.isInvalid())
937  return {};
938 
// End must lie in the same FileID as Begin, and not before it.
939  unsigned EndOffs;
940  if (!SM.isInFileID(End, FID, &EndOffs) ||
941  BeginOffs > EndOffs)
942  return {};
943 
945 }
946 
947 // Assumes that `Loc` is in an expansion.
949  const SourceManager &SM) {
// Fetches the SLocEntry for Loc's FileID and returns what its expansion
// info reports from isExpansionTokenRange().
950  return SM.getSLocEntry(SM.getFileID(Loc))
951  .getExpansion()
952  .isExpansionTokenRange();
953 }
954 
956  const SourceManager &SM,
957  const LangOptions &LangOpts) {
// NOTE(review): the statements that define Begin and End are not present in
// this listing; presumably they are the two ends of Range -- confirm
// against the full file.
960  if (Begin.isInvalid() || End.isInvalid())
961  return {};
962 
// Case 1: both ends are file locations.
963  if (Begin.isFileID() && End.isFileID())
964  return makeRangeFromFileLocs(Range, SM, LangOpts);
965 
// Case 2: only Begin is a macro location. It must start its expansion;
// isAtStartOfMacroExpansion overwrites Begin through its last argument.
// NOTE(review): listing line 969 is missing between the check and the
// return below.
966  if (Begin.isMacroID() && End.isFileID()) {
967  if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin))
968  return {};
970  return makeRangeFromFileLocs(Range, SM, LangOpts);
971  }
972 
// Case 3: only End is a macro location. A token range needs End to be the
// last token of its expansion; a character range needs End to be at the
// start of its expansion.
973  if (Begin.isFileID() && End.isMacroID()) {
974  if (Range.isTokenRange()) {
975  if (!isAtEndOfMacroExpansion(End, SM, LangOpts, &End))
976  return {};
977  // Use the *original* end, not the expanded one in `End`.
978  Range.setTokenRange(isInExpansionTokenRange(Range.getEnd(), SM));
979  } else if (!isAtStartOfMacroExpansion(End, SM, LangOpts, &End))
980  return {};
981  Range.setEnd(End);
982  return makeRangeFromFileLocs(Range, SM, LangOpts);
983  }
984 
// Case 4: both ends are macro locations. First try to map both ends to the
// boundaries of their expansions.
985  assert(Begin.isMacroID() && End.isMacroID());
986  SourceLocation MacroBegin, MacroEnd;
987  if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) &&
988  ((Range.isTokenRange() && isAtEndOfMacroExpansion(End, SM, LangOpts,
989  &MacroEnd)) ||
990  (Range.isCharRange() && isAtStartOfMacroExpansion(End, SM, LangOpts,
991  &MacroEnd)))) {
992  Range.setBegin(MacroBegin);
993  Range.setEnd(MacroEnd);
994  // Use the *original* `End`, not the expanded one in `MacroEnd`.
995  if (Range.isTokenRange())
996  Range.setTokenRange(isInExpansionTokenRange(End, SM));
997  return makeRangeFromFileLocs(Range, SM, LangOpts);
998  }
999 
// Otherwise, if both ends are macro-argument expansions that share the same
// expansion start, retry on their immediate spelling locations.
1000  bool Invalid = false;
1001  const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(SM.getFileID(Begin),
1002  &Invalid);
1003  if (Invalid)
1004  return {};
1005 
1006  if (BeginEntry.getExpansion().isMacroArgExpansion()) {
1007  const SrcMgr::SLocEntry &EndEntry = SM.getSLocEntry(SM.getFileID(End),
1008  &Invalid);
1009  if (Invalid)
1010  return {};
1011 
1012  if (EndEntry.getExpansion().isMacroArgExpansion() &&
1013  BeginEntry.getExpansion().getExpansionLocStart() ==
1014  EndEntry.getExpansion().getExpansionLocStart()) {
1015  Range.setBegin(SM.getImmediateSpellingLoc(Begin));
1016  Range.setEnd(SM.getImmediateSpellingLoc(End));
1017  return makeFileCharRange(Range, SM, LangOpts);
1018  }
1019  }
1020 
// No mapping to a file range was found.
1021  return {};
1022 }
1023 
1025  const SourceManager &SM,
1026  const LangOptions &LangOpts,
1027  bool *Invalid) {
// Invalid is optional. When non-null it is set to true on every failure
// path and to false just before the successful return.
// The range is first converted with makeFileCharRange.
1028  Range = makeFileCharRange(Range, SM, LangOpts);
1029  if (Range.isInvalid()) {
1030  if (Invalid) *Invalid = true;
1031  return {};
1032  }
1033 
1034  // Break down the source location.
1035  std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin());
1036  if (beginInfo.first.isInvalid()) {
1037  if (Invalid) *Invalid = true;
1038  return {};
1039  }
1040 
// The end must be in the same FileID as the beginning and not before it.
1041  unsigned EndOffs;
1042  if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) ||
1043  beginInfo.second > EndOffs) {
1044  if (Invalid) *Invalid = true;
1045  return {};
1046  }
1047 
1048  // Try to load the file buffer.
1049  bool invalidTemp = false;
1050  StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp);
1051  if (invalidTemp) {
1052  if (Invalid) *Invalid = true;
1053  return {};
1054  }
1055 
// Return the characters between the two offsets of the buffer.
1056  if (Invalid) *Invalid = false;
1057  return file.substr(beginInfo.second, EndOffs - beginInfo.second);
1058 }
1059 
1061  const SourceManager &SM,
1062  const LangOptions &LangOpts) {
// Returns the text of the token found at the spelling location computed
// below, taken directly from the buffer that contains it.
1063  assert(Loc.isMacroID() && "Only reasonable to call this on macros");
1064 
1065  // Find the location of the immediate macro expansion.
1066  while (true) {
1067  FileID FID = SM.getFileID(Loc);
1068  const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID);
1069  const SrcMgr::ExpansionInfo &Expansion = E->getExpansion();
1070  Loc = Expansion.getExpansionLocStart();
// Anything other than a macro-argument expansion ends the walk here.
1071  if (!Expansion.isMacroArgExpansion())
1072  break;
1073 
1074  // For macro arguments we need to check that the argument did not come
1075  // from an inner macro, e.g: "MAC1( MAC2(foo) )"
1076 
1077  // Loc points to the argument id of the macro definition, move to the
1078  // macro expansion.
1079  Loc = SM.getImmediateExpansionRange(Loc).getBegin();
1080  SourceLocation SpellLoc = Expansion.getSpellingLoc();
1081  if (SpellLoc.isFileID())
1082  break; // No inner macro.
1083 
1084  // If spelling location resides in the same FileID as macro expansion
1085  // location, it means there is no inner macro.
1086  FileID MacroFID = SM.getFileID(Loc);
1087  if (SM.isInFileID(SpellLoc, MacroFID))
1088  break;
1089 
1090  // Argument came from inner macro.
1091  Loc = SpellLoc;
1092  }
1093 
1094  // Find the spelling location of the start of the non-argument expansion
1095  // range. This is where the macro name was spelled in order to begin
1096  // expanding this macro.
1097  Loc = SM.getSpellingLoc(Loc);
1098 
1099  // Dig out the buffer where the macro name was spelled and the extents of the
1100  // name so that we can render it into the expansion note.
1101  std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
1102  unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
1103  StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
1104  return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
1105 }
1106 
1108  SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) {
// Returns the text of the token at the spelling location computed below, or
// an empty result when the spelling location is not a file location or is
// written in scratch space.
1109  assert(Loc.isMacroID() && "Only reasonable to call this on macros");
1110  // Walk past macro argument expansions.
1111  while (SM.isMacroArgExpansion(Loc))
1112  Loc = SM.getImmediateExpansionRange(Loc).getBegin();
1113 
1114  // If the macro's spelling isn't FileID or from scratch space, then it's
1115  // actually a token paste or stringization (or similar) and not a macro at
1116  // all.
1117  SourceLocation SpellLoc = SM.getSpellingLoc(Loc);
1118  if (!SpellLoc.isFileID() || SM.isWrittenInScratchSpace(SpellLoc))
1119  return {};
1120 
1121  // Find the spelling location of the start of the non-argument expansion
1122  // range. This is where the macro name was spelled in order to begin
1123  // expanding this macro.
1124  Loc = SM.getSpellingLoc(SM.getImmediateExpansionRange(Loc).getBegin());
1125 
1126  // Dig out the buffer where the macro name was spelled and the extents of the
1127  // name so that we can render it into the expansion note.
1128  std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
1129  unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
1130  StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
1131  return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
1132 }
1133 
// Forwards to isAsciiIdentifierContinue, passing LangOpts.DollarIdents as
// its second argument.
// NOTE(review): the declarator line of this definition is not present in
// this listing.
1135  return isAsciiIdentifierContinue(c, LangOpts.DollarIdents);
1136 }
1137 
1138 bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) {
1139  assert(isVerticalWhitespace(Str[0]));
1140  if (Str - 1 < BufferStart)
1141  return false;
1142 
1143  if ((Str[0] == '\n' && Str[-1] == '\r') ||
1144  (Str[0] == '\r' && Str[-1] == '\n')) {
1145  if (Str - 2 < BufferStart)
1146  return false;
1147  --Str;
1148  }
1149  --Str;
1150 
1151  // Rewind to first non-space character:
1152  while (Str > BufferStart && isHorizontalWhitespace(*Str))
1153  --Str;
1154 
1155  return *Str == '\\';
1156 }
1157 
1159  const SourceManager &SM) {
// Returns the leading run of spaces and tabs of the line that contains Loc.
// Invalid or macro locations, an invalid FileID, an unreadable buffer, or a
// failed line search all produce an empty result.
1160  if (Loc.isInvalid() || Loc.isMacroID())
1161  return {};
1162  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
1163  if (LocInfo.first.isInvalid())
1164  return {};
1165  bool Invalid = false;
1166  StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
1167  if (Invalid)
1168  return {};
1169  const char *Line = findBeginningOfLine(Buffer, LocInfo.second);
1170  if (!Line)
1171  return {};
// Rest starts at the beginning of the line; the prefix before the first
// character that is neither ' ' nor '\t' is the indentation. If no such
// character exists in Rest, an empty string is returned.
1172  StringRef Rest = Buffer.substr(Line - Buffer.data());
1173  size_t NumWhitespaceChars = Rest.find_first_not_of(" \t");
1174  return NumWhitespaceChars == StringRef::npos
1175  ? ""
1176  : Rest.take_front(NumWhitespaceChars);
1177 }
1178 
1179 //===----------------------------------------------------------------------===//
1180 // Diagnostics forwarding code.
1181 //===----------------------------------------------------------------------===//
1182 
1183 /// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the
1184 /// lexer buffer was all expanded at a single point, perform the mapping.
1185 /// This is currently only used for _Pragma implementation, so it is the slow
1186 /// path of the hot getSourceLocation method. Do not allow it to be inlined.
1187 static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc(
1188  Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen);
// NOTE(review): the first line of the definition's declarator and the
// statement that introduces `SM` are not present in this listing.
1190  SourceLocation FileLoc,
1191  unsigned CharNo, unsigned TokLen) {
1192  assert(FileLoc.isMacroID() && "Must be a macro expansion");
1193 
1194  // Otherwise, we're lexing "mapped tokens". This is used for things like
1195  // _Pragma handling. Combine the expansion location of FileLoc with the
1196  // spelling location.
1198 
1199  // Create a new SLoc which is expanded from Expansion(FileLoc) but whose
1200  // characters come from spelling(FileLoc)+Offset.
1201  SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc);
1202  SpellingLoc = SpellingLoc.getLocWithOffset(CharNo);
1203 
1204  // Figure out the expansion loc range, which is the range covered by the
1205  // original _Pragma(...) sequence.
1206  CharSourceRange II = SM.getImmediateExpansionRange(FileLoc);
1207 
// The new location is spelled at SpellingLoc, spans TokLen characters, and
// expands over the range II.
1208  return SM.createExpansionLoc(SpellingLoc, II.getBegin(), II.getEnd(), TokLen);
1209 }
1210 
1211 /// getSourceLocation - Return a source location identifier for the specified
1212 /// offset in the current file.
1214  unsigned TokLen) const {
// Loc must point into [BufferStart, BufferEnd]; one-past-the-end is allowed.
1215  assert(Loc >= BufferStart && Loc <= BufferEnd &&
1216  "Location out of range for this buffer!");
1217 
1218  // In the normal case, we're just lexing from a simple file buffer, return
1219  // the file id from FileLoc with the offset specified.
1220  unsigned CharNo = Loc-BufferStart;
1221  if (FileLoc.isFileID())
1222  return FileLoc.getLocWithOffset(CharNo);
1223 
1224  // Otherwise, this is the _Pragma lexer case, which pretends that all of the
1225  // tokens are lexed from where the _Pragma was defined.
// This path dereferences PP, hence the assertion that it is non-null.
1226  assert(PP && "This doesn't work on raw lexers");
1227  return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen);
1228 }
1229 
1230 /// Diag - Forwarding function for diagnostics. This translate a source
1231 /// position in the current buffer into a SourceLocation object for rendering.
1232 DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const {
1233  return PP->Diag(getSourceLocation(Loc), DiagID);
1234 }
1235 
1236 //===----------------------------------------------------------------------===//
1237 // Trigraph and Escaped Newline Handling Code.
1238 //===----------------------------------------------------------------------===//
1239 
/// GetTrigraphCharForLetter - Map the third character of a "??x" sequence to
/// the character the trigraph stands for. Returns '\0' when \p Letter does not
/// complete a trigraph.
static char GetTrigraphCharForLetter(char Letter) {
  // Parallel tables: Letters[I] completes the trigraph that decodes to
  // Decoded[I].
  static constexpr char Letters[] = {'=', ')', '(', '!', '\'',
                                     '>', '/', '<', '-'};
  static constexpr char Decoded[] = {'#', ']', '[', '|', '^',
                                     '}', '\\', '{', '~'};
  static_assert(sizeof(Letters) == sizeof(Decoded),
                "trigraph tables must have the same length");

  for (unsigned Idx = 0; Idx != sizeof(Letters); ++Idx)
    if (Letters[Idx] == Letter)
      return Decoded[Idx];
  return 0;
}
1256 
1257 /// DecodeTrigraphChar - If the specified character is a legal trigraph when
1258 /// prefixed with ??, emit a trigraph warning. If trigraphs are enabled,
1259 /// return the result character. Finally, emit a warning about trigraph use
1260 /// whether trigraphs are enabled or not.
1261 static char DecodeTrigraphChar(const char *CP, Lexer *L, bool Trigraphs) {
1262  char Res = GetTrigraphCharForLetter(*CP);
1263  if (!Res)
1264  return Res;
1265 
1266  if (!Trigraphs) {
1267  if (L && !L->isLexingRawMode())
1268  L->Diag(CP-2, diag::trigraph_ignored);
1269  return 0;
1270  }
1271 
1272  if (L && !L->isLexingRawMode())
1273  L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1);
1274  return Res;
1275 }
1276 
1277 /// getEscapedNewLineSize - Return the size of the specified escaped newline,
1278 /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a
1279 /// trigraph equivalent on entry to this function.
1280 unsigned Lexer::getEscapedNewLineSize(const char *Ptr) {
1281  unsigned Size = 0;
1282  while (isWhitespace(Ptr[Size])) {
1283  ++Size;
1284 
1285  if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r')
1286  continue;
1287 
1288  // If this is a \r\n or \n\r, skip the other half.
1289  if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') &&
1290  Ptr[Size-1] != Ptr[Size])
1291  ++Size;
1292 
1293  return Size;
1294  }
1295 
1296  // Not an escaped newline, must be a \t or something else.
1297  return 0;
1298 }
1299 
1300 /// SkipEscapedNewLines - If P points to an escaped newline (or a series of
1301 /// them), skip over them and return the first non-escaped-newline found,
1302 /// otherwise return P.
1303 const char *Lexer::SkipEscapedNewLines(const char *P) {
1304  while (true) {
1305  const char *AfterEscape;
1306  if (*P == '\\') {
1307  AfterEscape = P+1;
1308  } else if (*P == '?') {
1309  // If not a trigraph for escape, bail out.
1310  if (P[1] != '?' || P[2] != '/')
1311  return P;
1312  // FIXME: Take LangOpts into account; the language might not
1313  // support trigraphs.
1314  AfterEscape = P+3;
1315  } else {
1316  return P;
1317  }
1318 
1319  unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape);
1320  if (NewLineSize == 0) return P;
1321  P = AfterEscape+NewLineSize;
1322  }
1323 }
1324 
1326  const SourceManager &SM,
1327  const LangOptions &LangOpts) {
// Raw-lexes one token starting just past the token at Loc and returns it.
// Returns std::nullopt when Loc is a macro location that is not at the end
// of its expansion, or when the file buffer cannot be obtained.
1328  if (Loc.isMacroID()) {
1329  if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
1330  return std::nullopt;
1331  }
1332  Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts);
1333 
1334  // Break down the source location.
1335  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
1336 
1337  // Try to load the file buffer.
1338  bool InvalidTemp = false;
1339  StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp);
1340  if (InvalidTemp)
1341  return std::nullopt;
1342 
1343  const char *TokenBegin = File.data() + LocInfo.second;
1344 
1345  // Lex from the start of the given location.
1346  Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(),
1347  TokenBegin, File.end());
1348  // Find the token.
1349  Token Tok;
1350  lexer.LexFromRawLexer(Tok);
1351  return Tok;
1352 }
1353 
1354 /// Checks that the given token is the first token that occurs after the
1355 /// given location (this excludes comments and whitespace). Returns the location
1356 /// immediately after the specified token. If the token is not found or the
1357 /// location is inside a macro, the returned source location will be invalid.
1360  const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine) {
// The token following Loc must exist and be of kind TKind; otherwise an
// invalid (default-constructed) location is returned.
1361  std::optional<Token> Tok = findNextToken(Loc, SM, LangOpts);
1362  if (!Tok || Tok->isNot(TKind))
1363  return {};
1364  SourceLocation TokenLoc = Tok->getLocation();
1365 
1366  // Calculate how much whitespace needs to be skipped if any.
1367  unsigned NumWhitespaceChars = 0;
1368  if (SkipTrailingWhitespaceAndNewLine) {
// Count horizontal whitespace directly after the token's last character.
1369  const char *TokenEnd = SM.getCharacterData(TokenLoc) + Tok->getLength();
1370  unsigned char C = *TokenEnd;
1371  while (isHorizontalWhitespace(C)) {
1372  C = *(++TokenEnd);
1373  NumWhitespaceChars++;
1374  }
1375 
1376  // Skip \r, \n, \r\n, or \n\r
// At most one line ending is consumed; a second newline character is
// counted only when it differs from the first.
1377  if (C == '\n' || C == '\r') {
1378  char PrevC = C;
1379  C = *(++TokenEnd);
1380  NumWhitespaceChars++;
1381  if ((C == '\n' || C == '\r') && C != PrevC)
1382  NumWhitespaceChars++;
1383  }
1384  }
1385 
1386  return TokenLoc.getLocWithOffset(Tok->getLength() + NumWhitespaceChars);
1387 }
1388 
1389 /// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
1390 /// get its size, and return it. This is tricky in several cases:
1391 /// 1. If currently at the start of a trigraph, we warn about the trigraph,
1392 /// then either return the trigraph (skipping 3 chars) or the '?',
1393 /// depending on whether trigraphs are enabled or not.
1394 /// 2. If this is an escaped newline (potentially with whitespace between
1395 /// the backslash and newline), implicitly skip the newline and return
1396 /// the char after it.
1397 ///
1398 /// This handles the slow/uncommon case of the getCharAndSize method. Here we
1399 /// know that we can accumulate into Size, and that we have already incremented
1400 /// Ptr by Size bytes.
1401 ///
1402 /// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
1403 /// be updated to match.
1404 Lexer::SizedChar Lexer::getCharAndSizeSlow(const char *Ptr, Token *Tok) {
1405  unsigned Size = 0;
1406  // If we have a slash, look for an escaped newline.
1407  if (Ptr[0] == '\\') {
1408  ++Size;
1409  ++Ptr;
1410 Slash:
1411  // Common case, backslash-char where the char is not whitespace.
1412  if (!isWhitespace(Ptr[0]))
1413  return {'\\', Size};
1414 
1415  // See if we have optional whitespace characters between the slash and
1416  // newline.
1417  if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
1418  // Remember that this token needs to be cleaned.
1419  if (Tok) Tok->setFlag(Token::NeedsCleaning);
1420 
1421  // Warn if there was whitespace between the backslash and newline.
1422  if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode())
1423  Diag(Ptr, diag::backslash_newline_space);
1424 
1425  // Found backslash<whitespace><newline>. Parse the char after it.
1426  Size += EscapedNewLineSize;
1427  Ptr += EscapedNewLineSize;
1428 
1429  // Use slow version to accumulate a correct size field.
1430  auto CharAndSize = getCharAndSizeSlow(Ptr, Tok);
1431  CharAndSize.Size += Size;
1432  return CharAndSize;
1433  }
1434 
1435  // Otherwise, this is not an escaped newline, just return the slash.
1436  return {'\\', Size};
1437  }
1438 
1439  // If this is a trigraph, process it.
1440  if (Ptr[0] == '?' && Ptr[1] == '?') {
1441  // If this is actually a legal trigraph (not something like "??x"), emit
1442  // a trigraph warning. If so, and if trigraphs are enabled, return it.
1443  if (char C = DecodeTrigraphChar(Ptr + 2, Tok ? this : nullptr,
1444  LangOpts.Trigraphs)) {
1445  // Remember that this token needs to be cleaned.
1446  if (Tok) Tok->setFlag(Token::NeedsCleaning);
1447 
1448  Ptr += 3;
1449  Size += 3;
1450  if (C == '\\') goto Slash;
1451  return {C, Size};
1452  }
1453  }
1454 
1455  // If this is neither, return a single character.
1456  return {*Ptr, Size + 1u};
1457 }
1458 
1459 /// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
1460 /// getCharAndSizeNoWarn method. Here we know that we can accumulate into Size,
1461 /// and that we have already incremented Ptr by Size bytes.
1462 ///
1463 /// NOTE: When this method is updated, getCharAndSizeSlow (above) should
1464 /// be updated to match.
1465 Lexer::SizedChar Lexer::getCharAndSizeSlowNoWarn(const char *Ptr,
1466  const LangOptions &LangOpts) {
1467 
1468  unsigned Size = 0;
1469  // If we have a slash, look for an escaped newline.
1470  if (Ptr[0] == '\\') {
1471  ++Size;
1472  ++Ptr;
1473 Slash:
1474  // Common case, backslash-char where the char is not whitespace.
1475  if (!isWhitespace(Ptr[0]))
1476  return {'\\', Size};
1477 
1478  // See if we have optional whitespace characters followed by a newline.
1479  if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
1480  // Found backslash<whitespace><newline>. Parse the char after it.
1481  Size += EscapedNewLineSize;
1482  Ptr += EscapedNewLineSize;
1483 
1484  // Use slow version to accumulate a correct size field.
1485  auto CharAndSize = getCharAndSizeSlowNoWarn(Ptr, LangOpts);
1486  CharAndSize.Size += Size;
1487  return CharAndSize;
1488  }
1489 
1490  // Otherwise, this is not an escaped newline, just return the slash.
1491  return {'\\', Size};
1492  }
1493 
1494  // If this is a trigraph, process it.
1495  if (LangOpts.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') {
1496  // If this is actually a legal trigraph (not something like "??x"), return
1497  // it.
1498  if (char C = GetTrigraphCharForLetter(Ptr[2])) {
1499  Ptr += 3;
1500  Size += 3;
1501  if (C == '\\') goto Slash;
1502  return {C, Size};
1503  }
1504  }
1505 
1506  // If this is neither, return a single character.
1507  return {*Ptr, Size + 1u};
1508 }
1509 
1510 //===----------------------------------------------------------------------===//
1511 // Helper methods for lexing.
1512 //===----------------------------------------------------------------------===//
1513 
1514 /// Routine that indiscriminately sets the offset into the source file.
1515 void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) {
1516  BufferPtr = BufferStart + Offset;
1517  if (BufferPtr > BufferEnd)
1518  BufferPtr = BufferEnd;
1519  // FIXME: What exactly does the StartOfLine bit mean? There are two
1520  // possible meanings for the "start" of the line: the first token on the
1521  // unexpanded line, or the first token on the expanded line.
1522  IsAtStartOfLine = StartOfLine;
1523  IsAtPhysicalStartOfLine = StartOfLine;
1524 }
1525 
1526 static bool isUnicodeWhitespace(uint32_t Codepoint) {
// Membership test against a function-local static character set.
// NOTE(review): the constructor argument of UnicodeWhitespaceChars is not
// present in this listing.
1527  static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
1529  return UnicodeWhitespaceChars.contains(Codepoint);
1530 }
1531 
// Formats C as upper-case hexadecimal digits with a requested width of 4
// into a small string buffer and returns that buffer.
// NOTE(review): the declarator line of this definition is not present in
// this listing.
1533  llvm::SmallString<5> CharBuf;
1534  llvm::raw_svector_ostream CharOS(CharBuf);
1535  llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4);
1536  return CharBuf;
1537 }
1538 
1539 // To mitigate https://github.com/llvm/llvm-project/issues/54732,
1540 // we allow "Mathematical Notation Characters" in identifiers.
1541 // This is a proposed profile that extends the XID_Start/XID_continue
1542 // with mathematical symbols, superscript and subscript digits
1543 // found in some production software.
1544 // https://www.unicode.org/L2/L2022/22230-math-profile.pdf
1545 static bool isMathematicalExtensionID(uint32_t C, const LangOptions &LangOpts,
1546  bool IsStart, bool &IsExtension) {
// NOTE(review): the constructor arguments of the two character sets are not
// present in this listing.
1547  static const llvm::sys::UnicodeCharSet MathStartChars(
1549  static const llvm::sys::UnicodeCharSet MathContinueChars(
// A character of the start set is always accepted; one of the continue set
// only when IsStart is false. IsExtension is set to true on acceptance and
// left untouched otherwise.
1551  if (MathStartChars.contains(C) ||
1552  (!IsStart && MathContinueChars.contains(C))) {
1553  IsExtension = true;
1554  return true;
1555  }
1556  return false;
1557 }
1558 
1559 static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts,
1560  bool &IsExtension) {
// The checks are ordered: assembler preprocessing rejects everything, then
// '$' is accepted when DollarIdents is on, then the table matching the
// language mode (C++/C23, C11, otherwise C99) decides.
// IsExtension is only written on the C++/C23 path, through
// isMathematicalExtensionID.
1561  if (LangOpts.AsmPreprocessor) {
1562  return false;
1563  } else if (LangOpts.DollarIdents && '$' == C) {
1564  return true;
1565  } else if (LangOpts.CPlusPlus || LangOpts.C23) {
1566  // A non-leading codepoint must have the XID_Continue property.
1567  // XIDContinueRanges doesn't contain characters also in XIDStartRanges,
1568  // so we need to check both tables.
1569  // '_' doesn't have the XID_Continue property but is allowed in C and C++.
1570  static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
1571  static const llvm::sys::UnicodeCharSet XIDContinueChars(XIDContinueRanges);
1572  if (C == '_' || XIDStartChars.contains(C) || XIDContinueChars.contains(C))
1573  return true;
1574  return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/false,
1575  IsExtension);
1576  } else if (LangOpts.C11) {
// NOTE(review): the constructor argument of this set is not present in
// this listing.
1577  static const llvm::sys::UnicodeCharSet C11AllowedIDChars(
1579  return C11AllowedIDChars.contains(C);
1580  } else {
// NOTE(review): the constructor argument of this set is not present in
// this listing.
1581  static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
1583  return C99AllowedIDChars.contains(C);
1584  }
1585 }
1586 
1587 static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts,
1588  bool &IsExtension) {
// Only non-ASCII code points may be passed in (asserted). IsExtension is
// reset to false before any check.
1589  assert(C > 0x7F && "isAllowedInitiallyIDChar called with an ASCII codepoint");
1590  IsExtension = false;
1591  if (LangOpts.AsmPreprocessor) {
1592  return false;
1593  }
// C++/C23: the character must be in the XID_Start table or be accepted as
// a mathematical extension start character.
1594  if (LangOpts.CPlusPlus || LangOpts.C23) {
1595  static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
1596  if (XIDStartChars.contains(C))
1597  return true;
1598  return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/true,
1599  IsExtension);
1600  }
// Other modes: the character must be a valid identifier character at all,
// and must not be in the mode's "disallowed initially" set.
// NOTE(review): the constructor arguments of both sets below are not
// present in this listing.
1601  if (!isAllowedIDChar(C, LangOpts, IsExtension))
1602  return false;
1603  if (LangOpts.C11) {
1604  static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars(
1606  return !C11DisallowedInitialIDChars.contains(C);
1607  }
1608  static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
1610  return !C99DisallowedInitialIDChars.contains(C);
1611 }
1612 
1613 static void diagnoseExtensionInIdentifier(DiagnosticsEngine &Diags, uint32_t C,
1614  CharSourceRange Range) {
1615 
// NOTE(review): the constructor arguments of the two character sets are not
// present in this listing.
1616  static const llvm::sys::UnicodeCharSet MathStartChars(
1618  static const llvm::sys::UnicodeCharSet MathContinueChars(
1620 
// The sets are referenced only by the assertion; the void casts keep them
// "used" when the assertion is compiled out.
1621  (void)MathStartChars;
1622  (void)MathContinueChars;
1623  assert((MathStartChars.contains(C) || MathContinueChars.contains(C)) &&
1624  "Unexpected mathematical notation codepoint");
// Report ext_mathematical_notation at the start of Range, passing the code
// point as a hex string and the range itself.
1625  Diags.Report(Range.getBegin(), diag::ext_mathematical_notation)
1626  << codepointAsHexString(C) << Range;
1627 }
1628 
1629 static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin,
1630  const char *End) {
// NOTE(review): the first line of the return statement is not present in
// this listing; the visible part converts End with L.getSourceLocation.
1632  L.getSourceLocation(End));
1633 }
1634 
1635 static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C,
1636  CharSourceRange Range, bool IsFirst) {
1637  // Check C99 compatibility.
// Nothing is computed when warn_c99_compat_unicode_id is ignored at the
// start of Range.
1638  if (!Diags.isIgnored(diag::warn_c99_compat_unicode_id, Range.getBegin())) {
// Values streamed into the diagnostic as its last argument.
1639  enum {
1640  CannotAppearInIdentifier = 0,
1641  CannotStartIdentifier
1642  };
1643 
// NOTE(review): the constructor arguments of the two character sets are not
// present in this listing.
1644  static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
1646  static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
// At most one warning is emitted: "cannot appear" takes precedence over
// "cannot start", and the latter is checked only when IsFirst is set.
1648  if (!C99AllowedIDChars.contains(C)) {
1649  Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
1650  << Range
1651  << CannotAppearInIdentifier;
1652  } else if (IsFirst && C99DisallowedInitialIDChars.contains(C)) {
1653  Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
1654  << Range
1655  << CannotStartIdentifier;
1656  }
1657  }
1658 }
1659 
1660 /// After encountering UTF-8 character C and interpreting it as an identifier
1661 /// character, check whether it's a homoglyph for a common non-identifier
1662 /// source character that is unlikely to be an intentional identifier
1663 /// character and warn if so.
1664 static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C,
1665  CharSourceRange Range) {
1666  // FIXME: Handle Unicode quotation marks (smart quotes, fullwidth quotes).
1667  struct HomoglyphPair {
1668  uint32_t Character;
1669  char LooksLike;
1670  bool operator<(HomoglyphPair R) const { return Character < R.Character; }
1671  };
1672  static constexpr HomoglyphPair SortedHomoglyphs[] = {
1673  {U'\u00ad', 0}, // SOFT HYPHEN
1674  {U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK
1675  {U'\u037e', ';'}, // GREEK QUESTION MARK
1676  {U'\u200b', 0}, // ZERO WIDTH SPACE
1677  {U'\u200c', 0}, // ZERO WIDTH NON-JOINER
1678  {U'\u200d', 0}, // ZERO WIDTH JOINER
1679  {U'\u2060', 0}, // WORD JOINER
1680  {U'\u2061', 0}, // FUNCTION APPLICATION
1681  {U'\u2062', 0}, // INVISIBLE TIMES
1682  {U'\u2063', 0}, // INVISIBLE SEPARATOR
1683  {U'\u2064', 0}, // INVISIBLE PLUS
1684  {U'\u2212', '-'}, // MINUS SIGN
1685  {U'\u2215', '/'}, // DIVISION SLASH
1686  {U'\u2216', '\\'}, // SET MINUS
1687  {U'\u2217', '*'}, // ASTERISK OPERATOR
1688  {U'\u2223', '|'}, // DIVIDES
1689  {U'\u2227', '^'}, // LOGICAL AND
1690  {U'\u2236', ':'}, // RATIO
1691  {U'\u223c', '~'}, // TILDE OPERATOR
1692  {U'\ua789', ':'}, // MODIFIER LETTER COLON
1693  {U'\ufeff', 0}, // ZERO WIDTH NO-BREAK SPACE
1694  {U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK
1695  {U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN
1696  {U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN
1697  {U'\uff05', '%'}, // FULLWIDTH PERCENT SIGN
1698  {U'\uff06', '&'}, // FULLWIDTH AMPERSAND
1699  {U'\uff08', '('}, // FULLWIDTH LEFT PARENTHESIS
1700  {U'\uff09', ')'}, // FULLWIDTH RIGHT PARENTHESIS
1701  {U'\uff0a', '*'}, // FULLWIDTH ASTERISK
1702  {U'\uff0b', '+'}, // FULLWIDTH ASTERISK
1703  {U'\uff0c', ','}, // FULLWIDTH COMMA
1704  {U'\uff0d', '-'}, // FULLWIDTH HYPHEN-MINUS
1705  {U'\uff0e', '.'}, // FULLWIDTH FULL STOP
1706  {U'\uff0f', '/'}, // FULLWIDTH SOLIDUS
1707  {U'\uff1a', ':'}, // FULLWIDTH COLON
1708  {U'\uff1b', ';'}, // FULLWIDTH SEMICOLON
1709  {U'\uff1c', '<'}, // FULLWIDTH LESS-THAN SIGN
1710  {U'\uff1d', '='}, // FULLWIDTH EQUALS SIGN
1711  {U'\uff1e', '>'}, // FULLWIDTH GREATER-THAN SIGN
1712  {U'\uff1f', '?'}, // FULLWIDTH QUESTION MARK
1713  {U'\uff20', '@'}, // FULLWIDTH COMMERCIAL AT
1714  {U'\uff3b', '['}, // FULLWIDTH LEFT SQUARE BRACKET
1715  {U'\uff3c', '\\'}, // FULLWIDTH REVERSE SOLIDUS
1716  {U'\uff3d', ']'}, // FULLWIDTH RIGHT SQUARE BRACKET
1717  {U'\uff3e', '^'}, // FULLWIDTH CIRCUMFLEX ACCENT
1718  {U'\uff5b', '{'}, // FULLWIDTH LEFT CURLY BRACKET
1719  {U'\uff5c', '|'}, // FULLWIDTH VERTICAL LINE
1720  {U'\uff5d', '}'}, // FULLWIDTH RIGHT CURLY BRACKET
1721  {U'\uff5e', '~'}, // FULLWIDTH TILDE
1722  {0, 0}
1723  };
1724  auto Homoglyph =
1725  std::lower_bound(std::begin(SortedHomoglyphs),
1726  std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'});
1727  if (Homoglyph->Character == C) {
1728  if (Homoglyph->LooksLike) {
1729  const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};
1730  Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)
1731  << Range << codepointAsHexString(C) << LooksLikeStr;
1732  } else {
1733  Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width)
1734  << Range << codepointAsHexString(C);
1735  }
1736  }
1737 }
1738 
1740  DiagnosticsEngine &Diags, const LangOptions &LangOpts, uint32_t CodePoint,
1741  CharSourceRange Range, bool IsFirst) {
// ASCII code points are never diagnosed here.
1742  if (isASCII(CodePoint))
1743  return;
1744 
1745  bool IsExtension;
1746  bool IsIDStart = isAllowedInitiallyIDChar(CodePoint, LangOpts, IsExtension);
1747  bool IsIDContinue =
1748  IsIDStart || isAllowedIDChar(CodePoint, LangOpts, IsExtension);
1749 
// Nothing to report when the code point is valid in the position it is in.
1750  if ((IsFirst && IsIDStart) || (!IsFirst && IsIDContinue))
1751  return;
1752 
// True when the code point would be fine later in an identifier but cannot
// begin one.
1753  bool InvalidOnlyAtStart = IsFirst && !IsIDStart && IsIDContinue;
1754 
// NOTE(review): the last line of each Report statement below is not present
// in this listing.
1755  if (!IsFirst || InvalidOnlyAtStart) {
1756  Diags.Report(Range.getBegin(), diag::err_character_not_allowed_identifier)
1757  << Range << codepointAsHexString(CodePoint) << int(InvalidOnlyAtStart)
1759  } else {
1760  Diags.Report(Range.getBegin(), diag::err_character_not_allowed)
1761  << Range << codepointAsHexString(CodePoint)
1763  }
1764 }
1765 
1766 bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
1767  Token &Result) {
1768  const char *UCNPtr = CurPtr + Size;
1769  uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/nullptr);
1770  if (CodePoint == 0) {
1771  return false;
1772  }
1773  bool IsExtension = false;
1774  if (!isAllowedIDChar(CodePoint, LangOpts, IsExtension)) {
1775  if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint))
1776  return false;
1780  PP->getDiagnostics(), LangOpts, CodePoint,
1781  makeCharRange(*this, CurPtr, UCNPtr),
1782  /*IsFirst=*/false);
1783 
1784  // We got a unicode codepoint that is neither a space nor a
1785  // a valid identifier part.
1786  // Carry on as if the codepoint was valid for recovery purposes.
1787  } else if (!isLexingRawMode()) {
1788  if (IsExtension)
1790  makeCharRange(*this, CurPtr, UCNPtr));
1791 
1793  makeCharRange(*this, CurPtr, UCNPtr),
1794  /*IsFirst=*/false);
1795  }
1796 
1797  Result.setFlag(Token::HasUCN);
1798  if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') ||
1799  (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U'))
1800  CurPtr = UCNPtr;
1801  else
1802  while (CurPtr != UCNPtr)
1803  (void)getAndAdvanceChar(CurPtr, Result);
1804  return true;
1805 }
1806 
1807 bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr, Token &Result) {
1808  llvm::UTF32 CodePoint;
1809 
1810  // If a UTF-8 codepoint appears immediately after an escaped new line,
1811  // CurPtr may point to the splicing \ on the preceding line,
1812  // so we need to skip it.
1813  unsigned FirstCodeUnitSize;
1814  getCharAndSize(CurPtr, FirstCodeUnitSize);
1815  const char *CharStart = CurPtr + FirstCodeUnitSize - 1;
1816  const char *UnicodePtr = CharStart;
1817 
1818  llvm::ConversionResult ConvResult = llvm::convertUTF8Sequence(
1819  (const llvm::UTF8 **)&UnicodePtr, (const llvm::UTF8 *)BufferEnd,
1820  &CodePoint, llvm::strictConversion);
1821  if (ConvResult != llvm::conversionOK)
1822  return false;
1823 
1824  bool IsExtension = false;
1825  if (!isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts,
1826  IsExtension)) {
1827  if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint))
1828  return false;
1829 
1833  PP->getDiagnostics(), LangOpts, CodePoint,
1834  makeCharRange(*this, CharStart, UnicodePtr), /*IsFirst=*/false);
1835  // We got a unicode codepoint that is neither a space nor a
1836  // a valid identifier part. Carry on as if the codepoint was
1837  // valid for recovery purposes.
1838  } else if (!isLexingRawMode()) {
1839  if (IsExtension)
1841  PP->getDiagnostics(), CodePoint,
1842  makeCharRange(*this, CharStart, UnicodePtr));
1844  makeCharRange(*this, CharStart, UnicodePtr),
1845  /*IsFirst=*/false);
1847  makeCharRange(*this, CharStart, UnicodePtr));
1848  }
1849 
1850  // Once we sucessfully parsed some UTF-8,
1851  // calling ConsumeChar ensures the NeedsCleaning flag is set on the token
1852  // being lexed, and that warnings about trailing spaces are emitted.
1853  ConsumeChar(CurPtr, FirstCodeUnitSize, Result);
1854  CurPtr = UnicodePtr;
1855  return true;
1856 }
1857 
1858 bool Lexer::LexUnicodeIdentifierStart(Token &Result, uint32_t C,
1859  const char *CurPtr) {
1860  bool IsExtension = false;
1861  if (isAllowedInitiallyIDChar(C, LangOpts, IsExtension)) {
1863  !PP->isPreprocessedOutput()) {
1864  if (IsExtension)
1866  makeCharRange(*this, BufferPtr, CurPtr));
1868  makeCharRange(*this, BufferPtr, CurPtr),
1869  /*IsFirst=*/true);
1871  makeCharRange(*this, BufferPtr, CurPtr));
1872  }
1873 
1874  MIOpt.ReadToken();
1875  return LexIdentifierContinue(Result, CurPtr);
1876  }
1877 
1879  !PP->isPreprocessedOutput() && !isASCII(*BufferPtr) &&
1880  !isUnicodeWhitespace(C)) {
1881  // Non-ASCII characters tend to creep into source code unintentionally.
1882  // Instead of letting the parser complain about the unknown token,
1883  // just drop the character.
1884  // Note that we can /only/ do this when the non-ASCII character is actually
1885  // spelled as Unicode, not written as a UCN. The standard requires that
1886  // we not throw away any possible preprocessor tokens, but there's a
1887  // loophole in the mapping of Unicode characters to basic character set
1888  // characters that allows us to map these particular characters to, say,
1889  // whitespace.
1891  PP->getDiagnostics(), LangOpts, C,
1892  makeCharRange(*this, BufferPtr, CurPtr), /*IsStart*/ true);
1893  BufferPtr = CurPtr;
1894  return false;
1895  }
1896 
1897  // Otherwise, we have an explicit UCN or a character that's unlikely to show
1898  // up by accident.
1899  MIOpt.ReadToken();
1900  FormTokenWithChars(Result, CurPtr, tok::unknown);
1901  return true;
1902 }
1903 
1904 static const char *
1905 fastParseASCIIIdentifier(const char *CurPtr,
1906  [[maybe_unused]] const char *BufferEnd) {
1907 #ifdef __SSE4_2__
1908  alignas(16) static constexpr char AsciiIdentifierRange[16] = {
1909  '_', '_', 'A', 'Z', 'a', 'z', '0', '9',
1910  };
1911  constexpr ssize_t BytesPerRegister = 16;
1912 
1913  __m128i AsciiIdentifierRangeV =
1914  _mm_load_si128((const __m128i *)AsciiIdentifierRange);
1915 
1916  while (LLVM_LIKELY(BufferEnd - CurPtr >= BytesPerRegister)) {
1917  __m128i Cv = _mm_loadu_si128((const __m128i *)(CurPtr));
1918 
1919  int Consumed = _mm_cmpistri(AsciiIdentifierRangeV, Cv,
1922  CurPtr += Consumed;
1923  if (Consumed == BytesPerRegister)
1924  continue;
1925  return CurPtr;
1926  }
1927 #endif
1928 
1929  unsigned char C = *CurPtr;
1930  while (isAsciiIdentifierContinue(C))
1931  C = *++CurPtr;
1932  return CurPtr;
1933 }
1934 
1935 bool Lexer::LexIdentifierContinue(Token &Result, const char *CurPtr) {
1936  // Match [_A-Za-z0-9]*, we have already matched an identifier start.
1937 
1938  while (true) {
1939 
1940  CurPtr = fastParseASCIIIdentifier(CurPtr, BufferEnd);
1941 
1942  unsigned Size;
1943  // Slow path: handle trigraph, unicode codepoints, UCNs.
1944  unsigned char C = getCharAndSize(CurPtr, Size);
1945  if (isAsciiIdentifierContinue(C)) {
1946  CurPtr = ConsumeChar(CurPtr, Size, Result);
1947  continue;
1948  }
1949  if (C == '$') {
1950  // If we hit a $ and they are not supported in identifiers, we are done.
1951  if (!LangOpts.DollarIdents)
1952  break;
1953  // Otherwise, emit a diagnostic and continue.
1954  if (!isLexingRawMode())
1955  Diag(CurPtr, diag::ext_dollar_in_identifier);
1956  CurPtr = ConsumeChar(CurPtr, Size, Result);
1957  continue;
1958  }
1959  if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
1960  continue;
1961  if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
1962  continue;
1963  // Neither an expected Unicode codepoint nor a UCN.
1964  break;
1965  }
1966 
1967  const char *IdStart = BufferPtr;
1968  FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
1969  Result.setRawIdentifierData(IdStart);
1970 
1971  // If we are in raw mode, return this identifier raw. There is no need to
1972  // look up identifier information or attempt to macro expand it.
1973  if (LexingRawMode)
1974  return true;
1975 
1976  // Fill in Result.IdentifierInfo and update the token kind,
1977  // looking up the identifier in the identifier table.
1978  const IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);
1979  // Note that we have to call PP->LookUpIdentifierInfo() even for code
1980  // completion, it writes IdentifierInfo into Result, and callers rely on it.
1981 
1982  // If the completion point is at the end of an identifier, we want to treat
1983  // the identifier as incomplete even if it resolves to a macro or a keyword.
1984  // This allows e.g. 'class^' to complete to 'classifier'.
1985  if (isCodeCompletionPoint(CurPtr)) {
1986  // Return the code-completion token.
1987  Result.setKind(tok::code_completion);
1988  // Skip the code-completion char and all immediate identifier characters.
1989  // This ensures we get consistent behavior when completing at any point in
1990  // an identifier (i.e. at the start, in the middle, at the end). Note that
1991  // only simple cases (i.e. [a-zA-Z0-9_]) are supported to keep the code
1992  // simpler.
1993  assert(*CurPtr == 0 && "Completion character must be 0");
1994  ++CurPtr;
1995  // Note that code completion token is not added as a separate character
1996  // when the completion point is at the end of the buffer. Therefore, we need
1997  // to check if the buffer has ended.
1998  if (CurPtr < BufferEnd) {
1999  while (isAsciiIdentifierContinue(*CurPtr))
2000  ++CurPtr;
2001  }
2002  BufferPtr = CurPtr;
2003  return true;
2004  }
2005 
2006  // Finally, now that we know we have an identifier, pass this off to the
2007  // preprocessor, which may macro expand it or something.
2008  if (II->isHandleIdentifierCase())
2009  return PP->HandleIdentifier(Result);
2010 
2011  return true;
2012 }
2013 
2014 /// isHexaLiteral - Return true if Start points to a hex constant.
2015 /// in microsoft mode (where this is supposed to be several different tokens).
2016 bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) {
2017  auto CharAndSize1 = Lexer::getCharAndSizeNoWarn(Start, LangOpts);
2018  char C1 = CharAndSize1.Char;
2019  if (C1 != '0')
2020  return false;
2021 
2022  auto CharAndSize2 =
2023  Lexer::getCharAndSizeNoWarn(Start + CharAndSize1.Size, LangOpts);
2024  char C2 = CharAndSize2.Char;
2025  return (C2 == 'x' || C2 == 'X');
2026 }
2027 
2028 /// LexNumericConstant - Lex the remainder of a integer or floating point
2029 /// constant. From[-1] is the first character lexed. Return the end of the
2030 /// constant.
2031 bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
2032  unsigned Size;
2033  char C = getCharAndSize(CurPtr, Size);
2034  char PrevCh = 0;
2035  while (isPreprocessingNumberBody(C)) {
2036  CurPtr = ConsumeChar(CurPtr, Size, Result);
2037  PrevCh = C;
2038  if (LangOpts.HLSL && C == '.' && (*CurPtr == 'x' || *CurPtr == 'r')) {
2039  CurPtr -= Size;
2040  break;
2041  }
2042  C = getCharAndSize(CurPtr, Size);
2043  }
2044 
2045  // If we fell out, check for a sign, due to 1e+12. If we have one, continue.
2046  if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) {
2047  // If we are in Microsoft mode, don't continue if the constant is hex.
2048  // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1
2049  if (!LangOpts.MicrosoftExt || !isHexaLiteral(BufferPtr, LangOpts))
2050  return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
2051  }
2052 
2053  // If we have a hex FP constant, continue.
2054  if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) {
2055  // Outside C99 and C++17, we accept hexadecimal floating point numbers as a
2056  // not-quite-conforming extension. Only do so if this looks like it's
2057  // actually meant to be a hexfloat, and not if it has a ud-suffix.
2058  bool IsHexFloat = true;
2059  if (!LangOpts.C99) {
2060  if (!isHexaLiteral(BufferPtr, LangOpts))
2061  IsHexFloat = false;
2062  else if (!LangOpts.CPlusPlus17 &&
2063  std::find(BufferPtr, CurPtr, '_') != CurPtr)
2064  IsHexFloat = false;
2065  }
2066  if (IsHexFloat)
2067  return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
2068  }
2069 
2070  // If we have a digit separator, continue.
2071  if (C == '\'' && (LangOpts.CPlusPlus14 || LangOpts.C23)) {
2072  auto [Next, NextSize] = getCharAndSizeNoWarn(CurPtr + Size, LangOpts);
2073  if (isAsciiIdentifierContinue(Next)) {
2074  if (!isLexingRawMode())
2075  Diag(CurPtr, LangOpts.CPlusPlus
2076  ? diag::warn_cxx11_compat_digit_separator
2077  : diag::warn_c23_compat_digit_separator);
2078  CurPtr = ConsumeChar(CurPtr, Size, Result);
2079  CurPtr = ConsumeChar(CurPtr, NextSize, Result);
2080  return LexNumericConstant(Result, CurPtr);
2081  }
2082  }
2083 
2084  // If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue.
2085  if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
2086  return LexNumericConstant(Result, CurPtr);
2087  if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
2088  return LexNumericConstant(Result, CurPtr);
2089 
2090  // Update the location of token as well as BufferPtr.
2091  const char *TokStart = BufferPtr;
2092  FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
2093  Result.setLiteralData(TokStart);
2094  return true;
2095 }
2096 
2097 /// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes
2098 /// in C++11, or warn on a ud-suffix in C++98.
2099 const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
2100  bool IsStringLiteral) {
2101  assert(LangOpts.CPlusPlus);
2102 
2103  // Maximally munch an identifier.
2104  unsigned Size;
2105  char C = getCharAndSize(CurPtr, Size);
2106  bool Consumed = false;
2107 
2108  if (!isAsciiIdentifierStart(C)) {
2109  if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
2110  Consumed = true;
2111  else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
2112  Consumed = true;
2113  else
2114  return CurPtr;
2115  }
2116 
2117  if (!LangOpts.CPlusPlus11) {
2118  if (!isLexingRawMode())
2119  Diag(CurPtr,
2120  C == '_' ? diag::warn_cxx11_compat_user_defined_literal
2121  : diag::warn_cxx11_compat_reserved_user_defined_literal)
2123  return CurPtr;
2124  }
2125 
2126  // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix
2127  // that does not start with an underscore is ill-formed. As a conforming
2128  // extension, we treat all such suffixes as if they had whitespace before
2129  // them. We assume a suffix beginning with a UCN or UTF-8 character is more
2130  // likely to be a ud-suffix than a macro, however, and accept that.
2131  if (!Consumed) {
2132  bool IsUDSuffix = false;
2133  if (C == '_')
2134  IsUDSuffix = true;
2135  else if (IsStringLiteral && LangOpts.CPlusPlus14) {
2136  // In C++1y, we need to look ahead a few characters to see if this is a
2137  // valid suffix for a string literal or a numeric literal (this could be
2138  // the 'operator""if' defining a numeric literal operator).
2139  const unsigned MaxStandardSuffixLength = 3;
2140  char Buffer[MaxStandardSuffixLength] = { C };
2141  unsigned Consumed = Size;
2142  unsigned Chars = 1;
2143  while (true) {
2144  auto [Next, NextSize] =
2145  getCharAndSizeNoWarn(CurPtr + Consumed, LangOpts);
2146  if (!isAsciiIdentifierContinue(Next)) {
2147  // End of suffix. Check whether this is on the allowed list.
2148  const StringRef CompleteSuffix(Buffer, Chars);
2149  IsUDSuffix =
2150  StringLiteralParser::isValidUDSuffix(LangOpts, CompleteSuffix);
2151  break;
2152  }
2153 
2154  if (Chars == MaxStandardSuffixLength)
2155  // Too long: can't be a standard suffix.
2156  break;
2157 
2158  Buffer[Chars++] = Next;
2159  Consumed += NextSize;
2160  }
2161  }
2162 
2163  if (!IsUDSuffix) {
2164  if (!isLexingRawMode())
2165  Diag(CurPtr, LangOpts.MSVCCompat
2166  ? diag::ext_ms_reserved_user_defined_literal
2167  : diag::ext_reserved_user_defined_literal)
2169  return CurPtr;
2170  }
2171 
2172  CurPtr = ConsumeChar(CurPtr, Size, Result);
2173  }
2174 
2175  Result.setFlag(Token::HasUDSuffix);
2176  while (true) {
2177  C = getCharAndSize(CurPtr, Size);
2178  if (isAsciiIdentifierContinue(C)) {
2179  CurPtr = ConsumeChar(CurPtr, Size, Result);
2180  } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {
2181  } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result)) {
2182  } else
2183  break;
2184  }
2185 
2186  return CurPtr;
2187 }
2188 
2189 /// LexStringLiteral - Lex the remainder of a string literal, after having lexed
2190 /// either " or L" or u8" or u" or U".
2191 bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
2192  tok::TokenKind Kind) {
2193  const char *AfterQuote = CurPtr;
2194  // Does this string contain the \0 character?
2195  const char *NulCharacter = nullptr;
2196 
2197  if (!isLexingRawMode() &&
2198  (Kind == tok::utf8_string_literal ||
2199  Kind == tok::utf16_string_literal ||
2200  Kind == tok::utf32_string_literal))
2201  Diag(BufferPtr, LangOpts.CPlusPlus ? diag::warn_cxx98_compat_unicode_literal
2202  : diag::warn_c99_compat_unicode_literal);
2203 
2204  char C = getAndAdvanceChar(CurPtr, Result);
2205  while (C != '"') {
2206  // Skip escaped characters. Escaped newlines will already be processed by
2207  // getAndAdvanceChar.
2208  if (C == '\\')
2209  C = getAndAdvanceChar(CurPtr, Result);
2210 
2211  if (C == '\n' || C == '\r' || // Newline.
2212  (C == 0 && CurPtr-1 == BufferEnd)) { // End of file.
2213  if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
2214  Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 1;
2215  FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2216  return true;
2217  }
2218 
2219  if (C == 0) {
2220  if (isCodeCompletionPoint(CurPtr-1)) {
2221  if (ParsingFilename)
2222  codeCompleteIncludedFile(AfterQuote, CurPtr - 1, /*IsAngled=*/false);
2223  else
2225  FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
2226  cutOffLexing();
2227  return true;
2228  }
2229 
2230  NulCharacter = CurPtr-1;
2231  }
2232  C = getAndAdvanceChar(CurPtr, Result);
2233  }
2234 
2235  // If we are in C++11, lex the optional ud-suffix.
2236  if (LangOpts.CPlusPlus)
2237  CurPtr = LexUDSuffix(Result, CurPtr, true);
2238 
2239  // If a nul character existed in the string, warn about it.
2240  if (NulCharacter && !isLexingRawMode())
2241  Diag(NulCharacter, diag::null_in_char_or_string) << 1;
2242 
2243  // Update the location of the token as well as the BufferPtr instance var.
2244  const char *TokStart = BufferPtr;
2245  FormTokenWithChars(Result, CurPtr, Kind);
2246  Result.setLiteralData(TokStart);
2247  return true;
2248 }
2249 
2250 /// LexRawStringLiteral - Lex the remainder of a raw string literal, after
2251 /// having lexed R", LR", u8R", uR", or UR".
2252 bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
2253  tok::TokenKind Kind) {
2254  // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3:
2255  // Between the initial and final double quote characters of the raw string,
2256  // any transformations performed in phases 1 and 2 (trigraphs,
2257  // universal-character-names, and line splicing) are reverted.
2258 
2259  if (!isLexingRawMode())
2260  Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal);
2261 
2262  unsigned PrefixLen = 0;
2263 
2264  while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen])) {
2265  ++PrefixLen;
2266  if (!isLexingRawMode() &&
2267  llvm::is_contained({'$', '@', '`'}, CurPtr[PrefixLen])) {
2268  const char *Pos = &CurPtr[PrefixLen];
2269  Diag(Pos, LangOpts.CPlusPlus26
2270  ? diag::warn_cxx26_compat_raw_string_literal_character_set
2271  : diag::ext_cxx26_raw_string_literal_character_set)
2272  << StringRef(Pos, 1);
2273  }
2274  }
2275 
2276  // If the last character was not a '(', then we didn't lex a valid delimiter.
2277  if (CurPtr[PrefixLen] != '(') {
2278  if (!isLexingRawMode()) {
2279  const char *PrefixEnd = &CurPtr[PrefixLen];
2280  if (PrefixLen == 16) {
2281  Diag(PrefixEnd, diag::err_raw_delim_too_long);
2282  } else if (*PrefixEnd == '\n') {
2283  Diag(PrefixEnd, diag::err_invalid_newline_raw_delim);
2284  } else {
2285  Diag(PrefixEnd, diag::err_invalid_char_raw_delim)
2286  << StringRef(PrefixEnd, 1);
2287  }
2288  }
2289 
2290  // Search for the next '"' in hopes of salvaging the lexer. Unfortunately,
2291  // it's possible the '"' was intended to be part of the raw string, but
2292  // there's not much we can do about that.
2293  while (true) {
2294  char C = *CurPtr++;
2295 
2296  if (C == '"')
2297  break;
2298  if (C == 0 && CurPtr-1 == BufferEnd) {
2299  --CurPtr;
2300  break;
2301  }
2302  }
2303 
2304  FormTokenWithChars(Result, CurPtr, tok::unknown);
2305  return true;
2306  }
2307 
2308  // Save prefix and move CurPtr past it
2309  const char *Prefix = CurPtr;
2310  CurPtr += PrefixLen + 1; // skip over prefix and '('
2311 
2312  while (true) {
2313  char C = *CurPtr++;
2314 
2315  if (C == ')') {
2316  // Check for prefix match and closing quote.
2317  if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') {
2318  CurPtr += PrefixLen + 1; // skip over prefix and '"'
2319  break;
2320  }
2321  } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file.
2322  if (!isLexingRawMode())
2323  Diag(BufferPtr, diag::err_unterminated_raw_string)
2324  << StringRef(Prefix, PrefixLen);
2325  FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2326  return true;
2327  }
2328  }
2329 
2330  // If we are in C++11, lex the optional ud-suffix.
2331  if (LangOpts.CPlusPlus)
2332  CurPtr = LexUDSuffix(Result, CurPtr, true);
2333 
2334  // Update the location of token as well as BufferPtr.
2335  const char *TokStart = BufferPtr;
2336  FormTokenWithChars(Result, CurPtr, Kind);
2337  Result.setLiteralData(TokStart);
2338  return true;
2339 }
2340 
2341 /// LexAngledStringLiteral - Lex the remainder of an angled string literal,
2342 /// after having lexed the '<' character. This is used for #include filenames.
2343 bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
2344  // Does this string contain the \0 character?
2345  const char *NulCharacter = nullptr;
2346  const char *AfterLessPos = CurPtr;
2347  char C = getAndAdvanceChar(CurPtr, Result);
2348  while (C != '>') {
2349  // Skip escaped characters. Escaped newlines will already be processed by
2350  // getAndAdvanceChar.
2351  if (C == '\\')
2352  C = getAndAdvanceChar(CurPtr, Result);
2353 
2354  if (isVerticalWhitespace(C) || // Newline.
2355  (C == 0 && (CurPtr - 1 == BufferEnd))) { // End of file.
2356  // If the filename is unterminated, then it must just be a lone <
2357  // character. Return this as such.
2358  FormTokenWithChars(Result, AfterLessPos, tok::less);
2359  return true;
2360  }
2361 
2362  if (C == 0) {
2363  if (isCodeCompletionPoint(CurPtr - 1)) {
2364  codeCompleteIncludedFile(AfterLessPos, CurPtr - 1, /*IsAngled=*/true);
2365  cutOffLexing();
2366  FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
2367  return true;
2368  }
2369  NulCharacter = CurPtr-1;
2370  }
2371  C = getAndAdvanceChar(CurPtr, Result);
2372  }
2373 
2374  // If a nul character existed in the string, warn about it.
2375  if (NulCharacter && !isLexingRawMode())
2376  Diag(NulCharacter, diag::null_in_char_or_string) << 1;
2377 
2378  // Update the location of token as well as BufferPtr.
2379  const char *TokStart = BufferPtr;
2380  FormTokenWithChars(Result, CurPtr, tok::header_name);
2381  Result.setLiteralData(TokStart);
2382  return true;
2383 }
2384 
2385 void Lexer::codeCompleteIncludedFile(const char *PathStart,
2386  const char *CompletionPoint,
2387  bool IsAngled) {
2388  // Completion only applies to the filename, after the last slash.
2389  StringRef PartialPath(PathStart, CompletionPoint - PathStart);
2390  llvm::StringRef SlashChars = LangOpts.MSVCCompat ? "/\\" : "/";
2391  auto Slash = PartialPath.find_last_of(SlashChars);
2392  StringRef Dir =
2393  (Slash == StringRef::npos) ? "" : PartialPath.take_front(Slash);
2394  const char *StartOfFilename =
2395  (Slash == StringRef::npos) ? PathStart : PathStart + Slash + 1;
2396  // Code completion filter range is the filename only, up to completion point.
2398  StringRef(StartOfFilename, CompletionPoint - StartOfFilename)));
2399  // We should replace the characters up to the closing quote or closest slash,
2400  // if any.
2401  while (CompletionPoint < BufferEnd) {
2402  char Next = *(CompletionPoint + 1);
2403  if (Next == 0 || Next == '\r' || Next == '\n')
2404  break;
2405  ++CompletionPoint;
2406  if (Next == (IsAngled ? '>' : '"'))
2407  break;
2408  if (SlashChars.contains(Next))
2409  break;
2410  }
2411 
2413  FileLoc.getLocWithOffset(StartOfFilename - BufferStart),
2414  FileLoc.getLocWithOffset(CompletionPoint - BufferStart));
2415  PP->CodeCompleteIncludedFile(Dir, IsAngled);
2416 }
2417 
2418 /// LexCharConstant - Lex the remainder of a character constant, after having
2419 /// lexed either ' or L' or u8' or u' or U'.
2420 bool Lexer::LexCharConstant(Token &Result, const char *CurPtr,
2421  tok::TokenKind Kind) {
2422  // Does this character contain the \0 character?
2423  const char *NulCharacter = nullptr;
2424 
2425  if (!isLexingRawMode()) {
2426  if (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant)
2427  Diag(BufferPtr, LangOpts.CPlusPlus
2428  ? diag::warn_cxx98_compat_unicode_literal
2429  : diag::warn_c99_compat_unicode_literal);
2430  else if (Kind == tok::utf8_char_constant)
2431  Diag(BufferPtr, diag::warn_cxx14_compat_u8_character_literal);
2432  }
2433 
2434  char C = getAndAdvanceChar(CurPtr, Result);
2435  if (C == '\'') {
2436  if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
2437  Diag(BufferPtr, diag::ext_empty_character);
2438  FormTokenWithChars(Result, CurPtr, tok::unknown);
2439  return true;
2440  }
2441 
2442  while (C != '\'') {
2443  // Skip escaped characters.
2444  if (C == '\\')
2445  C = getAndAdvanceChar(CurPtr, Result);
2446 
2447  if (C == '\n' || C == '\r' || // Newline.
2448  (C == 0 && CurPtr-1 == BufferEnd)) { // End of file.
2449  if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
2450  Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 0;
2451  FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2452  return true;
2453  }
2454 
2455  if (C == 0) {
2456  if (isCodeCompletionPoint(CurPtr-1)) {
2458  FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2459  cutOffLexing();
2460  return true;
2461  }
2462 
2463  NulCharacter = CurPtr-1;
2464  }
2465  C = getAndAdvanceChar(CurPtr, Result);
2466  }
2467 
2468  // If we are in C++11, lex the optional ud-suffix.
2469  if (LangOpts.CPlusPlus)
2470  CurPtr = LexUDSuffix(Result, CurPtr, false);
2471 
2472  // If a nul character existed in the character, warn about it.
2473  if (NulCharacter && !isLexingRawMode())
2474  Diag(NulCharacter, diag::null_in_char_or_string) << 0;
2475 
2476  // Update the location of token as well as BufferPtr.
2477  const char *TokStart = BufferPtr;
2478  FormTokenWithChars(Result, CurPtr, Kind);
2479  Result.setLiteralData(TokStart);
2480  return true;
2481 }
2482 
2483 /// SkipWhitespace - Efficiently skip over a series of whitespace characters.
2484 /// Update BufferPtr to point to the next non-whitespace character and return.
2485 ///
2486 /// This method forms a token and returns true if KeepWhitespaceMode is enabled.
2487 bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr,
2488  bool &TokAtPhysicalStartOfLine) {
2489  // Whitespace - Skip it, then return the token after the whitespace.
2490  bool SawNewline = isVerticalWhitespace(CurPtr[-1]);
2491 
2492  unsigned char Char = *CurPtr;
2493 
2494  const char *lastNewLine = nullptr;
2495  auto setLastNewLine = [&](const char *Ptr) {
2496  lastNewLine = Ptr;
2497  if (!NewLinePtr)
2498  NewLinePtr = Ptr;
2499  };
2500  if (SawNewline)
2501  setLastNewLine(CurPtr - 1);
2502 
2503  // Skip consecutive spaces efficiently.
2504  while (true) {
2505  // Skip horizontal whitespace very aggressively.
2506  while (isHorizontalWhitespace(Char))
2507  Char = *++CurPtr;
2508 
2509  // Otherwise if we have something other than whitespace, we're done.
2510  if (!isVerticalWhitespace(Char))
2511  break;
2512 
2514  // End of preprocessor directive line, let LexTokenInternal handle this.
2515  BufferPtr = CurPtr;
2516  return false;
2517  }
2518 
2519  // OK, but handle newline.
2520  if (*CurPtr == '\n')
2521  setLastNewLine(CurPtr);
2522  SawNewline = true;
2523  Char = *++CurPtr;
2524  }
2525 
2526  // If the client wants us to return whitespace, return it now.
2527  if (isKeepWhitespaceMode()) {
2528  FormTokenWithChars(Result, CurPtr, tok::unknown);
2529  if (SawNewline) {
2530  IsAtStartOfLine = true;
2531  IsAtPhysicalStartOfLine = true;
2532  }
2533  // FIXME: The next token will not have LeadingSpace set.
2534  return true;
2535  }
2536 
2537  // If this isn't immediately after a newline, there is leading space.
2538  char PrevChar = CurPtr[-1];
2539  bool HasLeadingSpace = !isVerticalWhitespace(PrevChar);
2540 
2541  Result.setFlagValue(Token::LeadingSpace, HasLeadingSpace);
2542  if (SawNewline) {
2543  Result.setFlag(Token::StartOfLine);
2544  TokAtPhysicalStartOfLine = true;
2545 
2546  if (NewLinePtr && lastNewLine && NewLinePtr != lastNewLine && PP) {
2547  if (auto *Handler = PP->getEmptylineHandler())
2548  Handler->HandleEmptyline(SourceRange(getSourceLocation(NewLinePtr + 1),
2549  getSourceLocation(lastNewLine)));
2550  }
2551  }
2552 
2553  BufferPtr = CurPtr;
2554  return false;
2555 }
2556 
2557 /// We have just read the // characters from input. Skip until we find the
2558 /// newline character that terminates the comment. Then update BufferPtr and
2559 /// return.
2560 ///
2561 /// If we're in KeepCommentMode or any CommentHandler has inserted
2562 /// some tokens, this will store the first token and return true.
2563 bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
2564  bool &TokAtPhysicalStartOfLine) {
2565  // If Line comments aren't explicitly enabled for this language, emit an
2566  // extension warning.
2567  if (!LineComment) {
2568  if (!isLexingRawMode()) // There's no PP in raw mode, so can't emit diags.
2569  Diag(BufferPtr, diag::ext_line_comment);
2570 
2571  // Mark them enabled so we only emit one warning for this translation
2572  // unit.
2573  LineComment = true;
2574  }
2575 
2576  // Scan over the body of the comment. The common case, when scanning, is that
2577  // the comment contains normal ascii characters with nothing interesting in
2578  // them. As such, optimize for this case with the inner loop.
2579  //
2580  // This loop terminates with CurPtr pointing at the newline (or end of buffer)
2581  // character that ends the line comment.
2582 
2583  // C++23 [lex.phases] p1
2584  // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
2585  // diagnostic only once per entire ill-formed subsequence to avoid
2586  // emitting too many diagnostics (see http://unicode.org/review/pr-121.html).
2587  bool UnicodeDecodingAlreadyDiagnosed = false;
2588 
2589  char C;
2590  while (true) {
2591  C = *CurPtr;
2592  // Skip over characters in the fast loop.
2593  while (isASCII(C) && C != 0 && // Potentially EOF.
2594  C != '\n' && C != '\r') { // Newline or DOS-style newline.
2595  C = *++CurPtr;
2596  UnicodeDecodingAlreadyDiagnosed = false;
2597  }
2598 
2599  if (!isASCII(C)) {
2600  unsigned Length = llvm::getUTF8SequenceSize(
2601  (const llvm::UTF8 *)CurPtr, (const llvm::UTF8 *)BufferEnd);
2602  if (Length == 0) {
2603  if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
2604  Diag(CurPtr, diag::warn_invalid_utf8_in_comment);
2605  UnicodeDecodingAlreadyDiagnosed = true;
2606  ++CurPtr;
2607  } else {
2608  UnicodeDecodingAlreadyDiagnosed = false;
2609  CurPtr += Length;
2610  }
2611  continue;
2612  }
2613 
2614  const char *NextLine = CurPtr;
2615  if (C != 0) {
2616  // We found a newline, see if it's escaped.
2617  const char *EscapePtr = CurPtr-1;
2618  bool HasSpace = false;
2619  while (isHorizontalWhitespace(*EscapePtr)) { // Skip whitespace.
2620  --EscapePtr;
2621  HasSpace = true;
2622  }
2623 
2624  if (*EscapePtr == '\\')
2625  // Escaped newline.
2626  CurPtr = EscapePtr;
2627  else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' &&
2628  EscapePtr[-2] == '?' && LangOpts.Trigraphs)
2629  // Trigraph-escaped newline.
2630  CurPtr = EscapePtr-2;
2631  else
2632  break; // This is a newline, we're done.
2633 
2634  // If there was space between the backslash and newline, warn about it.
2635  if (HasSpace && !isLexingRawMode())
2636  Diag(EscapePtr, diag::backslash_newline_space);
2637  }
2638 
2639  // Otherwise, this is a hard case. Fall back on getAndAdvanceChar to
2640  // properly decode the character. Read it in raw mode to avoid emitting
2641  // diagnostics about things like trigraphs. If we see an escaped newline,
2642  // we'll handle it below.
2643  const char *OldPtr = CurPtr;
2644  bool OldRawMode = isLexingRawMode();
2645  LexingRawMode = true;
2646  C = getAndAdvanceChar(CurPtr, Result);
2647  LexingRawMode = OldRawMode;
2648 
2649  // If we only read only one character, then no special handling is needed.
2650  // We're done and can skip forward to the newline.
2651  if (C != 0 && CurPtr == OldPtr+1) {
2652  CurPtr = NextLine;
2653  break;
2654  }
2655 
2656  // If we read multiple characters, and one of those characters was a \r or
2657  // \n, then we had an escaped newline within the comment. Emit diagnostic
2658  // unless the next line is also a // comment.
2659  if (CurPtr != OldPtr + 1 && C != '/' &&
2660  (CurPtr == BufferEnd + 1 || CurPtr[0] != '/')) {
2661  for (; OldPtr != CurPtr; ++OldPtr)
2662  if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
2663  // Okay, we found a // comment that ends in a newline, if the next
2664  // line is also a // comment, but has spaces, don't emit a diagnostic.
2665  if (isWhitespace(C)) {
2666  const char *ForwardPtr = CurPtr;
2667  while (isWhitespace(*ForwardPtr)) // Skip whitespace.
2668  ++ForwardPtr;
2669  if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
2670  break;
2671  }
2672 
2673  if (!isLexingRawMode())
2674  Diag(OldPtr-1, diag::ext_multi_line_line_comment);
2675  break;
2676  }
2677  }
2678 
2679  if (C == '\r' || C == '\n' || CurPtr == BufferEnd + 1) {
2680  --CurPtr;
2681  break;
2682  }
2683 
2684  if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
2686  cutOffLexing();
2687  return false;
2688  }
2689  }
2690 
2691  // Found but did not consume the newline. Notify comment handlers about the
2692  // comment unless we're in a #if 0 block.
2693  if (PP && !isLexingRawMode() &&
2694  PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
2695  getSourceLocation(CurPtr)))) {
2696  BufferPtr = CurPtr;
2697  return true; // A token has to be returned.
2698  }
2699 
2700  // If we are returning comments as tokens, return this comment as a token.
2701  if (inKeepCommentMode())
2702  return SaveLineComment(Result, CurPtr);
2703 
2704  // If we are inside a preprocessor directive and we see the end of line,
2705  // return immediately, so that the lexer can return this as an EOD token.
2706  if (ParsingPreprocessorDirective || CurPtr == BufferEnd) {
2707  BufferPtr = CurPtr;
2708  return false;
2709  }
2710 
2711  // Otherwise, eat the \n character. We don't care if this is a \n\r or
2712  // \r\n sequence. This is an efficiency hack (because we know the \n can't
2713  // contribute to another token), it isn't needed for correctness. Note that
2714  // this is ok even in KeepWhitespaceMode, because we would have returned the
2715  // comment above in that mode.
2716  NewLinePtr = CurPtr++;
2717 
2718  // The next returned token is at the start of the line.
2719  Result.setFlag(Token::StartOfLine);
2720  TokAtPhysicalStartOfLine = true;
2721  // No leading whitespace seen so far.
2722  Result.clearFlag(Token::LeadingSpace);
2723  BufferPtr = CurPtr;
2724  return false;
2725 }
2726 
2727 /// If in save-comment mode, package up this Line comment in an appropriate
2728 /// way and return it.
///
/// \param Result The token that is formed as a tok::comment and, on the
///        macro-definition path, handed to PP->CreateString together with the
///        rewritten spelling.
/// \param CurPtr Forwarded to FormTokenWithChars together with tok::comment;
///        presumably the end of the comment text (confirm against callers).
/// \returns true on every path visible in this listing.
2729 bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) {
2730  // If we're not in a preprocessor directive, just return the // comment
2731  // directly.
2732  FormTokenWithChars(Result, CurPtr, tok::comment);
2733 
      // NOTE(review): the listing jumps from 2733 to 2735, so the condition that
      // guards this early return is not visible here; as shown the return is
      // unconditional. Confirm against the upstream file.
2735  return true;
2736 
2737  // If this Line-style comment is in a macro definition, transmogrify it into
2738  // a C-style block comment.
2739  bool Invalid = false;
2740  std::string Spelling = PP->getSpelling(Result, &Invalid);
      // If the spelling could not be obtained, keep the token formed above.
2741  if (Invalid)
2742  return true;
2743 
2744  assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?");
2745  Spelling[1] = '*'; // Change prefix to "/*".
2746  Spelling += "*/"; // add suffix.
2747 
      // Re-create the token from the rewritten text, using the original token
      // location for both location arguments.
2748  Result.setKind(tok::comment);
2749  PP->CreateString(Spelling, Result,
2750  Result.getLocation(), Result.getLocation());
2751  return true;
2752 }
2753 
2754 /// isEndOfBlockCommentWithEscapedNewLine - Return true if the specified newline
2755 /// character (either \\n or \\r) is part of an escaped newline sequence. Issue
2756 /// a diagnostic if so. We know that the newline is inside of a block comment.
///
/// \param CurPtr Points at the newline character (asserted below). The scan
///        walks backwards from here.
/// \param L Lexer used to query raw mode and to emit diagnostics.
/// \param Trigraphs Whether trigraphs are enabled. If the escape was spelled
///        with a trigraph and this is false, the function diagnoses (outside
///        raw mode) and returns false.
/// \returns true only if, walking backwards over one or more escaped newlines,
///          a star is found immediately before them.
2757 static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, Lexer *L,
2758  bool Trigraphs) {
2759  assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');
2760 
2761  // Position of the first trigraph in the ending sequence.
2762  const char *TrigraphPos = nullptr;
2763  // Position of the first whitespace after a '\' in the ending sequence.
2764  const char *SpacePos = nullptr;
2765 
      // Each iteration consumes one escaped newline (scanning backwards). The
      // loop repeats when the character before the escape is itself a newline.
2766  while (true) {
2767  // Back up off the newline.
2768  --CurPtr;
2769 
2770  // If this is a two-character newline sequence, skip the other character.
2771  if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
2772  // \n\n or \r\r -> not escaped newline.
2773  if (CurPtr[0] == CurPtr[1])
2774  return false;
2775  // \n\r or \r\n -> skip the newline.
2776  --CurPtr;
2777  }
2778 
2779  // If we have horizontal whitespace, skip over it. We allow whitespace
2780  // between the slash and newline.
      // NUL bytes are skipped here as well. SpacePos ends up at the earliest
      // (lowest-address) character skipped.
2781  while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) {
2782  SpacePos = CurPtr;
2783  --CurPtr;
2784  }
2785 
2786  // If we have a slash, this is an escaped newline.
2787  if (*CurPtr == '\\') {
2788  --CurPtr;
2789  } else if (CurPtr[0] == '/' && CurPtr[-1] == '?' && CurPtr[-2] == '?') {
2790  // This is a trigraph encoding of a slash.
      // Remember where the three-character sequence starts, then step to the
      // character before it.
2791  TrigraphPos = CurPtr - 2;
2792  CurPtr -= 3;
2793  } else {
2794  return false;
2795  }
2796 
2797  // If the character preceding the escaped newline is a '*', then after line
2798  // splicing we have a '*/' ending the comment.
2799  if (*CurPtr == '*')
2800  break;
2801 
      // Anything other than another newline means this is not a comment end.
2802  if (*CurPtr != '\n' && *CurPtr != '\r')
2803  return false;
2804  }
2805 
      // From here on CurPtr points at the star found by the loop.
2806  if (TrigraphPos) {
2807  // If no trigraphs are enabled, warn that we ignored this trigraph and
2808  // ignore this * character.
2809  if (!Trigraphs) {
2810  if (!L->isLexingRawMode())
2811  L->Diag(TrigraphPos, diag::trigraph_ignored_block_comment);
2812  return false;
2813  }
2814  if (!L->isLexingRawMode())
2815  L->Diag(TrigraphPos, diag::trigraph_ends_block_comment);
2816  }
2817 
2818  // Warn about having an escaped newline between the */ characters.
      // The diagnostic is placed on the character right after the star.
2819  if (!L->isLexingRawMode())
2820  L->Diag(CurPtr + 1, diag::escaped_newline_block_comment_end);
2821 
2822  // If there was space between the backslash and newline, warn about it.
2823  if (SpacePos && !L->isLexingRawMode())
2824  L->Diag(SpacePos, diag::backslash_newline_space);
2825 
2826  return true;
2827 }
2828 
2829 #ifdef __SSE2__
2830 #include <emmintrin.h>
2831 #elif __ALTIVEC__
2832 #include <altivec.h>
2833 #undef bool
2834 #endif
2835 
2836 /// We have just read from input the / and * characters that started a comment.
2837 /// Read until we find the * and / characters that terminate the comment.
2838 /// Note that we don't bother decoding trigraphs or escaped newlines in block
2839 /// comments, because they cannot cause the comment to end. The only thing
2840 /// that can happen is the comment could end with an escaped newline between
2841 /// the terminating * and /.
2842 ///
2843 /// If we're in KeepCommentMode or any CommentHandler has inserted
2844 /// some tokens, this will store the first token and return true.
///
/// \param Result Token that is filled in when the comment (or, in
///        keep-whitespace mode, an unterminated comment) is returned as a
///        token.
/// \param CurPtr Position just after the two characters that opened the
///        comment, per the description above.
/// \param TokAtPhysicalStartOfLine Forwarded to SkipWhitespace when horizontal
///        whitespace directly follows the comment.
/// \returns true if a token was formed or a comment handler requested one;
///          false otherwise.
2845 bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
2846  bool &TokAtPhysicalStartOfLine) {
2847  // Scan one character past where we should, looking for a '/' character. Once
2848  // we find it, check to see if it was preceded by a *. This common
2849  // optimization helps people who like to put a lot of * characters in their
2850  // comments.
2851 
2852  // The first character we get with newlines and trigraphs skipped to handle
2853  // the degenerate /*/ case below correctly if the * has an escaped newline
2854  // after it.
2855  unsigned CharSize;
2856  unsigned char C = getCharAndSize(CurPtr, CharSize);
2857  CurPtr += CharSize;
      // A NUL read exactly at BufferEnd means the file ended right after the
      // comment opener.
2858  if (C == 0 && CurPtr == BufferEnd+1) {
2859  if (!isLexingRawMode())
2860  Diag(BufferPtr, diag::err_unterminated_block_comment);
2861  --CurPtr;
2862 
2863  // KeepWhitespaceMode should return this broken comment as a token. Since
2864  // it isn't a well formed comment, just return it as an 'unknown' token.
2865  if (isKeepWhitespaceMode()) {
2866  FormTokenWithChars(Result, CurPtr, tok::unknown);
2867  return true;
2868  }
2869 
2870  BufferPtr = CurPtr;
2871  return false;
2872  }
2873 
2874  // Check to see if the first character after the '/*' is another /. If so,
2875  // then this slash does not end the block comment, it is part of it.
2876  if (C == '/')
2877  C = *CurPtr++;
2878 
2879  // C++23 [lex.phases] p1
2880  // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
2881  // diagnostic only once per entire ill-formed subsequence to avoid
2882  // emitting too many diagnostics (see http://unicode.org/review/pr-121.html).
2883  bool UnicodeDecodingAlreadyDiagnosed = false;
2884 
      // Throughout this loop C holds the character just read and CurPtr points
      // one past it.
2885  while (true) {
2886  // Skip over all non-interesting characters until we find end of buffer or a
2887  // (probably ending) '/' character.
2888  if (CurPtr + 24 < BufferEnd &&
2889  // If there is a code-completion point avoid the fast scan because it
2890  // doesn't check for '\0'.
2891  !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
2892  // While not aligned to a 16-byte boundary.
2893  while (C != '/' && (intptr_t)CurPtr % 16 != 0) {
2894  if (!isASCII(C))
2895  goto MultiByteUTF8;
2896  C = *CurPtr++;
2897  }
2898  if (C == '/') goto FoundSlash;
2899 
2900 #ifdef __SSE2__
2901  __m128i Slashes = _mm_set1_epi8('/');
2902  while (CurPtr + 16 < BufferEnd) {
      // The mask gathers the top bit of each of the 16 bytes; a non-zero mask
      // means at least one byte is non-ASCII.
2903  int Mask = _mm_movemask_epi8(*(const __m128i *)CurPtr);
2904  if (LLVM_UNLIKELY(Mask != 0)) {
2905  goto MultiByteUTF8;
2906  }
2907  // look for slashes
2908  int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr,
2909  Slashes));
2910  if (cmp != 0) {
2911  // Adjust the pointer to point directly after the first slash. It's
2912  // not necessary to set C here, it will be overwritten at the end of
2913  // the outer loop.
2914  CurPtr += llvm::countr_zero<unsigned>(cmp) + 1;
2915  goto FoundSlash;
2916  }
2917  CurPtr += 16;
2918  }
2919 #elif __ALTIVEC__
2920  __vector unsigned char LongUTF = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
2921  0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
2922  0x80, 0x80, 0x80, 0x80};
2923  __vector unsigned char Slashes = {
2924  '/', '/', '/', '/', '/', '/', '/', '/',
2925  '/', '/', '/', '/', '/', '/', '/', '/'
2926  };
2927  while (CurPtr + 16 < BufferEnd) {
2928  if (LLVM_UNLIKELY(
2929  vec_any_ge(*(const __vector unsigned char *)CurPtr, LongUTF)))
2930  goto MultiByteUTF8;
2931  if (vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes)) {
2932  break;
2933  }
2934  CurPtr += 16;
2935  }
2936 
2937 #else
      // Portable fallback: examine 16 bytes per iteration without intrinsics.
2938  while (CurPtr + 16 < BufferEnd) {
2939  bool HasNonASCII = false;
2940  for (unsigned I = 0; I < 16; ++I)
2941  HasNonASCII |= !isASCII(CurPtr[I]);
2942 
2943  if (LLVM_UNLIKELY(HasNonASCII))
2944  goto MultiByteUTF8;
2945 
2946  bool HasSlash = false;
2947  for (unsigned I = 0; I < 16; ++I)
2948  HasSlash |= CurPtr[I] == '/';
2949  if (HasSlash)
2950  break;
2951  CurPtr += 16;
2952  }
2953 #endif
2954 
2955  // It has to be one of the bytes scanned, increment to it and read one.
2956  C = *CurPtr++;
2957  }
2958 
2959  // Loop to scan the remainder, warning on invalid UTF-8
2960  // if the corresponding warning is enabled, emitting a diagnostic only once
2961  // per sequence that cannot be decoded.
2962  while (C != '/' && C != '\0') {
2963  if (isASCII(C)) {
2964  UnicodeDecodingAlreadyDiagnosed = false;
2965  C = *CurPtr++;
2966  continue;
2967  }
      // Also entered by goto from the fast-scan paths above.
2968  MultiByteUTF8:
2969  // CurPtr is 1 code unit past C, so to decode
2970  // the codepoint, we need to read from the previous position.
2971  unsigned Length = llvm::getUTF8SequenceSize(
2972  (const llvm::UTF8 *)CurPtr - 1, (const llvm::UTF8 *)BufferEnd);
2973  if (Length == 0) {
2974  if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
2975  Diag(CurPtr - 1, diag::warn_invalid_utf8_in_comment);
2976  UnicodeDecodingAlreadyDiagnosed = true;
2977  } else {
2978  UnicodeDecodingAlreadyDiagnosed = false;
      // One byte of the sequence has already been consumed, so skip the rest.
2979  CurPtr += Length - 1;
2980  }
2981  C = *CurPtr++;
2982  }
2983 
2984  if (C == '/') {
      // At this label CurPtr points one past the slash, so CurPtr[-2] is the
      // character before the slash.
2985  FoundSlash:
2986  if (CurPtr[-2] == '*') // We found the final */. We're done!
2987  break;
2988 
2989  if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) {
2990  if (isEndOfBlockCommentWithEscapedNewLine(CurPtr - 2, this,
2991  LangOpts.Trigraphs)) {
2992  // We found the final */, though it had an escaped newline between the
2993  // * and /. We're done!
2994  break;
2995  }
2996  }
2997  if (CurPtr[0] == '*' && CurPtr[1] != '/') {
2998  // If this is a /* inside of the comment, emit a warning. Don't do this
2999  // if this is a /*/, which will end the comment. This misses cases with
3000  // embedded escaped newlines, but oh well.
3001  if (!isLexingRawMode())
3002  Diag(CurPtr-1, diag::warn_nested_block_comment);
3003  }
3004  } else if (C == 0 && CurPtr == BufferEnd+1) {
3005  if (!isLexingRawMode())
3006  Diag(BufferPtr, diag::err_unterminated_block_comment);
3007  // Note: the user probably forgot a */. We could continue immediately
3008  // after the /*, but this would involve lexing a lot of what really is the
3009  // comment, which surely would confuse the parser.
3010  --CurPtr;
3011 
3012  // KeepWhitespaceMode should return this broken comment as a token. Since
3013  // it isn't a well formed comment, just return it as an 'unknown' token.
3014  if (isKeepWhitespaceMode()) {
3015  FormTokenWithChars(Result, CurPtr, tok::unknown);
3016  return true;
3017  }
3018 
3019  BufferPtr = CurPtr;
3020  return false;
3021  } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
      // NOTE(review): the listing jumps from 3021 to 3023, so one statement of
      // this branch is not visible here. Confirm against the upstream file.
3023  cutOffLexing();
3024  return false;
3025  }
3026 
3027  C = *CurPtr++;
3028  }
3029 
3030  // Notify comment handlers about the comment unless we're in a #if 0 block.
3031  if (PP && !isLexingRawMode() &&
3032  PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
3033  getSourceLocation(CurPtr)))) {
3034  BufferPtr = CurPtr;
3035  return true; // A token has to be returned.
3036  }
3037 
3038  // If we are returning comments as tokens, return this comment as a token.
3039  if (inKeepCommentMode()) {
3040  FormTokenWithChars(Result, CurPtr, tok::comment);
3041  return true;
3042  }
3043 
3044  // It is common for the tokens immediately after a /**/ comment to be
3045  // whitespace. Instead of going through the big switch, handle it
3046  // efficiently now. This is safe even in KeepWhitespaceMode because we would
3047  // have already returned above with the comment as a token.
3048  if (isHorizontalWhitespace(*CurPtr)) {
3049  SkipWhitespace(Result, CurPtr+1, TokAtPhysicalStartOfLine);
3050  return false;
3051  }
3052 
3053  // Otherwise, just return so that the next character will be lexed as a token.
3054  BufferPtr = CurPtr;
3055  Result.setFlag(Token::LeadingSpace);
3056  return false;
3057 }
3058 
3059 //===----------------------------------------------------------------------===//
3060 // Primary Lexing Entry Points
3061 //===----------------------------------------------------------------------===//
3062 
3063 /// ReadToEndOfLine - Read the rest of the current preprocessor line as an
3064 /// uninterpreted string. This switches the lexer out of directive mode.
///
/// NOTE(review): the listing jumps from 3064 to 3066, so the function signature
/// is not visible here. From the body, `Result` is used as a nullable pointer
/// to a container that supports push_back(char); confirm the exact type
/// against the upstream file.
3066  assert(ParsingPreprocessorDirective && ParsingFilename == false &&
3067  "Must be in a preprocessing directive!");
3068  Token Tmp;
3069  Tmp.startToken();
3070 
3071  // CurPtr - Cache BufferPtr in an automatic variable.
3072  const char *CurPtr = BufferPtr;
3073  while (true) {
3074  char Char = getAndAdvanceChar(CurPtr, Tmp);
3075  switch (Char) {
3076  default:
      // Ordinary character: append it when the caller supplied a buffer.
3077  if (Result)
3078  Result->push_back(Char);
3079  break;
3080  case 0: // Null.
3081  // Found end of file?
3082  if (CurPtr-1 != BufferEnd) {
3083  if (isCodeCompletionPoint(CurPtr-1)) {
      // NOTE(review): listing line 3084 is missing, so one statement of this
      // branch is not visible here. Confirm against the upstream file.
3085  cutOffLexing();
3086  return;
3087  }
3088 
3089  // Nope, normal character, continue.
3090  if (Result)
3091  Result->push_back(Char);
3092  break;
3093  }
3094  // FALL THROUGH.
3095  [[fallthrough]];
3096  case '\r':
3097  case '\n':
3098  // Okay, we found the end of the line. First, back up past the \0, \r, \n.
3099  assert(CurPtr[-1] == Char && "Trigraphs for newline?");
3100  BufferPtr = CurPtr-1;
3101 
3102  // Next, lex the character, which should handle the EOD transition.
3103  Lex(Tmp);
3104  if (Tmp.is(tok::code_completion)) {
3105  if (PP)
      // NOTE(review): listing line 3106 is missing; as shown, the `if (PP)`
      // above appears to govern the Lex call below, which is probably not the
      // real structure. Confirm against the upstream file.
3107  Lex(Tmp);
3108  }
3109  assert(Tmp.is(tok::eod) && "Unexpected token!");
3110 
3111  // Finally, we're done;
3112  return;
3113  }
3114  }
3115 }
3116 
3117 /// LexEndOfFile - CurPtr points to the end of this file. Handle this
3118 /// condition, reporting diagnostics and handling other edge cases as required.
3119 /// This returns true if Result contains a token, false if PP.Lex should be
3120 /// called again.
///
/// \param Result Token that receives tok::eod or tok::eof on the early-return
///        paths, and is otherwise handed to PP->HandleEndOfFile.
/// \param CurPtr Current position; per the description above, the end of the
///        file.
3121 bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
3122  // If we hit the end of the file while parsing a preprocessor directive,
3123  // end the preprocessor directive first. The next token returned will
3124  // then be the end of file.
      // NOTE(review): listing lines 3125, 3127 and 3133 are missing. The closing
      // brace at 3135 has no visible opener and the `if (PP)` at 3132 has no
      // visible body, so the guard and part of this branch cannot be read here.
      // Confirm against the upstream file.
3126  // Done parsing the "line".
3128  // Update the location of token as well as BufferPtr.
3129  FormTokenWithChars(Result, CurPtr, tok::eod);
3130 
3131  // Restore comment saving mode, in case it was disabled for directive.
3132  if (PP)
3134  return true; // Have a token.
3135  }
3136 
3137  // If we are in raw mode, return this event as an EOF token. Let the caller
3138  // that put us in raw mode handle the event.
3139  if (isLexingRawMode()) {
3140  Result.startToken();
3141  BufferPtr = BufferEnd;
3142  FormTokenWithChars(Result, BufferEnd, tok::eof);
3143  return true;
3144  }
3145 
      // PP is dereferenced without a null check from here on. The raw-mode
      // early return above is the only visible exit before this point.
3146  if (PP->isRecordingPreamble() && PP->isInPrimaryFile()) {
      // NOTE(review): listing lines 3147 and 3152 are missing, so the statement
      // before the comment below and the body of the `if` at 3151 are not
      // visible here. Confirm against the upstream file.
3148  // If the preamble cuts off the end of a header guard, consider it guarded.
3149  // The guard is valid for the preamble content itself, and for tools the
3150  // most useful answer is "yes, this file has a header guard".
3151  if (!ConditionalStack.empty())
3153  ConditionalStack.clear();
3154  }
3155 
3156  // Issue diagnostics for unterminated #if and missing newline.
3157 
3158  // If we are in a #if directive, emit an error.
      // The error is suppressed for the file that holds the code-completion
      // point; the stack entry is popped either way.
3159  while (!ConditionalStack.empty()) {
3160  if (PP->getCodeCompletionFileLoc() != FileLoc)
3161  PP->Diag(ConditionalStack.back().IfLoc,
3162  diag::err_pp_unterminated_conditional);
3163  ConditionalStack.pop_back();
3164  }
3165 
3166  // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue
3167  // a pedwarn.
3168  if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) {
3169  DiagnosticsEngine &Diags = PP->getDiagnostics();
3170  SourceLocation EndLoc = getSourceLocation(BufferEnd);
3171  unsigned DiagID;
3172 
3173  if (LangOpts.CPlusPlus11) {
3174  // C++11 [lex.phases] 2.2 p2
3175  // Prefer the C++98 pedantic compatibility warning over the generic,
3176  // non-extension, user-requested "missing newline at EOF" warning.
3177  if (!Diags.isIgnored(diag::warn_cxx98_compat_no_newline_eof, EndLoc)) {
3178  DiagID = diag::warn_cxx98_compat_no_newline_eof;
3179  } else {
3180  DiagID = diag::warn_no_newline_eof;
3181  }
3182  } else {
3183  DiagID = diag::ext_no_newline_eof;
3184  }
3185 
      // Attach a fix-it that inserts the missing newline at the end of the
      // buffer.
3186  Diag(BufferEnd, DiagID)
3187  << FixItHint::CreateInsertion(EndLoc, "\n");
3188  }
3189 
3190  BufferPtr = CurPtr;
3191 
3192  // Finally, let the preprocessor handle this.
3193  return PP->HandleEndOfFile(Result, isPragmaLexer());
3194 }
3195 
3196 /// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from
3197 /// the specified lexer will return a tok::l_paren token, 0 if it is something
3198 /// else and 2 if there are no more tokens in the buffer controlled by the
3199 /// lexer.
///
/// The lexer state saved below is restored before returning, so the peek does
/// not consume the token.
3200 unsigned Lexer::isNextPPTokenLParen() {
3201  assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?");
3202 
      // In dependency-directives mode the answer is read from the token list of
      // the front directive, without lexing.
3203  if (isDependencyDirectivesLexer()) {
3204  if (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size())
3205  return 2;
3206  return DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is(
3207  tok::l_paren);
3208  }
3209 
3210  // Switch to 'skipping' mode. This will ensure that we can lex a token
3211  // without emitting diagnostics, disables macro expansion, and will cause EOF
3212  // to return an EOF token instead of popping the include stack.
3213  LexingRawMode = true;
3214 
3215  // Save state that can be changed while lexing so that we can restore it.
3216  const char *TmpBufferPtr = BufferPtr;
3217  bool inPPDirectiveMode = ParsingPreprocessorDirective;
3218  bool atStartOfLine = IsAtStartOfLine;
3219  bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
3220  bool leadingSpace = HasLeadingSpace;
3221 
3222  Token Tok;
3223  Lex(Tok);
3224 
3225  // Restore state that may have changed.
3226  BufferPtr = TmpBufferPtr;
3227  ParsingPreprocessorDirective = inPPDirectiveMode;
3228  HasLeadingSpace = leadingSpace;
3229  IsAtStartOfLine = atStartOfLine;
3230  IsAtPhysicalStartOfLine = atPhysicalStartOfLine;
3231 
3232  // Restore the lexer back to non-skipping mode.
      // Resetting to false is correct because the assert at the top requires
      // that raw mode was off on entry.
3233  LexingRawMode = false;
3234 
3235  if (Tok.is(tok::eof))
3236  return 2;
3237  return Tok.is(tok::l_paren);
3238 }
3239 
3240 /// Find the end of a version control conflict marker.
///
/// \param CurPtr Start of the region to search. The first TermLen characters
///        are skipped before searching.
/// \param BufferEnd End of the region to search.
/// \param CMK Selects the terminator text: "<<<<\n" for CMK_Perforce,
///        ">>>>>>>" otherwise.
/// \returns Pointer to the first terminator that is directly preceded by a
///          '\r' or '\n', or nullptr if there is none.
3241 static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd,
3242  ConflictMarkerKind CMK) {
3243  const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>";
3244  size_t TermLen = CMK == CMK_Perforce ? 5 : 7;
3245  auto RestOfBuffer = StringRef(CurPtr, BufferEnd - CurPtr).substr(TermLen);
3246  size_t Pos = RestOfBuffer.find(Terminator);
3247  while (Pos != StringRef::npos) {
3248  // Must occur at start of line.
      // A hit at offset 0 is rejected too, because no preceding character is
      // examined. In either case the search resumes just past the rejected hit.
3249  if (Pos == 0 ||
3250  (RestOfBuffer[Pos - 1] != '\r' && RestOfBuffer[Pos - 1] != '\n')) {
3251  RestOfBuffer = RestOfBuffer.substr(Pos+TermLen);
3252  Pos = RestOfBuffer.find(Terminator);
3253  continue;
3254  }
3255  return RestOfBuffer.data()+Pos;
3256  }
3257  return nullptr;
3258 }
3259 
3260 /// IsStartOfConflictMarker - If the specified pointer is the start of a version
3261 /// control conflict marker like '<<<<<<<', recognize it as such, emit an error
3262 /// and recover nicely. This returns true if it is a conflict marker and false
3263 /// if not.
///
/// On success this records the marker kind in CurrentConflictMarkerState and
/// moves BufferPtr to the end of the marker line.
3264 bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
3265  // Only a conflict marker if it starts at the beginning of a line.
3266  if (CurPtr != BufferStart &&
3267  CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
3268  return false;
3269 
3270  // Check to see if we have <<<<<<< or >>>>.
      // The second form requires a trailing space after the four characters.
3271  if (!StringRef(CurPtr, BufferEnd - CurPtr).starts_with("<<<<<<<") &&
3272  !StringRef(CurPtr, BufferEnd - CurPtr).starts_with(">>>> "))
3273  return false;
3274 
3275  // If we have a situation where we don't care about conflict markers, ignore
3276  // it.
      // That is: already inside a conflict region, or lexing in raw mode.
3277  if (CurrentConflictMarkerState || isLexingRawMode())
3278  return false;
3279 
      // The first character tells the two accepted spellings apart.
3280  ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce;
3281 
3282  // Check to see if there is an ending marker somewhere in the buffer at the
3283  // start of a line to terminate this conflict marker.
3284  if (FindConflictEnd(CurPtr, BufferEnd, Kind)) {
3285  // We found a match. We are really in a conflict marker.
3286  // Diagnose this, and ignore to the end of line.
3287  Diag(CurPtr, diag::err_conflict_marker);
3288  CurrentConflictMarkerState = Kind;
3289 
3290  // Skip ahead to the end of line. We know this exists because the
3291  // end-of-conflict marker starts with \r or \n.
3292  while (*CurPtr != '\r' && *CurPtr != '\n') {
3293  assert(CurPtr != BufferEnd && "Didn't find end of line");
3294  ++CurPtr;
3295  }
3296  BufferPtr = CurPtr;
3297  return true;
3298  }
3299 
3300  // No end of conflict marker found.
3301  return false;
3302 }
3303 
3304 /// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if
3305 /// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it
3306 /// is the end of a conflict marker. Handle it by ignoring up until the end of
3307 /// the line. This returns true if it is a conflict marker and false if not.
///
/// On success this moves BufferPtr to the end of the terminator line and
/// resets CurrentConflictMarkerState to CMK_None.
3308 bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) {
3309  // Only a conflict marker if it starts at the beginning of a line.
3310  if (CurPtr != BufferStart &&
3311  CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
3312  return false;
3313 
3314  // If we have a situation where we don't care about conflict markers, ignore
3315  // it.
      // That is: not currently inside a conflict region, or lexing in raw mode.
3316  if (!CurrentConflictMarkerState || isLexingRawMode())
3317  return false;
3318 
3319  // Check to see if we have the marker (4 characters in a row).
      // Only repetition is checked here; the character itself is not compared
      // against a fixed set in this function.
3320  for (unsigned i = 1; i != 4; ++i)
3321  if (CurPtr[i] != CurPtr[0])
3322  return false;
3323 
3324  // If we do have it, search for the end of the conflict marker. This could
3325  // fail if it got skipped with a '#if 0' or something. Note that CurPtr might
3326  // be the end of conflict marker.
3327  if (const char *End = FindConflictEnd(CurPtr, BufferEnd,
3328  CurrentConflictMarkerState)) {
3329  CurPtr = End;
3330 
3331  // Skip ahead to the end of line.
3332  while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n')
3333  ++CurPtr;
3334 
3335  BufferPtr = CurPtr;
3336 
3337  // No longer in the conflict marker.
3338  CurrentConflictMarkerState = CMK_None;
3339  return true;
3340  }
3341 
3342  return false;
3343 }
3344 
/// Search [CurPtr, BufferEnd) for the two-character sequence "#>".
///
/// \returns Pointer just past the first "#>" found, or nullptr if the range is
///          empty or holds no such sequence.
3345 static const char *findPlaceholderEnd(const char *CurPtr,
3346  const char *BufferEnd) {
3347  if (CurPtr == BufferEnd)
3348  return nullptr;
      // Stopping one character early keeps the CurPtr[1] read below inside the
      // original range.
3349  BufferEnd -= 1; // Scan until the second last character.
3350  for (; CurPtr != BufferEnd; ++CurPtr) {
3351  if (CurPtr[0] == '#' && CurPtr[1] == '>')
3352  return CurPtr + 2;
3353  }
3354  return nullptr;
3355 }
3356 
/// Try to lex an editor placeholder whose opening "<#" has just been seen.
///
/// \param Result Token that is formed as a raw identifier and flagged as an
///        editor placeholder on success.
/// \param CurPtr Points at the '#'; the preceding character is '<' (asserted).
/// \returns true if a placeholder token was produced, false otherwise.
3357 bool Lexer::lexEditorPlaceholder(Token &Result, const char *CurPtr) {
3358  assert(CurPtr[-1] == '<' && CurPtr[0] == '#' && "Not a placeholder!");
      // NOTE(review): the listing jumps from 3358 to 3360, so the condition that
      // guards this early return is not visible here; as shown the return is
      // unconditional. Confirm against the upstream file.
3360  return false;
      // Look for the closing "#>" after the opening '#'.
3361  const char *End = findPlaceholderEnd(CurPtr + 1, BufferEnd);
3362  if (!End)
3363  return false;
      // Start is the '<' that opens the placeholder.
3364  const char *Start = CurPtr - 1;
3365  if (!LangOpts.AllowEditorPlaceholders)
3366  Diag(Start, diag::err_placeholder_in_source);
3367  Result.startToken();
3368  FormTokenWithChars(Result, End, tok::raw_identifier);
3369  Result.setRawIdentifierData(Start);
3370  PP->LookUpIdentifierInfo(Result);
3371  Result.setFlag(Token::IsEditorPlaceholder);
3372  BufferPtr = End;
3373  return true;
3374 }
3375 
/// Return true if \p CurPtr is at the preprocessor's code-completion location.
///
/// Always false when there is no preprocessor or code completion is not
/// enabled.
3376 bool Lexer::isCodeCompletionPoint(const char *CurPtr) const {
3377  if (PP && PP->isCodeCompletionEnabled()) {
      // Turn the buffer offset of CurPtr into a location relative to FileLoc.
3378  SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart);
3379  return Loc == PP->getCodeCompletionLoc();
3380  }
3381 
3382  return false;
3383 }
3384 
/// Try to read a numeric universal character name: a 'u' followed by 4 hex
/// digits, a 'U' followed by 8 hex digits, or a brace-delimited run of hex
/// digits.
///
/// \param StartPtr On entry, points at the 'u' or 'U' (asserted). Advanced past
///        the escape only when a code point is returned.
/// \param SlashLoc Location used for most diagnostics; presumably the
///        backslash that introduces the escape (confirm against callers).
/// \param Result If non-null, receives Token::HasUCN; diagnostics are emitted
///        only when this is non-null and the lexer is not in raw mode, apart
///        from the case noted near the end of the function.
/// \returns The code point, or std::nullopt if the escape is malformed or not
///          accepted.
3385 std::optional<uint32_t> Lexer::tryReadNumericUCN(const char *&StartPtr,
3386  const char *SlashLoc,
3387  Token *Result) {
3388  unsigned CharSize;
3389  char Kind = getCharAndSize(StartPtr, CharSize);
3390  assert((Kind == 'u' || Kind == 'U') && "expected a UCN");
3391 
      // NumHexDigits is assigned only for the two kinds asserted above.
3392  unsigned NumHexDigits;
3393  if (Kind == 'u')
3394  NumHexDigits = 4;
3395  else if (Kind == 'U')
3396  NumHexDigits = 8;
3397 
3398  bool Delimited = false;
3399  bool FoundEndDelimiter = false;
3400  unsigned Count = 0;
3401  bool Diagnose = Result && !isLexingRawMode();
3402 
      // Reject the escape when neither C++ nor C99 is enabled.
3403  if (!LangOpts.CPlusPlus && !LangOpts.C99) {
3404  if (Diagnose)
3405  Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89);
3406  return std::nullopt;
3407  }
3408 
3409  const char *CurPtr = StartPtr + CharSize;
      // KindLoc is the last byte consumed for the kind letter.
3410  const char *KindLoc = &CurPtr[-1];
3411 
3412  uint32_t CodePoint = 0;
      // The fixed form stops after NumHexDigits digits; the delimited form runs
      // until the closing brace or an error.
3413  while (Count != NumHexDigits || Delimited) {
3414  char C = getCharAndSize(CurPtr, CharSize);
      // An opening brace directly after the kind letter selects the delimited
      // form.
3415  if (!Delimited && Count == 0 && C == '{') {
3416  Delimited = true;
3417  CurPtr += CharSize;
3418  continue;
3419  }
3420 
3421  if (Delimited && C == '}') {
3422  CurPtr += CharSize;
3423  FoundEndDelimiter = true;
3424  break;
3425  }
3426 
3427  unsigned Value = llvm::hexDigitValue(C);
      // -1U is treated as "not a hex digit". In the fixed form this just ends
      // the digit run; in the delimited form it makes the escape incomplete.
3428  if (Value == -1U) {
3429  if (!Delimited)
3430  break;
3431  if (Diagnose)
3432  Diag(SlashLoc, diag::warn_delimited_ucn_incomplete)
3433  << StringRef(KindLoc, 1);
3434  return std::nullopt;
3435  }
3436 
      // If the top four bits are already in use, the shift below would drop
      // them from the 32-bit value.
3437  if (CodePoint & 0xF000'0000) {
3438  if (Diagnose)
3439  Diag(KindLoc, diag::err_escape_too_large) << 0;
3440  return std::nullopt;
3441  }
3442 
3443  CodePoint <<= 4;
3444  CodePoint |= Value;
3445  CurPtr += CharSize;
3446  Count++;
3447  }
3448 
3449  if (Count == 0) {
3450  if (Diagnose)
3451  Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
3452  : diag::warn_ucn_escape_no_digits)
3453  << StringRef(KindLoc, 1);
3454  return std::nullopt;
3455  }
3456 
      // The delimited form is rejected for the 'U' kind.
3457  if (Delimited && Kind == 'U') {
3458  if (Diagnose)
3459  Diag(SlashLoc, diag::err_hex_escape_no_digits) << StringRef(KindLoc, 1);
3460  return std::nullopt;
3461  }
3462 
3463  if (!Delimited && Count != NumHexDigits) {
3464  if (Diagnose) {
3465  Diag(SlashLoc, diag::warn_ucn_escape_incomplete);
3466  // If the user wrote \U1234, suggest a fixit to \u.
3467  if (Count == 4 && NumHexDigits == 8) {
3468  CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1);
3469  Diag(KindLoc, diag::note_ucn_four_not_eight)
3470  << FixItHint::CreateReplacement(URange, "u");
3471  }
3472  }
3473  return std::nullopt;
3474  }
3475 
      // NOTE(review): unlike the other diagnostics in this function, this one is
      // gated on PP rather than on Diagnose. Confirm that is intended.
3476  if (Delimited && PP) {
3477  Diag(SlashLoc, PP->getLangOpts().CPlusPlus23
3478  ? diag::warn_cxx23_delimited_escape_sequence
3479  : diag::ext_delimited_escape_sequence)
3480  << /*delimited*/ 0 << (PP->getLangOpts().CPlusPlus ? 1 : 0);
3481  }
3482 
3483  if (Result) {
3484  Result->setFlag(Token::HasUCN);
3485  // If the UCN contains either a trigraph or a line splicing,
3486  // we need to call getAndAdvanceChar again to set the appropriate flags
3487  // on Result.
      // The expected plain length is the kind letter plus the digits, plus two
      // for the braces in the delimited form.
3488  if (CurPtr - StartPtr == (ptrdiff_t)(Count + 1 + (Delimited ? 2 : 0)))
3489  StartPtr = CurPtr;
3490  else
3491  while (StartPtr != CurPtr)
3492  (void)getAndAdvanceChar(StartPtr, *Result);
3493  } else {
3494  StartPtr = CurPtr;
3495  }
3496  return CodePoint;
3497 }
3498 
/// Try to read a named universal character escape: an 'N' followed by a
/// brace-delimited character name.
///
/// \param StartPtr On entry, points at the 'N' (asserted). Advanced past the
///        escape once the name has been read and looked up.
/// \param SlashLoc Location used for several diagnostics; presumably the
///        backslash that introduces the escape (confirm against callers).
/// \param Result If non-null, receives Token::HasUCN; diagnostics are emitted
///        only when this is non-null and the lexer is not in raw mode.
/// \returns The matched code point, or std::nullopt if the escape is malformed
///          or the name is not accepted.
3499 std::optional<uint32_t> Lexer::tryReadNamedUCN(const char *&StartPtr,
3500  const char *SlashLoc,
3501  Token *Result) {
3502  unsigned CharSize;
3503  bool Diagnose = Result && !isLexingRawMode();
3504 
3505  char C = getCharAndSize(StartPtr, CharSize);
3506  assert(C == 'N' && "expected \\N{...}");
3507 
3508  const char *CurPtr = StartPtr + CharSize;
3509  const char *KindLoc = &CurPtr[-1];
3510 
3511  C = getCharAndSize(CurPtr, CharSize);
3512  if (C != '{') {
3513  if (Diagnose)
3514  Diag(SlashLoc, diag::warn_ucn_escape_incomplete);
3515  return std::nullopt;
3516  }
3517  CurPtr += CharSize;
3518  const char *StartName = CurPtr;
3519  bool FoundEndDelimiter = false;
      // NOTE(review): the listing jumps from 3519 to 3521, so the declaration of
      // `Buffer` used below is not visible here. Confirm its type against the
      // upstream file.
      // Collect the name up to the closing brace. A NUL or a vertical
      // whitespace character ends the scan without a closing delimiter.
3521  while (C) {
3522  C = getCharAndSize(CurPtr, CharSize);
3523  CurPtr += CharSize;
3524  if (C == '}') {
3525  FoundEndDelimiter = true;
3526  break;
3527  }
3528 
3529  if (isVerticalWhitespace(C))
3530  break;
3531  Buffer.push_back(C);
3532  }
3533 
3534  if (!FoundEndDelimiter || Buffer.empty()) {
3535  if (Diagnose)
3536  Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
3537  : diag::warn_delimited_ucn_incomplete)
3538  << StringRef(KindLoc, 1);
3539  return std::nullopt;
3540  }
3541 
      // Try the strict lookup first; the loose lookup is only consulted when it
      // fails.
3542  StringRef Name(Buffer.data(), Buffer.size());
3543  std::optional<char32_t> Match =
3544  llvm::sys::unicode::nameToCodepointStrict(Name);
3545  std::optional<llvm::sys::unicode::LooseMatchingResult> LooseMatch;
3546  if (!Match) {
3547  LooseMatch = llvm::sys::unicode::nameToCodepointLooseMatching(Name);
3548  if (Diagnose) {
      // CurPtr - CharSize is the closing brace, so the range covers the name.
3549  Diag(StartName, diag::err_invalid_ucn_name)
3550  << StringRef(Buffer.data(), Buffer.size())
3551  << makeCharRange(*this, StartName, CurPtr - CharSize);
3552  if (LooseMatch) {
3553  Diag(StartName, diag::note_invalid_ucn_name_loose_matching)
      // NOTE(review): listing line 3554 is missing, so the expression that
      // takes the two arguments below is not visible here. Confirm against
      // the upstream file.
3555  makeCharRange(*this, StartName, CurPtr - CharSize),
3556  LooseMatch->Name);
3557  }
3558  }
3559  // We do not offer misspelled character names suggestions here
3560  // as the set of what would be a valid suggestion depends on context,
3561  // and we should not make invalid suggestions.
3562  }
3563 
3564  if (Diagnose && Match)
3565  Diag(SlashLoc, PP->getLangOpts().CPlusPlus23
3566  ? diag::warn_cxx23_delimited_escape_sequence
3567  : diag::ext_delimited_escape_sequence)
3568  << /*named*/ 1 << (PP->getLangOpts().CPlusPlus ? 1 : 0);
3569 
3570  // If no diagnostic has been emitted yet, likely because we are doing a
3571  // tentative lexing, we do not want to recover here to make sure the token
3572  // will not be incorrectly considered valid. This function will be called
3573  // again and a diagnostic emitted then.
3574  if (LooseMatch && Diagnose)
3575  Match = LooseMatch->CodePoint;
3576 
3577  if (Result) {
3578  Result->setFlag(Token::HasUCN);
3579  // If the UCN contains either a trigraph or a line splicing,
3580  // we need to call getAndAdvanceChar again to set the appropriate flags
3581  // on Result.
      // The expected plain length is the name plus three: the 'N' and the two
      // braces.
3582  if (CurPtr - StartPtr == (ptrdiff_t)(Buffer.size() + 3))
3583  StartPtr = CurPtr;
3584  else
3585  while (StartPtr != CurPtr)
3586  (void)getAndAdvanceChar(StartPtr, *Result);
3587  } else {
3588  StartPtr = CurPtr;
3589  }
3590  return Match ? std::optional<uint32_t>(*Match) : std::nullopt;
3591 }
3592 
3593 uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
3594  Token *Result) {
3595 
3596  unsigned CharSize;
3597  std::optional<uint32_t> CodePointOpt;
3598  char Kind = getCharAndSize(StartPtr, CharSize);
3599  if (Kind == 'u' || Kind == 'U')
3600  CodePointOpt = tryReadNumericUCN(StartPtr, SlashLoc, Result);
3601  else if (Kind == 'N')
3602  CodePointOpt = tryReadNamedUCN(StartPtr, SlashLoc, Result);
3603 
3604  if (!CodePointOpt)
3605  return 0;
3606 
3607  uint32_t CodePoint = *CodePointOpt;
3608 
3609  // Don't apply C family restrictions to UCNs in assembly mode
3610  if (LangOpts.AsmPreprocessor)
3611  return CodePoint;
3612 
3613  // C23 6.4.3p2: A universal character name shall not designate a code point
3614  // where the hexadecimal value is:
3615  // - in the range D800 through DFFF inclusive; or
3616  // - greater than 10FFFF.
3617  // A universal-character-name outside the c-char-sequence of a character
3618  // constant, or the s-char-sequence of a string-literal shall not designate
3619  // a control character or a character in the basic character set.
3620 
3621  // C++11 [lex.charset]p2: If the hexadecimal value for a
3622  // universal-character-name corresponds to a surrogate code point (in the
3623  // range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally,
3624  // if the hexadecimal value for a universal-character-name outside the
3625  // c-char-sequence, s-char-sequence, or r-char-sequence of a character or
3626  // string literal corresponds to a control character (in either of the
3627  // ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the
3628  // basic source character set, the program is ill-formed.
3629  if (CodePoint < 0xA0) {
3630  // We don't use isLexingRawMode() here because we need to warn about bad
3631  // UCNs even when skipping preprocessing tokens in a #if block.
3632  if (Result && PP) {
3633  if (CodePoint < 0x20 || CodePoint >= 0x7F)
3634  Diag(BufferPtr, diag::err_ucn_control_character);
3635  else {
3636  char C = static_cast<char>(CodePoint);
3637  Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1);
3638  }
3639  }
3640 
3641  return 0;
3642  } else if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF) {
3643  // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't.
3644  // We don't use isLexingRawMode() here because we need to diagnose bad
3645  // UCNs even when skipping preprocessing tokens in a #if block.
3646  if (Result && PP) {
3647  if (LangOpts.CPlusPlus && !LangOpts.CPlusPlus11)
3648  Diag(BufferPtr, diag::warn_ucn_escape_surrogate);
3649  else
3650  Diag(BufferPtr, diag::err_ucn_escape_invalid);
3651  }
3652  return 0;
3653  }
3654 
3655  return CodePoint;
3656 }
3657 
3658 bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C,
3659  const char *CurPtr) {
3660  if (!isLexingRawMode() && !PP->isPreprocessedOutput() &&
3661  isUnicodeWhitespace(C)) {
3662  Diag(BufferPtr, diag::ext_unicode_whitespace)
3663  << makeCharRange(*this, BufferPtr, CurPtr);
3664 
3665  Result.setFlag(Token::LeadingSpace);
3666  return true;
3667  }
3668  return false;
3669 }
3670 
3671 void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) {
3672  IsAtStartOfLine = Result.isAtStartOfLine();
3673  HasLeadingSpace = Result.hasLeadingSpace();
3674  HasLeadingEmptyMacro = Result.hasLeadingEmptyMacro();
3675  // Note that this doesn't affect IsAtPhysicalStartOfLine.
3676 }
3677 
3678 bool Lexer::Lex(Token &Result) {
3679  assert(!isDependencyDirectivesLexer());
3680 
3681  // Start a new token.
3682  Result.startToken();
3683 
3684  // Set up misc whitespace flags for LexTokenInternal.
3685  if (IsAtStartOfLine) {
3686  Result.setFlag(Token::StartOfLine);
3687  IsAtStartOfLine = false;
3688  }
3689 
3690  if (HasLeadingSpace) {
3691  Result.setFlag(Token::LeadingSpace);
3692  HasLeadingSpace = false;
3693  }
3694 
3695  if (HasLeadingEmptyMacro) {
3696  Result.setFlag(Token::LeadingEmptyMacro);
3697  HasLeadingEmptyMacro = false;
3698  }
3699 
3700  bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
3701  IsAtPhysicalStartOfLine = false;
3702  bool isRawLex = isLexingRawMode();
3703  (void) isRawLex;
3704  bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine);
3705  // (After the LexTokenInternal call, the lexer might be destroyed.)
3706  assert((returnedToken || !isRawLex) && "Raw lex must succeed");
3707  return returnedToken;
3708 }
3709 
3710 /// LexTokenInternal - This implements a simple C family lexer. It is an
3711 /// extremely performance critical piece of code. This assumes that the buffer
3712 /// has a null character at the end of the file. This returns a preprocessing
3713 /// token, not a normal token, as such, it is an internal interface. It assumes
3714 /// that the Flags of result have been cleared before calling this.
3715 bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) {
3716 LexStart:
3717  assert(!Result.needsCleaning() && "Result needs cleaning");
3718  assert(!Result.hasPtrData() && "Result has not been reset");
3719 
3720  // CurPtr - Cache BufferPtr in an automatic variable.
3721  const char *CurPtr = BufferPtr;
3722 
3723  // Small amounts of horizontal whitespace is very common between tokens.
3724  if (isHorizontalWhitespace(*CurPtr)) {
3725  do {
3726  ++CurPtr;
3727  } while (isHorizontalWhitespace(*CurPtr));
3728 
3729  // If we are keeping whitespace and other tokens, just return what we just
3730  // skipped. The next lexer invocation will return the token after the
3731  // whitespace.
3732  if (isKeepWhitespaceMode()) {
3733  FormTokenWithChars(Result, CurPtr, tok::unknown);
3734  // FIXME: The next token will not have LeadingSpace set.
3735  return true;
3736  }
3737 
3738  BufferPtr = CurPtr;
3739  Result.setFlag(Token::LeadingSpace);
3740  }
3741 
3742  unsigned SizeTmp, SizeTmp2; // Temporaries for use in cases below.
3743 
3744  // Read a character, advancing over it.
3745  char Char = getAndAdvanceChar(CurPtr, Result);
3747 
3748  if (!isVerticalWhitespace(Char))
3749  NewLinePtr = nullptr;
3750 
3751  switch (Char) {
3752  case 0: // Null.
3753  // Found end of file?
3754  if (CurPtr-1 == BufferEnd)
3755  return LexEndOfFile(Result, CurPtr-1);
3756 
3757  // Check if we are performing code completion.
3758  if (isCodeCompletionPoint(CurPtr-1)) {
3759  // Return the code-completion token.
3760  Result.startToken();
3761  FormTokenWithChars(Result, CurPtr, tok::code_completion);
3762  return true;
3763  }
3764 
3765  if (!isLexingRawMode())
3766  Diag(CurPtr-1, diag::null_in_file);
3767  Result.setFlag(Token::LeadingSpace);
3768  if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3769  return true; // KeepWhitespaceMode
3770 
3771  // We know the lexer hasn't changed, so just try again with this lexer.
3772  // (We manually eliminate the tail call to avoid recursion.)
3773  goto LexNextToken;
3774 
3775  case 26: // DOS & CP/M EOF: "^Z".
3776  // If we're in Microsoft extensions mode, treat this as end of file.
3777  if (LangOpts.MicrosoftExt) {
3778  if (!isLexingRawMode())
3779  Diag(CurPtr-1, diag::ext_ctrl_z_eof_microsoft);
3780  return LexEndOfFile(Result, CurPtr-1);
3781  }
3782 
3783  // If Microsoft extensions are disabled, this is just random garbage.
3784  Kind = tok::unknown;
3785  break;
3786 
3787  case '\r':
3788  if (CurPtr[0] == '\n')
3789  (void)getAndAdvanceChar(CurPtr, Result);
3790  [[fallthrough]];
3791  case '\n':
3792  // If we are inside a preprocessor directive and we see the end of line,
3793  // we know we are done with the directive, so return an EOD token.
3795  // Done parsing the "line".
3797 
3798  // Restore comment saving mode, in case it was disabled for directive.
3799  if (PP)
3801 
3802  // Since we consumed a newline, we are back at the start of a line.
3803  IsAtStartOfLine = true;
3804  IsAtPhysicalStartOfLine = true;
3805  NewLinePtr = CurPtr - 1;
3806 
3807  Kind = tok::eod;
3808  break;
3809  }
3810 
3811  // No leading whitespace seen so far.
3812  Result.clearFlag(Token::LeadingSpace);
3813 
3814  if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3815  return true; // KeepWhitespaceMode
3816 
3817  // We only saw whitespace, so just try again with this lexer.
3818  // (We manually eliminate the tail call to avoid recursion.)
3819  goto LexNextToken;
3820  case ' ':
3821  case '\t':
3822  case '\f':
3823  case '\v':
3824  SkipHorizontalWhitespace:
3825  Result.setFlag(Token::LeadingSpace);
3826  if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3827  return true; // KeepWhitespaceMode
3828 
3829  SkipIgnoredUnits:
3830  CurPtr = BufferPtr;
3831 
3832  // If the next token is obviously a // or /* */ comment, skip it efficiently
3833  // too (without going through the big switch stmt).
3834  if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() &&
3835  LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP)) {
3836  if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
3837  return true; // There is a token to return.
3838  goto SkipIgnoredUnits;
3839  } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) {
3840  if (SkipBlockComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
3841  return true; // There is a token to return.
3842  goto SkipIgnoredUnits;
3843  } else if (isHorizontalWhitespace(*CurPtr)) {
3844  goto SkipHorizontalWhitespace;
3845  }
3846  // We only saw whitespace, so just try again with this lexer.
3847  // (We manually eliminate the tail call to avoid recursion.)
3848  goto LexNextToken;
3849 
3850  // C99 6.4.4.1: Integer Constants.
3851  // C99 6.4.4.2: Floating Constants.
3852  case '0': case '1': case '2': case '3': case '4':
3853  case '5': case '6': case '7': case '8': case '9':
3854  // Notify MIOpt that we read a non-whitespace/non-comment token.
3855  MIOpt.ReadToken();
3856  return LexNumericConstant(Result, CurPtr);
3857 
3858  // Identifier (e.g., uber), or
3859  // UTF-8 (C23/C++17) or UTF-16 (C11/C++11) character literal, or
3860  // UTF-8 or UTF-16 string literal (C11/C++11).
3861  case 'u':
3862  // Notify MIOpt that we read a non-whitespace/non-comment token.
3863  MIOpt.ReadToken();
3864 
3865  if (LangOpts.CPlusPlus11 || LangOpts.C11) {
3866  Char = getCharAndSize(CurPtr, SizeTmp);
3867 
3868  // UTF-16 string literal
3869  if (Char == '"')
3870  return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3871  tok::utf16_string_literal);
3872 
3873  // UTF-16 character constant
3874  if (Char == '\'')
3875  return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3876  tok::utf16_char_constant);
3877 
3878  // UTF-16 raw string literal
3879  if (Char == 'R' && LangOpts.CPlusPlus11 &&
3880  getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
3881  return LexRawStringLiteral(Result,
3882  ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3883  SizeTmp2, Result),
3884  tok::utf16_string_literal);
3885 
3886  if (Char == '8') {
3887  char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2);
3888 
3889  // UTF-8 string literal
3890  if (Char2 == '"')
3891  return LexStringLiteral(Result,
3892  ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3893  SizeTmp2, Result),
3894  tok::utf8_string_literal);
3895  if (Char2 == '\'' && (LangOpts.CPlusPlus17 || LangOpts.C23))
3896  return LexCharConstant(
3897  Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3898  SizeTmp2, Result),
3899  tok::utf8_char_constant);
3900 
3901  if (Char2 == 'R' && LangOpts.CPlusPlus11) {
3902  unsigned SizeTmp3;
3903  char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
3904  // UTF-8 raw string literal
3905  if (Char3 == '"') {
3906  return LexRawStringLiteral(Result,
3907  ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3908  SizeTmp2, Result),
3909  SizeTmp3, Result),
3910  tok::utf8_string_literal);
3911  }
3912  }
3913  }
3914  }
3915 
3916  // treat u like the start of an identifier.
3917  return LexIdentifierContinue(Result, CurPtr);
3918 
3919  case 'U': // Identifier (e.g. Uber) or C11/C++11 UTF-32 string literal
3920  // Notify MIOpt that we read a non-whitespace/non-comment token.
3921  MIOpt.ReadToken();
3922 
3923  if (LangOpts.CPlusPlus11 || LangOpts.C11) {
3924  Char = getCharAndSize(CurPtr, SizeTmp);
3925 
3926  // UTF-32 string literal
3927  if (Char == '"')
3928  return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3929  tok::utf32_string_literal);
3930 
3931  // UTF-32 character constant
3932  if (Char == '\'')
3933  return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3934  tok::utf32_char_constant);
3935 
3936  // UTF-32 raw string literal
3937  if (Char == 'R' && LangOpts.CPlusPlus11 &&
3938  getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
3939  return LexRawStringLiteral(Result,
3940  ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3941  SizeTmp2, Result),
3942  tok::utf32_string_literal);
3943  }
3944 
3945  // treat U like the start of an identifier.
3946  return LexIdentifierContinue(Result, CurPtr);
3947 
3948  case 'R': // Identifier or C++0x raw string literal
3949  // Notify MIOpt that we read a non-whitespace/non-comment token.
3950  MIOpt.ReadToken();
3951 
3952  if (LangOpts.CPlusPlus11) {
3953  Char = getCharAndSize(CurPtr, SizeTmp);
3954 
3955  if (Char == '"')
3956  return LexRawStringLiteral(Result,
3957  ConsumeChar(CurPtr, SizeTmp, Result),
3958  tok::string_literal);
3959  }
3960 
3961  // treat R like the start of an identifier.
3962  return LexIdentifierContinue(Result, CurPtr);
3963 
3964  case 'L': // Identifier (Loony) or wide literal (L'x' or L"xyz").
3965  // Notify MIOpt that we read a non-whitespace/non-comment token.
3966  MIOpt.ReadToken();
3967  Char = getCharAndSize(CurPtr, SizeTmp);
3968 
3969  // Wide string literal.
3970  if (Char == '"')
3971  return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3972  tok::wide_string_literal);
3973 
3974  // Wide raw string literal.
3975  if (LangOpts.CPlusPlus11 && Char == 'R' &&
3976  getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
3977  return LexRawStringLiteral(Result,
3978  ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3979  SizeTmp2, Result),
3980  tok::wide_string_literal);
3981 
3982  // Wide character constant.
3983  if (Char == '\'')
3984  return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3985  tok::wide_char_constant);
3986  // FALL THROUGH, treating L like the start of an identifier.
3987  [[fallthrough]];
3988 
3989  // C99 6.4.2: Identifiers.
3990  case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
3991  case 'H': case 'I': case 'J': case 'K': /*'L'*/case 'M': case 'N':
3992  case 'O': case 'P': case 'Q': /*'R'*/case 'S': case 'T': /*'U'*/
3993  case 'V': case 'W': case 'X': case 'Y': case 'Z':
3994  case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
3995  case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
3996  case 'o': case 'p': case 'q': case 'r': case 's': case 't': /*'u'*/
3997  case 'v': case 'w': case 'x': case 'y': case 'z':
3998  case '_':
3999  // Notify MIOpt that we read a non-whitespace/non-comment token.
4000  MIOpt.ReadToken();
4001  return LexIdentifierContinue(Result, CurPtr);
4002 
4003  case '$': // $ in identifiers.
4004  if (LangOpts.DollarIdents) {
4005  if (!isLexingRawMode())
4006  Diag(CurPtr-1, diag::ext_dollar_in_identifier);
4007  // Notify MIOpt that we read a non-whitespace/non-comment token.
4008  MIOpt.ReadToken();
4009  return LexIdentifierContinue(Result, CurPtr);
4010  }
4011 
4012  Kind = tok::unknown;
4013  break;
4014 
4015  // C99 6.4.4: Character Constants.
4016  case '\'':
4017  // Notify MIOpt that we read a non-whitespace/non-comment token.
4018  MIOpt.ReadToken();
4019  return LexCharConstant(Result, CurPtr, tok::char_constant);
4020 
4021  // C99 6.4.5: String Literals.
4022  case '"':
4023  // Notify MIOpt that we read a non-whitespace/non-comment token.
4024  MIOpt.ReadToken();
4025  return LexStringLiteral(Result, CurPtr,
4026  ParsingFilename ? tok::header_name
4027  : tok::string_literal);
4028 
4029  // C99 6.4.6: Punctuators.
4030  case '?':
4031  Kind = tok::question;
4032  break;
4033  case '[':
4034  Kind = tok::l_square;
4035  break;
4036  case ']':
4037  Kind = tok::r_square;
4038  break;
4039  case '(':
4040  Kind = tok::l_paren;
4041  break;
4042  case ')':
4043  Kind = tok::r_paren;
4044  break;
4045  case '{':
4046  Kind = tok::l_brace;
4047  break;
4048  case '}':
4049  Kind = tok::r_brace;
4050  break;
4051  case '.':
4052  Char = getCharAndSize(CurPtr, SizeTmp);
4053  if (Char >= '0' && Char <= '9') {
4054  // Notify MIOpt that we read a non-whitespace/non-comment token.
4055  MIOpt.ReadToken();
4056 
4057  return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
4058  } else if (LangOpts.CPlusPlus && Char == '*') {
4059  Kind = tok::periodstar;
4060  CurPtr += SizeTmp;
4061  } else if (Char == '.' &&
4062  getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') {
4063  Kind = tok::ellipsis;
4064  CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4065  SizeTmp2, Result);
4066  } else {
4067  Kind = tok::period;
4068  }
4069  break;
4070  case '&':
4071  Char = getCharAndSize(CurPtr, SizeTmp);
4072  if (Char == '&') {
4073  Kind = tok::ampamp;
4074  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4075  } else if (Char == '=') {
4076  Kind = tok::ampequal;
4077  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4078  } else {
4079  Kind = tok::amp;
4080  }
4081  break;
4082  case '*':
4083  if (getCharAndSize(CurPtr, SizeTmp) == '=') {
4084  Kind = tok::starequal;
4085  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4086  } else {
4087  Kind = tok::star;
4088  }
4089  break;
4090  case '+':
4091  Char = getCharAndSize(CurPtr, SizeTmp);
4092  if (Char == '+') {
4093  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4094  Kind = tok::plusplus;
4095  } else if (Char == '=') {
4096  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4097  Kind = tok::plusequal;
4098  } else {
4099  Kind = tok::plus;
4100  }
4101  break;
4102  case '-':
4103  Char = getCharAndSize(CurPtr, SizeTmp);
4104  if (Char == '-') { // --
4105  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4106  Kind = tok::minusminus;
4107  } else if (Char == '>' && LangOpts.CPlusPlus &&
4108  getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') { // C++ ->*
4109  CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4110  SizeTmp2, Result);
4111  Kind = tok::arrowstar;
4112  } else if (Char == '>') { // ->
4113  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4114  Kind = tok::arrow;
4115  } else if (Char == '=') { // -=
4116  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4117  Kind = tok::minusequal;
4118  } else {
4119  Kind = tok::minus;
4120  }
4121  break;
4122  case '~':
4123  Kind = tok::tilde;
4124  break;
4125  case '!':
4126  if (getCharAndSize(CurPtr, SizeTmp) == '=') {
4127  Kind = tok::exclaimequal;
4128  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4129  } else {
4130  Kind = tok::exclaim;
4131  }
4132  break;
4133  case '/':
4134  // 6.4.9: Comments
4135  Char = getCharAndSize(CurPtr, SizeTmp);
4136  if (Char == '/') { // Line comment.
4137  // Even if Line comments are disabled (e.g. in C89 mode), we generally
4138  // want to lex this as a comment. There is one problem with this though,
4139  // that in one particular corner case, this can change the behavior of the
4140  // resultant program. For example, In "foo //**/ bar", C89 would lex
4141  // this as "foo / bar" and languages with Line comments would lex it as
4142  // "foo". Check to see if the character after the second slash is a '*'.
4143  // If so, we will lex that as a "/" instead of the start of a comment.
4144  // However, we never do this if we are just preprocessing.
4145  bool TreatAsComment =
4146  LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP);
4147  if (!TreatAsComment)
4148  if (!(PP && PP->isPreprocessedOutput()))
4149  TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*';
4150 
4151  if (TreatAsComment) {
4152  if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
4153  TokAtPhysicalStartOfLine))
4154  return true; // There is a token to return.
4155 
4156  // It is common for the tokens immediately after a // comment to be
4157  // whitespace (indentation for the next line). Instead of going through
4158  // the big switch, handle it efficiently now.
4159  goto SkipIgnoredUnits;
4160  }
4161  }
4162 
4163  if (Char == '*') { // /**/ comment.
4164  if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
4165  TokAtPhysicalStartOfLine))
4166  return true; // There is a token to return.
4167 
4168  // We only saw whitespace, so just try again with this lexer.
4169  // (We manually eliminate the tail call to avoid recursion.)
4170  goto LexNextToken;
4171  }
4172 
4173  if (Char == '=') {
4174  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4175  Kind = tok::slashequal;
4176  } else {
4177  Kind = tok::slash;
4178  }
4179  break;
4180  case '%':
4181  Char = getCharAndSize(CurPtr, SizeTmp);
4182  if (Char == '=') {
4183  Kind = tok::percentequal;
4184  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4185  } else if (LangOpts.Digraphs && Char == '>') {
4186  Kind = tok::r_brace; // '%>' -> '}'
4187  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4188  } else if (LangOpts.Digraphs && Char == ':') {
4189  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4190  Char = getCharAndSize(CurPtr, SizeTmp);
4191  if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') {
4192  Kind = tok::hashhash; // '%:%:' -> '##'
4193  CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4194  SizeTmp2, Result);
4195  } else if (Char == '@' && LangOpts.MicrosoftExt) {// %:@ -> #@ -> Charize
4196  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4197  if (!isLexingRawMode())
4198  Diag(BufferPtr, diag::ext_charize_microsoft);
4199  Kind = tok::hashat;
4200  } else { // '%:' -> '#'
4201  // We parsed a # character. If this occurs at the start of the line,
4202  // it's actually the start of a preprocessing directive. Callback to
4203  // the preprocessor to handle it.
4204  // TODO: -fpreprocessed mode??
4205  if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
4206  goto HandleDirective;
4207 
4208  Kind = tok::hash;
4209  }
4210  } else {
4211  Kind = tok::percent;
4212  }
4213  break;
4214  case '<':
4215  Char = getCharAndSize(CurPtr, SizeTmp);
4216  if (ParsingFilename) {
4217  return LexAngledStringLiteral(Result, CurPtr);
4218  } else if (Char == '<') {
4219  char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
4220  if (After == '=') {
4221  Kind = tok::lesslessequal;
4222  CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4223  SizeTmp2, Result);
4224  } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) {
4225  // If this is actually a '<<<<<<<' version control conflict marker,
4226  // recognize it as such and recover nicely.
4227  goto LexNextToken;
4228  } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) {
4229  // If this is '<<<<' and we're in a Perforce-style conflict marker,
4230  // ignore it.
4231  goto LexNextToken;
4232  } else if (LangOpts.CUDA && After == '<') {
4233  Kind = tok::lesslessless;
4234  CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4235  SizeTmp2, Result);
4236  } else {
4237  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4238  Kind = tok::lessless;
4239  }
4240  } else if (Char == '=') {
4241  char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
4242  if (After == '>') {
4243  if (LangOpts.CPlusPlus20) {
4244  if (!isLexingRawMode())
4245  Diag(BufferPtr, diag::warn_cxx17_compat_spaceship);
4246  CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4247  SizeTmp2, Result);
4248  Kind = tok::spaceship;
4249  break;
4250  }
4251  // Suggest adding a space between the '<=' and the '>' to avoid a
4252  // change in semantics if this turns up in C++ <=17 mode.
4253  if (LangOpts.CPlusPlus && !isLexingRawMode()) {
4254  Diag(BufferPtr, diag::warn_cxx20_compat_spaceship)
4256  getSourceLocation(CurPtr + SizeTmp, SizeTmp2), " ");
4257  }
4258  }
4259  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4260  Kind = tok::lessequal;
4261  } else if (LangOpts.Digraphs && Char == ':') { // '<:' -> '['
4262  if (LangOpts.CPlusPlus11 &&
4263  getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') {
4264  // C++0x [lex.pptoken]p3:
4265  // Otherwise, if the next three characters are <:: and the subsequent
4266  // character is neither : nor >, the < is treated as a preprocessor
4267  // token by itself and not as the first character of the alternative
4268  // token <:.
4269  unsigned SizeTmp3;
4270  char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
4271  if (After != ':' && After != '>') {
4272  Kind = tok::less;
4273  if (!isLexingRawMode())
4274  Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon);
4275  break;
4276  }
4277  }
4278 
4279  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4280  Kind = tok::l_square;
4281  } else if (LangOpts.Digraphs && Char == '%') { // '<%' -> '{'
4282  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4283  Kind = tok::l_brace;
4284  } else if (Char == '#' && /*Not a trigraph*/ SizeTmp == 1 &&
4285  lexEditorPlaceholder(Result, CurPtr)) {
4286  return true;
4287  } else {
4288  Kind = tok::less;
4289  }
4290  break;
4291  case '>':
4292  Char = getCharAndSize(CurPtr, SizeTmp);
4293  if (Char == '=') {
4294  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4295  Kind = tok::greaterequal;
4296  } else if (Char == '>') {
4297  char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
4298  if (After == '=') {
4299  CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4300  SizeTmp2, Result);
4301  Kind = tok::greatergreaterequal;
4302  } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) {
4303  // If this is actually a '>>>>' conflict marker, recognize it as such
4304  // and recover nicely.
4305  goto LexNextToken;
4306  } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) {
4307  // If this is '>>>>>>>' and we're in a conflict marker, ignore it.
4308  goto LexNextToken;
4309  } else if (LangOpts.CUDA && After == '>') {
4310  Kind = tok::greatergreatergreater;
4311  CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4312  SizeTmp2, Result);
4313  } else {
4314  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4315  Kind = tok::greatergreater;
4316  }
4317  } else {
4318  Kind = tok::greater;
4319  }
4320  break;
4321  case '^':
4322  Char = getCharAndSize(CurPtr, SizeTmp);
4323  if (Char == '=') {
4324  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4325  Kind = tok::caretequal;
4326  } else if (LangOpts.OpenCL && Char == '^') {
4327  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4328  Kind = tok::caretcaret;
4329  } else {
4330  Kind = tok::caret;
4331  }
4332  break;
4333  case '|':
4334  Char = getCharAndSize(CurPtr, SizeTmp);
4335  if (Char == '=') {
4336  Kind = tok::pipeequal;
4337  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4338  } else if (Char == '|') {
4339  // If this is '|||||||' and we're in a conflict marker, ignore it.
4340  if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1))
4341  goto LexNextToken;
4342  Kind = tok::pipepipe;
4343  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4344  } else {
4345  Kind = tok::pipe;
4346  }
4347  break;
4348  case ':':
4349  Char = getCharAndSize(CurPtr, SizeTmp);
4350  if (LangOpts.Digraphs && Char == '>') {
4351  Kind = tok::r_square; // ':>' -> ']'
4352  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4353  } else if (Char == ':') {
4354  Kind = tok::coloncolon;
4355  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4356  } else {
4357  Kind = tok::colon;
4358  }
4359  break;
4360  case ';':
4361  Kind = tok::semi;
4362  break;
4363  case '=':
4364  Char = getCharAndSize(CurPtr, SizeTmp);
4365  if (Char == '=') {
4366  // If this is '====' and we're in a conflict marker, ignore it.
4367  if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1))
4368  goto LexNextToken;
4369 
4370  Kind = tok::equalequal;
4371  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4372  } else {
4373  Kind = tok::equal;
4374  }
4375  break;
4376  case ',':
4377  Kind = tok::comma;
4378  break;
4379  case '#':
4380  Char = getCharAndSize(CurPtr, SizeTmp);
4381  if (Char == '#') {
4382  Kind = tok::hashhash;
4383  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4384  } else if (Char == '@' && LangOpts.MicrosoftExt) { // #@ -> Charize
4385  Kind = tok::hashat;
4386  if (!isLexingRawMode())
4387  Diag(BufferPtr, diag::ext_charize_microsoft);
4388  CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4389  } else {
4390  // We parsed a # character. If this occurs at the start of the line,
4391  // it's actually the start of a preprocessing directive. Callback to
4392  // the preprocessor to handle it.
4393  // TODO: -fpreprocessed mode??
4394  if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
4395  goto HandleDirective;
4396 
4397  Kind = tok::hash;
4398  }
4399  break;
4400 
4401  case '@':
4402  // Objective C support.
4403  if (CurPtr[-1] == '@' && LangOpts.ObjC)
4404  Kind = tok::at;
4405  else
4406  Kind = tok::unknown;
4407  break;
4408 
4409  // UCNs (C99 6.4.3, C++11 [lex.charset]p2)
4410  case '\\':
4411  if (!LangOpts.AsmPreprocessor) {
4412  if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) {
4413  if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
4414  if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
4415  return true; // KeepWhitespaceMode
4416 
4417  // We only saw whitespace, so just try again with this lexer.
4418  // (We manually eliminate the tail call to avoid recursion.)
4419  goto LexNextToken;
4420  }
4421 
4422  return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr);
4423  }
4424  }
4425 
4426  Kind = tok::unknown;
4427  break;
4428 
4429  default: {
4430  if (isASCII(Char)) {
4431  Kind = tok::unknown;
4432  break;
4433  }
4434 
4435  llvm::UTF32 CodePoint;
4436 
4437  // We can't just reset CurPtr to BufferPtr because BufferPtr may point to
4438  // an escaped newline.
4439  --CurPtr;
4440  llvm::ConversionResult Status =
4441  llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr,
4442  (const llvm::UTF8 *)BufferEnd,
4443  &CodePoint,
4444  llvm::strictConversion);
4445  if (Status == llvm::conversionOK) {
4446  if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
4447  if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
4448  return true; // KeepWhitespaceMode
4449 
4450  // We only saw whitespace, so just try again with this lexer.
4451  // (We manually eliminate the tail call to avoid recursion.)
4452  goto LexNextToken;
4453  }
4454  return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr);
4455  }
4456 
4458  PP->isPreprocessedOutput()) {
4459  ++CurPtr;
4460  Kind = tok::unknown;
4461  break;
4462  }
4463 
4464  // Non-ASCII characters tend to creep into source code unintentionally.
4465  // Instead of letting the parser complain about the unknown token,
4466  // just diagnose the invalid UTF-8, then drop the character.
4467  Diag(CurPtr, diag::err_invalid_utf8);
4468 
4469  BufferPtr = CurPtr+1;
4470  // We're pretending the character didn't exist, so just try again with
4471  // this lexer.
4472  // (We manually eliminate the tail call to avoid recursion.)
4473  goto LexNextToken;
4474  }
4475  }
4476 
4477  // Notify MIOpt that we read a non-whitespace/non-comment token.
4478  MIOpt.ReadToken();
4479 
4480  // Update the location of token as well as BufferPtr.
4481  FormTokenWithChars(Result, CurPtr, Kind);
4482  return true;
4483 
4484 HandleDirective:
4485  // We parsed a # character and it's the start of a preprocessing directive.
4486 
4487  FormTokenWithChars(Result, CurPtr, tok::hash);
4488  PP->HandleDirective(Result);
4489 
4491  // With a fatal failure in the module loader, we abort parsing.
4492  return true;
4493 
4494  // We parsed the directive; lex a token with the new state.
4495  return false;
4496 
4497 LexNextToken:
4498  Result.clearFlag(Token::NeedsCleaning);
4499  goto LexStart;
4500 }
4501 
4502 const char *Lexer::convertDependencyDirectiveToken(
4503  const dependency_directives_scan::Token &DDTok, Token &Result) {
4504  const char *TokPtr = BufferStart + DDTok.Offset;
4505  Result.startToken();
4506  Result.setLocation(getSourceLocation(TokPtr));
4507  Result.setKind(DDTok.Kind);
4508  Result.setFlag((Token::TokenFlags)DDTok.Flags);
4509  Result.setLength(DDTok.Length);
4510  BufferPtr = TokPtr + DDTok.Length;
4511  return TokPtr;
4512 }
4513 
4514 bool Lexer::LexDependencyDirectiveToken(Token &Result) {
4515  assert(isDependencyDirectivesLexer());
4516 
4517  using namespace dependency_directives_scan;
4518 
4519  while (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size()) {
4520  if (DepDirectives.front().Kind == pp_eof)
4521  return LexEndOfFile(Result, BufferEnd);
4522  if (DepDirectives.front().Kind == tokens_present_before_eof)
4523  MIOpt.ReadToken();
4524  NextDepDirectiveTokenIndex = 0;
4525  DepDirectives = DepDirectives.drop_front();
4526  }
4527 
4528  const dependency_directives_scan::Token &DDTok =
4529  DepDirectives.front().Tokens[NextDepDirectiveTokenIndex++];
4530  if (NextDepDirectiveTokenIndex > 1 || DDTok.Kind != tok::hash) {
4531  // Read something other than a preprocessor directive hash.
4532  MIOpt.ReadToken();
4533  }
4534 
4535  if (ParsingFilename && DDTok.is(tok::less)) {
4536  BufferPtr = BufferStart + DDTok.Offset;
4537  LexAngledStringLiteral(Result, BufferPtr + 1);
4538  if (Result.isNot(tok::header_name))
4539  return true;
4540  // Advance the index of lexed tokens.
4541  while (true) {
4542  const dependency_directives_scan::Token &NextTok =
4543  DepDirectives.front().Tokens[NextDepDirectiveTokenIndex];
4544  if (BufferStart + NextTok.Offset >= BufferPtr)
4545  break;
4546  ++NextDepDirectiveTokenIndex;
4547  }
4548  return true;
4549  }
4550 
4551  const char *TokPtr = convertDependencyDirectiveToken(DDTok, Result);
4552 
4553  if (Result.is(tok::hash) && Result.isAtStartOfLine()) {
4554  PP->HandleDirective(Result);
4555  return false;
4556  }
4557  if (Result.is(tok::raw_identifier)) {
4558  Result.setRawIdentifierData(TokPtr);
4559  if (!isLexingRawMode()) {
4560  const IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);
4561  if (II->isHandleIdentifierCase())
4562  return PP->HandleIdentifier(Result);
4563  }
4564  return true;
4565  }
4566  if (Result.isLiteral()) {
4567  Result.setLiteralData(TokPtr);
4568  return true;
4569  }
4570  if (Result.is(tok::colon)) {
4571  // Convert consecutive colons to 'tok::coloncolon'.
4572  if (*BufferPtr == ':') {
4573  assert(DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is(
4574  tok::colon));
4575  ++NextDepDirectiveTokenIndex;
4576  Result.setKind(tok::coloncolon);
4577  }
4578  return true;
4579  }
4580  if (Result.is(tok::eod))
4582 
4583  return true;
4584 }
4585 
4586 bool Lexer::LexDependencyDirectiveTokenWhileSkipping(Token &Result) {
4587  assert(isDependencyDirectivesLexer());
4588 
4589  using namespace dependency_directives_scan;
4590 
4591  bool Stop = false;
4592  unsigned NestedIfs = 0;
4593  do {
4594  DepDirectives = DepDirectives.drop_front();
4595  switch (DepDirectives.front().Kind) {
4596  case pp_none:
4597  llvm_unreachable("unexpected 'pp_none'");
4598  case pp_include:
4599  case pp___include_macros:
4600  case pp_define:
4601  case pp_undef:
4602  case pp_import:
4603  case pp_pragma_import:
4604  case pp_pragma_once:
4605  case pp_pragma_push_macro:
4606  case pp_pragma_pop_macro:
4609  case pp_include_next:
4610  case decl_at_import:
4611  case cxx_module_decl:
4612  case cxx_import_decl:
4616  break;
4617  case pp_if:
4618  case pp_ifdef:
4619  case pp_ifndef:
4620  ++NestedIfs;
4621  break;
4622  case pp_elif:
4623  case pp_elifdef:
4624  case pp_elifndef:
4625  case pp_else:
4626  if (!NestedIfs) {
4627  Stop = true;
4628  }
4629  break;
4630  case pp_endif:
4631  if (!NestedIfs) {
4632  Stop = true;
4633  } else {
4634  --NestedIfs;
4635  }
4636  break;
4637  case pp_eof:
4638  NextDepDirectiveTokenIndex = 0;
4639  return LexEndOfFile(Result, BufferEnd);
4640  }
4641  } while (!Stop);
4642 
4643  const dependency_directives_scan::Token &DDTok =
4644  DepDirectives.front().Tokens.front();
4645  assert(DDTok.is(tok::hash));
4646  NextDepDirectiveTokenIndex = 1;
4647 
4648  convertDependencyDirectiveToken(DDTok, Result);
4649  return false;
4650 }
StringRef P
#define SM(sm)
Definition: Cuda.cpp:83
Defines the Diagnostic-related interfaces.
unsigned Offset
Definition: Format.cpp:2978
Defines the clang::IdentifierInfo, clang::IdentifierTable, and clang::Selector interfaces.
Forward-declares and imports various common LLVM datatypes that clang wants to use unqualified.
Defines the clang::LangOptions interface.
static bool isInExpansionTokenRange(const SourceLocation Loc, const SourceManager &SM)
Definition: Lexer.cpp:948
static const char * findBeginningOfLine(StringRef Buffer, unsigned Offset)
Returns the pointer that points to the beginning of line that contains the given offset,...
Definition: Lexer.cpp:544
static bool isMathematicalExtensionID(uint32_t C, const LangOptions &LangOpts, bool IsStart, bool &IsExtension)
Definition: Lexer.cpp:1545
static void diagnoseInvalidUnicodeCodepointInIdentifier(DiagnosticsEngine &Diags, const LangOptions &LangOpts, uint32_t CodePoint, CharSourceRange Range, bool IsFirst)
Definition: Lexer.cpp:1739
static char DecodeTrigraphChar(const char *CP, Lexer *L, bool Trigraphs)
DecodeTrigraphChar - If the specified character is a legal trigraph when prefixed with ?...
Definition: Lexer.cpp:1261
static size_t getSpellingSlow(const Token &Tok, const char *BufPtr, const LangOptions &LangOpts, char *Spelling)
Slow case of getSpelling.
Definition: Lexer.cpp:325
static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C, CharSourceRange Range)
After encountering UTF-8 character C and interpreting it as an identifier character,...
Definition: Lexer.cpp:1664
static SourceLocation getBeginningOfFileToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Definition: Lexer.cpp:561
static const char * findPlaceholderEnd(const char *CurPtr, const char *BufferEnd)
Definition: Lexer.cpp:3345
static void StringifyImpl(T &Str, char Quote)
Definition: Lexer.cpp:285
static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc(Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen)
GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the lexer buffer was all exp...
Definition: Lexer.cpp:1189
static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts, bool &IsExtension)
Definition: Lexer.cpp:1559
static CharSourceRange makeCharRange(Lexer &L, const char *Begin, const char *End)
Definition: Lexer.cpp:1629
static bool isUnicodeWhitespace(uint32_t Codepoint)
Definition: Lexer.cpp:1526
static const char * fastParseASCIIIdentifier(const char *CurPtr, [[maybe_unused]] const char *BufferEnd)
Definition: Lexer.cpp:1905
static void diagnoseExtensionInIdentifier(DiagnosticsEngine &Diags, uint32_t C, CharSourceRange Range)
Definition: Lexer.cpp:1613
static const char * FindConflictEnd(const char *CurPtr, const char *BufferEnd, ConflictMarkerKind CMK)
Find the end of a version control conflict marker.
Definition: Lexer.cpp:3241
static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts)
Definition: Lexer.cpp:920
static llvm::SmallString< 5 > codepointAsHexString(uint32_t C)
Definition: Lexer.cpp:1532
static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, Lexer *L, bool Trigraphs)
isEndOfBlockCommentWithEscapedNewLine - Return true if the specified newline character (either \n or \r) ...
Definition: Lexer.cpp:2757
static char GetTrigraphCharForLetter(char Letter)
GetTrigraphCharForLetter - Given a character that occurs after a ?? pair, return the decoded trigraph...
Definition: Lexer.cpp:1242
static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts, bool &IsExtension)
Definition: Lexer.cpp:1587
static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C, CharSourceRange Range, bool IsFirst)
Definition: Lexer.cpp:1635
Defines the MultipleIncludeOpt interface.
Defines the clang::Preprocessor interface.
SourceRange Range
Definition: SemaObjC.cpp:754
SourceLocation Loc
Definition: SemaObjC.cpp:755
Defines the clang::SourceLocation class and associated facilities.
Defines the SourceManager interface.
Defines the clang::TokenKind enum and support functions.
SourceLocation End
SourceLocation Begin
static const llvm::sys::UnicodeCharRange C11DisallowedInitialIDCharRanges[]
static const llvm::sys::UnicodeCharRange C99DisallowedInitialIDCharRanges[]
static const llvm::sys::UnicodeCharRange UnicodeWhitespaceCharRanges[]
static const llvm::sys::UnicodeCharRange C99AllowedIDCharRanges[]
static const llvm::sys::UnicodeCharRange C11AllowedIDCharRanges[]
static const llvm::sys::UnicodeCharRange MathematicalNotationProfileIDStartRanges[]
static const llvm::sys::UnicodeCharRange MathematicalNotationProfileIDContinueRanges[]
static const llvm::sys::UnicodeCharRange XIDStartRanges[]
static const llvm::sys::UnicodeCharRange XIDContinueRanges[]
__DEVICE__ void * memcpy(void *__a, const void *__b, size_t __c)
__device__ int
__device__ __2f16 float c
__PTRDIFF_TYPE__ ptrdiff_t
static __inline__ int __ATTRS_o_ai vec_any_ge(vector signed char __a, vector signed char __b)
Definition: altivec.h:16260
static __inline__ int __ATTRS_o_ai vec_any_eq(vector signed char __a, vector signed char __b)
Definition: altivec.h:16052
Represents a character-granular source range.
static CharSourceRange getCharRange(SourceRange R)
SourceLocation getEnd() const
SourceLocation getBegin() const
SourceLocation getLocation() const
Definition: DeclBase.h:445
void setLocation(SourceLocation L)
Definition: DeclBase.h:446
A little helper class used to produce diagnostics.
Definition: Diagnostic.h:1277
Concrete class used by the front-end to report problems and issues.
Definition: Diagnostic.h:193
DiagnosticBuilder Report(SourceLocation Loc, unsigned DiagID)
Issue the message to the client.
Definition: Diagnostic.h:1553
bool isIgnored(unsigned DiagID, SourceLocation Loc) const
Determine whether the diagnostic is known to be ignored.
Definition: Diagnostic.h:922
An opaque identifier used by SourceManager which refers to a source file (MemoryBuffer) along with it...
bool isInvalid() const
static FixItHint CreateReplacement(CharSourceRange RemoveRange, StringRef Code)
Create a code modification hint that replaces the given source range with the given code string.
Definition: Diagnostic.h:135
static FixItHint CreateRemoval(CharSourceRange RemoveRange)
Create a code modification hint that removes the given source range.
Definition: Diagnostic.h:124
static FixItHint CreateInsertion(SourceLocation InsertionLoc, StringRef Code, bool BeforePreviousInsertions=false)
Create a code modification hint that inserts the given code string at a specific location.
Definition: Diagnostic.h:98
One of these records is kept for each identifier that is lexed.
bool isHandleIdentifierCase() const
Return true if the Preprocessor::HandleIdentifier must be called on a token of this identifier.
bool isKeyword(const LangOptions &LangOpts) const
Return true if this token is a keyword in the specified language.
tok::ObjCKeywordKind getObjCKeywordID() const
Return the Objective-C keyword ID for this identifier.
IdentifierInfo & get(StringRef Name)
Return the identifier token info for the specified named identifier.
Keeps track of the various options that can be enabled, which controls the dialect of C or C++ that i...
Definition: LangOptions.h:482
Lexer - This provides a simple interface that turns a text buffer into a stream of tokens.
Definition: Lexer.h:78
static StringRef getSourceText(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts, bool *Invalid=nullptr)
Returns a string for the source that the range encompasses.
Definition: Lexer.cpp:1024
void SetKeepWhitespaceMode(bool Val)
SetKeepWhitespaceMode - This method lets clients enable or disable whitespace retention mode.
Definition: Lexer.h:254
static SourceLocation findLocationAfterToken(SourceLocation loc, tok::TokenKind TKind, const SourceManager &SM, const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine)
Checks that the given token is the first token that occurs after the given location (this excludes co...
Definition: Lexer.cpp:1358
bool LexFromRawLexer(Token &Result)
LexFromRawLexer - Lex a token from a designated raw lexer (one with no associated preprocessor object...
Definition: Lexer.h:236
bool inKeepCommentMode() const
inKeepCommentMode - Return true if the lexer should return comments as tokens.
Definition: Lexer.h:262
void SetCommentRetentionState(bool Mode)
SetCommentRetentionState - Change the comment retention mode of the lexer to the specified mode.
Definition: Lexer.h:269
void seek(unsigned Offset, bool IsAtStartOfLine)
Set the lexer's buffer pointer to Offset.
Definition: Lexer.cpp:278
static StringRef getImmediateMacroName(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Retrieve the name of the immediate macro expansion.
Definition: Lexer.cpp:1060
void ReadToEndOfLine(SmallVectorImpl< char > *Result=nullptr)
ReadToEndOfLine - Read the rest of the current preprocessor line as an uninterpreted string.
Definition: Lexer.cpp:3065
static bool isAtStartOfMacroExpansion(SourceLocation loc, const SourceManager &SM, const LangOptions &LangOpts, SourceLocation *MacroBegin=nullptr)
Returns true if the given MacroID location points at the first token of the macro expansion.
Definition: Lexer.cpp:872
DiagnosticBuilder Diag(const char *Loc, unsigned DiagID) const
Diag - Forwarding function for diagnostics.
Definition: Lexer.cpp:1232
bool Lex(Token &Result)
Lex - Return the next token in the file.
Definition: Lexer.cpp:3678
bool isPragmaLexer() const
isPragmaLexer - Returns true if this Lexer is being used to lex a pragma.
Definition: Lexer.h:225
static unsigned getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo, const SourceManager &SM, const LangOptions &LangOpts)
Get the physical length (including trigraphs and escaped newlines) of the first Characters characters...
Definition: Lexer.cpp:791
Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile, Preprocessor &PP, bool IsFirstIncludeOfFile=true)
Lexer constructor - Create a new lexer object for the specified buffer with the specified preprocesso...
Definition: Lexer.cpp:184
static bool isAtEndOfMacroExpansion(SourceLocation loc, const SourceManager &SM, const LangOptions &LangOpts, SourceLocation *MacroEnd=nullptr)
Returns true if the given MacroID location points at the last token of the macro expansion.
Definition: Lexer.cpp:894
SourceLocation getSourceLocation() override
getSourceLocation - Return a source location for the next character in the current file.
Definition: Lexer.h:303
static CharSourceRange makeFileCharRange(CharSourceRange Range, const SourceManager &SM, const LangOptions &LangOpts)
Accepts a range and returns a character range with file locations.
Definition: Lexer.cpp:955
static bool isNewLineEscaped(const char *BufferStart, const char *Str)
Checks whether new line pointed by Str is preceded by escape sequence.
Definition: Lexer.cpp:1138
const char * getBufferLocation() const
Return the current location in the buffer.
Definition: Lexer.h:308
SourceLocation getSourceLocation(const char *Loc, unsigned TokLen=1) const
getSourceLocation - Return a source location identifier for the specified offset in the current file.
Definition: Lexer.cpp:1213
static StringRef getIndentationForLine(SourceLocation Loc, const SourceManager &SM)
Returns the leading whitespace for line that corresponds to the given location Loc.
Definition: Lexer.cpp:1158
static unsigned getSpelling(const Token &Tok, const char *&Buffer, const SourceManager &SourceMgr, const LangOptions &LangOpts, bool *Invalid=nullptr)
getSpelling - This method is used to get the spelling of a token into a preallocated buffer,...
Definition: Lexer.cpp:452
bool isKeepWhitespaceMode() const
isKeepWhitespaceMode - Return true if the lexer should return tokens for every character in the file,...
Definition: Lexer.h:248
static bool isAsciiIdentifierContinueChar(char c, const LangOptions &LangOpts)
Returns true if the given character could appear in an identifier.
Definition: Lexer.cpp:1134
static unsigned MeasureTokenLength(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
MeasureTokenLength - Relex the token at the specified location and return its length in bytes in the ...
Definition: Lexer.cpp:499
static SourceLocation GetBeginningOfToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Given a location any where in a source buffer, find the location that corresponds to the beginning of...
Definition: Lexer.cpp:609
void resetExtendedTokenMode()
Sets the extended token mode back to its initial value, according to the language options and preproc...
Definition: Lexer.cpp:220
static StringRef getImmediateMacroNameForDiagnostics(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Retrieve the name of the immediate macro expansion.
Definition: Lexer.cpp:1107
static Lexer * Create_PragmaLexer(SourceLocation SpellingLoc, SourceLocation ExpansionLocStart, SourceLocation ExpansionLocEnd, unsigned TokLen, Preprocessor &PP)
Create_PragmaLexer: Lexer constructor - Create a new lexer object for _Pragma expansion.
Definition: Lexer.cpp:243
static PreambleBounds ComputePreamble(StringRef Buffer, const LangOptions &LangOpts, unsigned MaxLines=0)
Compute the preamble of the given file.
Definition: Lexer.cpp:637
static bool getRawToken(SourceLocation Loc, Token &Result, const SourceManager &SM, const LangOptions &LangOpts, bool IgnoreWhiteSpace=false)
Relex the token at the specified location.
Definition: Lexer.cpp:510
static std::optional< Token > findNextToken(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Finds the token that comes right after the given location.
Definition: Lexer.cpp:1325
static SourceLocation getLocForEndOfToken(SourceLocation Loc, unsigned Offset, const SourceManager &SM, const LangOptions &LangOpts)
Computes the source location just past the end of the token at this source location.
Definition: Lexer.cpp:850
static std::string Stringify(StringRef Str, bool Charify=false)
Stringify - Convert the specified string into a C string by i) escaping '\' and " characters and ii) ...
Definition: Lexer.cpp:310
static SizedChar getCharAndSizeNoWarn(const char *Ptr, const LangOptions &LangOpts)
getCharAndSizeNoWarn - Like the getCharAndSize method, but does not ever emit a warning.
Definition: Lexer.h:586
void ExitTopLevelConditional()
Called when the lexer exits the top-level conditional.
bool LexingRawMode
True if in raw mode.
SmallVector< PPConditionalInfo, 4 > ConditionalStack
Information about the set of #if/#ifdef/#ifndef blocks we are currently in.
bool ParsingPreprocessorDirective
True when parsing #XXX; turns '\n' into a tok::eod token.
MultipleIncludeOpt MIOpt
A state machine that detects the #ifndef-wrapping a file idiom for the multiple-include optimization.
bool ParsingFilename
True after #include; turns <xx> or "xxx" into a tok::header_name token.
bool isLexingRawMode() const
Return true if this lexer is in raw mode or not.
const FileID FID
The SourceManager FileID corresponding to the file being lexed.
bool LexEditorPlaceholders
When enabled, the preprocessor will construct editor placeholder tokens.
Engages in a tight little dance with the lexer to efficiently preprocess tokens.
Definition: Preprocessor.h:128
PreprocessorOptions & getPreprocessorOpts() const
Retrieve the preprocessor options used to initialize this preprocessor.
SourceLocation getCodeCompletionLoc() const
Returns the location of the code-completion point.
SourceManager & getSourceManager() const
SourceLocation getCodeCompletionFileLoc() const
Returns the start location of the file of code-completion point.
void setCodeCompletionTokenRange(const SourceLocation Start, const SourceLocation End)
Set the code completion token range for detecting replacement range later on.
bool isRecordingPreamble() const
void setRecordedPreambleConditionalStack(ArrayRef< PPConditionalInfo > s)
bool isInPrimaryFile() const
Return true if we're in the top-level file, not in a #include.
void CreateString(StringRef Str, Token &Tok, SourceLocation ExpansionLocStart=SourceLocation(), SourceLocation ExpansionLocEnd=SourceLocation())
Plop the specified string into a scratch buffer and set the specified token's location and length to ...
IdentifierInfo * LookUpIdentifierInfo(Token &Identifier) const
Given a tok::raw_identifier token, look up the identifier information for the token and install it in...
bool isPreprocessedOutput() const
Returns true if the preprocessor is responsible for generating output, false if it is producing token...
bool HandleIdentifier(Token &Identifier)
Callback invoked when the lexer reads an identifier and has filled in the tokens IdentifierInfo membe...
DiagnosticsEngine & getDiagnostics() const
IdentifierTable & getIdentifierTable()
bool getCommentRetentionState() const
bool hadModuleLoaderFatalFailure() const
StringRef getSpelling(SourceLocation loc, SmallVectorImpl< char > &buffer, bool *invalid=nullptr) const
Return the 'spelling' of the token at the given location; does not go up to the spelling location or ...
bool HandleComment(Token &result, SourceRange Comment)
bool isCodeCompletionEnabled() const
Determine if we are performing code completion.
void HandleDirective(Token &Result)
Callback invoked when the lexer sees a # token at the start of a line.
void CodeCompleteIncludedFile(llvm::StringRef Dir, bool IsAngled)
Hook used by the lexer to invoke the "included file" code completion point.
EmptylineHandler * getEmptylineHandler() const
const LangOptions & getLangOpts() const
void CodeCompleteNaturalLanguage()
Hook used by the lexer to invoke the "natural language" code completion point.
bool HandleEndOfFile(Token &Result, bool isEndOfMacro=false)
Callback invoked when the lexer hits the end of the current file.
void setCodeCompletionIdentifierInfo(IdentifierInfo *Filter)
Set the code completion token for filtering purposes.
DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID) const
Forwarding function for diagnostics.
Encodes a location in the source.
static SourceLocation getFromRawEncoding(UIntTy Encoding)
Turn a raw encoding of a SourceLocation object into a real SourceLocation.
bool isValid() const
Return true if this is a valid SourceLocation object.
SourceLocation getLocWithOffset(IntTy Offset) const
Return a source location with the specified offset from this SourceLocation.
UIntTy getRawEncoding() const
When a SourceLocation itself cannot be used, this returns an (opaque) 32-bit integer encoding for it.
This class handles loading and caching of source files into memory.
const char * getCharacterData(SourceLocation SL, bool *Invalid=nullptr) const
Return a pointer to the start of the specified location in the appropriate spelling MemoryBuffer.
A trivial tuple used to represent a source range.
void setBegin(SourceLocation b)
bool isInvalid() const
SourceLocation getEnd() const
SourceLocation getBegin() const
void setEnd(SourceLocation e)
Each ExpansionInfo encodes the expansion location - where the token was ultimately expanded,...
SourceLocation getExpansionLocStart() const
SourceLocation getSpellingLoc() const
This is a discriminated union of FileInfo and ExpansionInfo.
const ExpansionInfo & getExpansion() const
static bool isValidUDSuffix(const LangOptions &LangOpts, StringRef Suffix)
Determine whether a suffix is a valid ud-suffix.
Token - This structure provides full information about a lexed token.
Definition: Token.h:36
bool hasUCN() const
Returns true if this token contains a universal character name.
Definition: Token.h:306
bool isLiteral() const
Return true if this is a "literal", like a numeric constant, string, etc.
Definition: Token.h:116
SourceLocation getLocation() const
Return a source location identifier for the specified offset in the current file.
Definition: Token.h:132
unsigned getLength() const
Definition: Token.h:135
tok::ObjCKeywordKind getObjCKeywordID() const
Return the ObjC keyword kind.
Definition: Lexer.cpp:70
bool is(tok::TokenKind K) const
is/isNot - Predicates to check if this token is a specific kind, as in "if (Tok.is(tok::l_brace)) {....
Definition: Token.h:99
const char * getLiteralData() const
getLiteralData - For a literal token (numeric constant, string, etc), this returns a pointer to the s...
Definition: Token.h:225
tok::TokenKind getKind() const
Definition: Token.h:94
bool isAtStartOfLine() const
isAtStartOfLine - Return true if this token is at the start of a line.
Definition: Token.h:276
IdentifierInfo * getIdentifierInfo() const
Definition: Token.h:187
@ HasUCN
Definition: Token.h:83
@ IsEditorPlaceholder
Definition: Token.h:88
@ LeadingEmptyMacro
Definition: Token.h:81
@ LeadingSpace
Definition: Token.h:77
@ StartOfLine
Definition: Token.h:75
@ HasUDSuffix
Definition: Token.h:82
@ NeedsCleaning
Definition: Token.h:80
bool isAnnotation() const
Return true if this is any of tok::annot_* kind tokens.
Definition: Token.h:121
bool isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const
Return true if we have an ObjC keyword identifier.
Definition: Lexer.cpp:61
bool isSimpleTypeSpecifier(const LangOptions &LangOpts) const
Determine whether the token kind starts a simple-type-specifier.
Definition: Lexer.cpp:78
void startToken()
Reset all flags to cleared.
Definition: Token.h:177
bool needsCleaning() const
Return true if this token has trigraphs or escaped newlines in it.
Definition: Token.h:295
StringRef getRawIdentifier() const
getRawIdentifier - For a raw identifier token (i.e., an identifier lexed in raw mode),...
Definition: Token.h:213
void setFlag(TokenFlags Flag)
Set the specified flag.
Definition: Token.h:244
static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a)
Copies the values of the most significant bits from each 8-bit element in a 128-bit integer vector of...
Definition: emmintrin.h:4263
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a, __m128i __b)
Compares each of the corresponding 8-bit values of the 128-bit integer vectors for equality.
Definition: emmintrin.h:3075
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si128(__m128i_u const *__p)
Moves packed integer values from an unaligned 128-bit memory location to elements in a 128-bit intege...
Definition: emmintrin.h:3440
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi8(char __b)
Initializes all values in a 128-bit vector of [16 x i8] with the specified 8-bit value.
Definition: emmintrin.h:3723
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_load_si128(__m128i const *__p)
Moves packed integer values from an aligned 128-bit memory location to elements in a 128-bit integer ...
Definition: emmintrin.h:3425
@ tokens_present_before_eof
Indicates that there are tokens present between the last scanned directive and eof.
@ After
Like System, but searched after the system directories.
bool isStringLiteral(TokenKind K)
Return true if this is a C or C++ string-literal (or C++11 user-defined-string-literal) token.
Definition: TokenKinds.h:89
ObjCKeywordKind
Provides a namespace for Objective-C keywords which start with an '@'.
Definition: TokenKinds.h:41
TokenKind
Provides a simple uniform namespace for tokens from all C languages.
Definition: TokenKinds.h:25
The JSON file list parser is used to communicate input to InstallAPI.
LLVM_READNONE bool isASCII(char c)
Returns true if a byte is an ASCII character.
Definition: CharInfo.h:41
LLVM_READONLY bool isVerticalWhitespace(unsigned char c)
Returns true if this character is vertical ASCII whitespace: '\n', '\r'.
Definition: CharInfo.h:99
ConflictMarkerKind
ConflictMarkerKind - Kinds of conflict marker which the lexer might be recovering from.
Definition: Lexer.h:44
@ CMK_Perforce
A Perforce-style conflict marker, initiated by 4 ">"s, separated by 4 "="s, and terminated by 4 "<"s.
Definition: Lexer.h:54
@ CMK_None
Not within a conflict marker.
Definition: Lexer.h:46
@ CMK_Normal
A normal or diff3 conflict marker, initiated by at least 7 "<"s, separated by at least 7 "="s or "|"s...
Definition: Lexer.h:50
@ LineComment
Definition: LangStandard.h:50
LLVM_READONLY bool isAsciiIdentifierContinue(unsigned char c)
Definition: CharInfo.h:61
bool operator<(DeclarationName LHS, DeclarationName RHS)
Ordering on two declaration names.
LLVM_READONLY bool isHorizontalWhitespace(unsigned char c)
Returns true if this character is horizontal ASCII whitespace: ' ', '\t', '\f', '\v'.
Definition: CharInfo.h:91
LLVM_READONLY bool isRawStringDelimBody(unsigned char c)
Return true if this is the body character of a C++ raw string delimiter.
Definition: CharInfo.h:176
LLVM_READONLY bool isWhitespace(unsigned char c)
Return true if this character is horizontal or vertical ASCII whitespace: ' ', '\t',...
Definition: CharInfo.h:108
LLVM_READONLY bool isPreprocessingNumberBody(unsigned char c)
Return true if this is the body character of a C preprocessing number, which is [a-zA-Z0-9_.].
Definition: CharInfo.h:169
const FunctionProtoType * T
LLVM_READONLY bool isAsciiIdentifierStart(unsigned char c, bool AllowDollar=false)
Returns true if this is a valid first character of a C identifier, which is [a-zA-Z_].
Definition: CharInfo.h:53
Definition: Format.h:5433
__INTPTR_TYPE__ intptr_t
A signed integer type with the property that any valid pointer to void can be converted to this type,...
float __ovld __cnfn length(float)
Return the length of vector p, i.e., sqrt(p.x^2 + p.y^2 + ...)
#define _SIDD_UBYTE_OPS
Definition: smmintrin.h:1526
#define _mm_cmpistri(A, B, M)
Uses the immediate operand M to perform a comparison of string data with implicitly defined lengths t...
Definition: smmintrin.h:1658
#define _SIDD_LEAST_SIGNIFICANT
Definition: smmintrin.h:1544
#define _SIDD_NEGATIVE_POLARITY
Definition: smmintrin.h:1539
#define _SIDD_CMP_RANGES
Definition: smmintrin.h:1533
Represents a char and the number of bytes parsed to produce it.
Definition: Lexer.h:579
Describes the bounds (start, size) of the preamble and a flag required by PreprocessorOptions::Precom...
Definition: Lexer.h:60
Token lexed as part of dependency directive scanning.
unsigned Offset
Offset into the original source input.