clang  19.0.0git
ArrayBoundCheckerV2.cpp
Go to the documentation of this file.
1 //== ArrayBoundCheckerV2.cpp ------------------------------------*- C++ -*--==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file defines ArrayBoundCheckerV2, which is a path-sensitive check
10 // which looks for an out-of-bound array element access.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "clang/AST/CharUnits.h"
25 #include "llvm/ADT/SmallString.h"
26 #include "llvm/Support/FormatVariadic.h"
27 #include "llvm/Support/raw_ostream.h"
28 #include <optional>
29 
30 using namespace clang;
31 using namespace ento;
32 using namespace taint;
33 using llvm::formatv;
34 
35 namespace {
36 /// If `E` is a "clean" array subscript expression, return the type of the
37 /// accessed element. If the base of the subscript expression is modified by
38 /// pointer arithmetic (and not the beginning of a "full" memory region), this
39 /// always returns nullopt because that's the right (or the least bad) thing to
40 /// do for the diagnostic output that's relying on this.
41 static std::optional<QualType> determineElementType(const Expr *E,
42  const CheckerContext &C) {
43  const auto *ASE = dyn_cast<ArraySubscriptExpr>(E);
44  if (!ASE)
45  return std::nullopt;
46 
47  const MemRegion *SubscriptBaseReg = C.getSVal(ASE->getBase()).getAsRegion();
48  if (!SubscriptBaseReg)
49  return std::nullopt;
50 
51  // The base of the subscript expression is affected by pointer arithmetics,
52  // so we want to report byte offsets instead of indices.
53  if (isa<ElementRegion>(SubscriptBaseReg->StripCasts()))
54  return std::nullopt;
55 
56  return ASE->getType();
57 }
58 
59 static std::optional<int64_t>
60 determineElementSize(const std::optional<QualType> T, const CheckerContext &C) {
61  if (!T)
62  return std::nullopt;
63  return C.getASTContext().getTypeSizeInChars(*T).getQuantity();
64 }
65 
66 class StateUpdateReporter {
67  const SubRegion *Reg;
68  const NonLoc ByteOffsetVal;
69  const std::optional<QualType> ElementType;
70  const std::optional<int64_t> ElementSize;
71  bool AssumedNonNegative = false;
72  std::optional<NonLoc> AssumedUpperBound = std::nullopt;
73 
74 public:
75  StateUpdateReporter(const SubRegion *R, NonLoc ByteOffsVal, const Expr *E,
76  CheckerContext &C)
77  : Reg(R), ByteOffsetVal(ByteOffsVal),
78  ElementType(determineElementType(E, C)),
79  ElementSize(determineElementSize(ElementType, C)) {}
80 
81  void recordNonNegativeAssumption() { AssumedNonNegative = true; }
82  void recordUpperBoundAssumption(NonLoc UpperBoundVal) {
83  AssumedUpperBound = UpperBoundVal;
84  }
85 
86  bool assumedNonNegative() { return AssumedNonNegative; }
87 
88  const NoteTag *createNoteTag(CheckerContext &C) const;
89 
90 private:
91  std::string getMessage(PathSensitiveBugReport &BR) const;
92 
93  /// Return true if information about the value of `Sym` can put constraints
94  /// on some symbol which is interesting within the bug report `BR`.
95  /// In particular, this returns true when `Sym` is interesting within `BR`;
96  /// but it also returns true if `Sym` is an expression that contains integer
97  /// constants and a single symbolic operand which is interesting (in `BR`).
98  /// We need to use this instead of plain `BR.isInteresting()` because if we
99  /// are analyzing code like
100  /// int array[10];
101  /// int f(int arg) {
102  /// return array[arg] && array[arg + 10];
103  /// }
104  /// then the byte offsets are `arg * 4` and `(arg + 10) * 4`, which are not
105  /// sub-expressions of each other (but `getSimplifiedOffsets` is smart enough
106  /// to detect this out of bounds access).
107  static bool providesInformationAboutInteresting(SymbolRef Sym,
109  static bool providesInformationAboutInteresting(SVal SV,
111  return providesInformationAboutInteresting(SV.getAsSymbol(), BR);
112  }
113 };
114 
115 struct Messages {
116  std::string Short, Full;
117 };
118 
119 // NOTE: The `ArraySubscriptExpr` and `UnaryOperator` callbacks are `PostStmt`
120 // instead of `PreStmt` because the current implementation passes the whole
121 // expression to `CheckerContext::getSVal()` which only works after the
122 // symbolic evaluation of the expression. (To turn them into `PreStmt`
123 // callbacks, we'd need to duplicate the logic that evaluates these
124 // expressions.) The `MemberExpr` callback would work as `PreStmt` but it's
125 // defined as `PostStmt` for the sake of consistency with the other callbacks.
126 class ArrayBoundCheckerV2 : public Checker<check::PostStmt<ArraySubscriptExpr>,
127  check::PostStmt<UnaryOperator>,
128  check::PostStmt<MemberExpr>> {
129  BugType BT{this, "Out-of-bound access"};
130  BugType TaintBT{this, "Out-of-bound access", categories::TaintedData};
131 
132  void performCheck(const Expr *E, CheckerContext &C) const;
133 
134  void reportOOB(CheckerContext &C, ProgramStateRef ErrorState, Messages Msgs,
135  NonLoc Offset, std::optional<NonLoc> Extent,
136  bool IsTaintBug = false) const;
137 
138  static void markPartsInteresting(PathSensitiveBugReport &BR,
139  ProgramStateRef ErrorState, NonLoc Val,
140  bool MarkTaint);
141 
142  static bool isFromCtypeMacro(const Stmt *S, ASTContext &AC);
143 
144  static bool isIdiomaticPastTheEndPtr(const Expr *E, ProgramStateRef State,
145  NonLoc Offset, NonLoc Limit,
146  CheckerContext &C);
147  static bool isInAddressOf(const Stmt *S, ASTContext &AC);
148 
149 public:
150  void checkPostStmt(const ArraySubscriptExpr *E, CheckerContext &C) const {
151  performCheck(E, C);
152  }
153  void checkPostStmt(const UnaryOperator *E, CheckerContext &C) const {
154  if (E->getOpcode() == UO_Deref)
155  performCheck(E, C);
156  }
157  void checkPostStmt(const MemberExpr *E, CheckerContext &C) const {
158  if (E->isArrow())
159  performCheck(E->getBase(), C);
160  }
161 };
162 
163 } // anonymous namespace
164 
165 /// For a given Location that can be represented as a symbolic expression
166 /// Arr[Idx] (or perhaps Arr[Idx1][Idx2] etc.), return the parent memory block
167 /// Arr and the distance of Location from the beginning of Arr (expressed in a
168 /// NonLoc that specifies the number of CharUnits). Returns nullopt when these
169 /// cannot be determined.
170 static std::optional<std::pair<const SubRegion *, NonLoc>>
172  QualType T = SVB.getArrayIndexType();
173  auto EvalBinOp = [&SVB, State, T](BinaryOperatorKind Op, NonLoc L, NonLoc R) {
174  // We will use this utility to add and multiply values.
175  return SVB.evalBinOpNN(State, Op, L, R, T).getAs<NonLoc>();
176  };
177 
178  const SubRegion *OwnerRegion = nullptr;
179  std::optional<NonLoc> Offset = SVB.makeZeroArrayIndex();
180 
181  const ElementRegion *CurRegion =
182  dyn_cast_or_null<ElementRegion>(Location.getAsRegion());
183 
184  while (CurRegion) {
185  const auto Index = CurRegion->getIndex().getAs<NonLoc>();
186  if (!Index)
187  return std::nullopt;
188 
189  QualType ElemType = CurRegion->getElementType();
190 
191  // FIXME: The following early return was presumably added to safeguard the
192  // getTypeSizeInChars() call (which doesn't accept an incomplete type), but
193  // it seems that `ElemType` cannot be incomplete at this point.
194  if (ElemType->isIncompleteType())
195  return std::nullopt;
196 
197  // Calculate Delta = Index * sizeof(ElemType).
198  NonLoc Size = SVB.makeArrayIndex(
199  SVB.getContext().getTypeSizeInChars(ElemType).getQuantity());
200  auto Delta = EvalBinOp(BO_Mul, *Index, Size);
201  if (!Delta)
202  return std::nullopt;
203 
204  // Perform Offset += Delta.
205  Offset = EvalBinOp(BO_Add, *Offset, *Delta);
206  if (!Offset)
207  return std::nullopt;
208 
209  OwnerRegion = CurRegion->getSuperRegion()->getAs<SubRegion>();
210  // When this is just another ElementRegion layer, we need to continue the
211  // offset calculations:
212  CurRegion = dyn_cast_or_null<ElementRegion>(OwnerRegion);
213  }
214 
215  if (OwnerRegion)
216  return std::make_pair(OwnerRegion, *Offset);
217 
218  return std::nullopt;
219 }
220 
221 // NOTE: This function is the "heart" of this checker. It simplifies
222 // inequalities with transformations that are valid (and very elementary) in
223 // pure mathematics, but become invalid if we use them in C++ number model
224 // where the calculations may overflow.
225 // Due to the overflow issues I think it's impossible (or at least not
226 // practical) to integrate this kind of simplification into the resolution of
227 // arbitrary inequalities (i.e. the code of `evalBinOp`); but this function
228 // produces valid results when the calculations are handling memory offsets
229 // and every value is well below SIZE_MAX.
230 // TODO: This algorithm should be moved to a central location where it's
231 // available for other checkers that need to compare memory offsets.
232 // NOTE: the simplification preserves the order of the two operands in a
233 // mathematical sense, but it may change the result produced by a C++
234 // comparison operator (and the automatic type conversions).
235 // For example, consider a comparison "X+1 < 0", where the LHS is stored as a
236 // size_t and the RHS is stored in an int. (As size_t is unsigned, this
237 // comparison is false for all values of "X".) However, the simplification may
238 // turn it into "X < -1", which is still always false in a mathematical sense,
239 // but can produce a true result when evaluated by `evalBinOp` (which follows
240 // the rules of C++ and casts -1 to SIZE_MAX).
241 static std::pair<NonLoc, nonloc::ConcreteInt>
243  SValBuilder &svalBuilder) {
244  std::optional<nonloc::SymbolVal> SymVal = offset.getAs<nonloc::SymbolVal>();
245  if (SymVal && SymVal->isExpression()) {
246  if (const SymIntExpr *SIE = dyn_cast<SymIntExpr>(SymVal->getSymbol())) {
247  llvm::APSInt constant =
248  APSIntType(extent.getValue()).convert(SIE->getRHS());
249  switch (SIE->getOpcode()) {
250  case BO_Mul:
251  // The constant should never be 0 here, becasue multiplication by zero
252  // is simplified by the engine.
253  if ((extent.getValue() % constant) != 0)
254  return std::pair<NonLoc, nonloc::ConcreteInt>(offset, extent);
255  else
256  return getSimplifiedOffsets(
257  nonloc::SymbolVal(SIE->getLHS()),
258  svalBuilder.makeIntVal(extent.getValue() / constant),
259  svalBuilder);
260  case BO_Add:
261  return getSimplifiedOffsets(
262  nonloc::SymbolVal(SIE->getLHS()),
263  svalBuilder.makeIntVal(extent.getValue() - constant), svalBuilder);
264  default:
265  break;
266  }
267  }
268  }
269 
270  return std::pair<NonLoc, nonloc::ConcreteInt>(offset, extent);
271 }
272 
274  const llvm::APSInt *MaxV = SVB.getMaxValue(State, Value);
275  return MaxV && MaxV->isNegative();
276 }
277 
278 static bool isUnsigned(SValBuilder &SVB, NonLoc Value) {
279  QualType T = Value.getType(SVB.getContext());
280  return T->isUnsignedIntegerType();
281 }
282 
283 // Evaluate the comparison Value < Threshold with the help of the custom
284 // simplification algorithm defined for this checker. Return a pair of states,
285 // where the first one corresponds to "value below threshold" and the second
286 // corresponds to "value at or above threshold". Returns {nullptr, nullptr} in
287 // the case when the evaluation fails.
288 // If the optional argument CheckEquality is true, then use BO_EQ instead of
289 // the default BO_LT after consistently applying the same simplification steps.
290 static std::pair<ProgramStateRef, ProgramStateRef>
292  SValBuilder &SVB, bool CheckEquality = false) {
293  if (auto ConcreteThreshold = Threshold.getAs<nonloc::ConcreteInt>()) {
294  std::tie(Value, Threshold) = getSimplifiedOffsets(Value, *ConcreteThreshold, SVB);
295  }
296 
297  // We want to perform a _mathematical_ comparison between the numbers `Value`
298  // and `Threshold`; but `evalBinOpNN` evaluates a C/C++ operator that may
299  // perform automatic conversions. For example the number -1 is less than the
300  // number 1000, but -1 < `1000ull` will evaluate to `false` because the `int`
301  // -1 is converted to ULONGLONG_MAX.
302  // To avoid automatic conversions, we evaluate the "obvious" cases without
303  // calling `evalBinOpNN`:
304  if (isNegative(SVB, State, Value) && isUnsigned(SVB, Threshold)) {
305  if (CheckEquality) {
306  // negative_value == unsigned_threshold is always false
307  return {nullptr, State};
308  }
309  // negative_value < unsigned_threshold is always true
310  return {State, nullptr};
311  }
312  if (isUnsigned(SVB, Value) && isNegative(SVB, State, Threshold)) {
313  // unsigned_value == negative_threshold and
314  // unsigned_value < negative_threshold are both always false
315  return {nullptr, State};
316  }
317  // FIXME: These special cases are sufficient for handling real-world
318  // comparisons, but in theory there could be contrived situations where
319  // automatic conversion of a symbolic value (which can be negative and can be
320  // positive) leads to incorrect results.
321  // NOTE: We NEED to use the `evalBinOpNN` call in the "common" case, because
322  // we want to ensure that assumptions coming from this precondition and
323  // assumptions coming from regular C/C++ operator calls are represented by
324  // constraints on the same symbolic expression. A solution that would
325  // evaluate these "mathematical" compariosns through a separate pathway would
326  // be a step backwards in this sense.
327 
328  const BinaryOperatorKind OpKind = CheckEquality ? BO_EQ : BO_LT;
329  auto BelowThreshold =
330  SVB.evalBinOpNN(State, OpKind, Value, Threshold, SVB.getConditionType())
331  .getAs<NonLoc>();
332 
333  if (BelowThreshold)
334  return State->assume(*BelowThreshold);
335 
336  return {nullptr, nullptr};
337 }
338 
339 static std::string getRegionName(const SubRegion *Region) {
340  if (std::string RegName = Region->getDescriptiveName(); !RegName.empty())
341  return RegName;
342 
343  // Field regions only have descriptive names when their parent has a
344  // descriptive name; so we provide a fallback representation for them:
345  if (const auto *FR = Region->getAs<FieldRegion>()) {
346  if (StringRef Name = FR->getDecl()->getName(); !Name.empty())
347  return formatv("the field '{0}'", Name);
348  return "the unnamed field";
349  }
350 
351  if (isa<AllocaRegion>(Region))
352  return "the memory returned by 'alloca'";
353 
354  if (isa<SymbolicRegion>(Region) &&
355  isa<HeapSpaceRegion>(Region->getMemorySpace()))
356  return "the heap area";
357 
358  if (isa<StringRegion>(Region))
359  return "the string literal";
360 
361  return "the region";
362 }
363 
364 static std::optional<int64_t> getConcreteValue(NonLoc SV) {
365  if (auto ConcreteVal = SV.getAs<nonloc::ConcreteInt>()) {
366  return ConcreteVal->getValue().tryExtValue();
367  }
368  return std::nullopt;
369 }
370 
371 static std::optional<int64_t> getConcreteValue(std::optional<NonLoc> SV) {
372  return SV ? getConcreteValue(*SV) : std::nullopt;
373 }
374 
375 static Messages getPrecedesMsgs(const SubRegion *Region, NonLoc Offset) {
376  std::string RegName = getRegionName(Region);
377  SmallString<128> Buf;
378  llvm::raw_svector_ostream Out(Buf);
379  Out << "Access of " << RegName << " at negative byte offset";
380  if (auto ConcreteIdx = Offset.getAs<nonloc::ConcreteInt>())
381  Out << ' ' << ConcreteIdx->getValue();
382  return {formatv("Out of bound access to memory preceding {0}", RegName),
383  std::string(Buf)};
384 }
385 
386 /// Try to divide `Val1` and `Val2` (in place) by `Divisor` and return true if
387 /// it can be performed (`Divisor` is nonzero and there is no remainder). The
388 /// values `Val1` and `Val2` may be nullopt and in that case the corresponding
389 /// division is considered to be successful.
390 static bool tryDividePair(std::optional<int64_t> &Val1,
391  std::optional<int64_t> &Val2, int64_t Divisor) {
392  if (!Divisor)
393  return false;
394  const bool Val1HasRemainder = Val1 && *Val1 % Divisor;
395  const bool Val2HasRemainder = Val2 && *Val2 % Divisor;
396  if (!Val1HasRemainder && !Val2HasRemainder) {
397  if (Val1)
398  *Val1 /= Divisor;
399  if (Val2)
400  *Val2 /= Divisor;
401  return true;
402  }
403  return false;
404 }
405 
406 static Messages getExceedsMsgs(ASTContext &ACtx, const SubRegion *Region,
407  NonLoc Offset, NonLoc Extent, SVal Location,
408  bool AlsoMentionUnderflow) {
409  std::string RegName = getRegionName(Region);
410  const auto *EReg = Location.getAsRegion()->getAs<ElementRegion>();
411  assert(EReg && "this checker only handles element access");
412  QualType ElemType = EReg->getElementType();
413 
414  std::optional<int64_t> OffsetN = getConcreteValue(Offset);
415  std::optional<int64_t> ExtentN = getConcreteValue(Extent);
416 
417  int64_t ElemSize = ACtx.getTypeSizeInChars(ElemType).getQuantity();
418 
419  bool UseByteOffsets = !tryDividePair(OffsetN, ExtentN, ElemSize);
420  const char *OffsetOrIndex = UseByteOffsets ? "byte offset" : "index";
421 
422  SmallString<256> Buf;
423  llvm::raw_svector_ostream Out(Buf);
424  Out << "Access of ";
425  if (!ExtentN && !UseByteOffsets)
426  Out << "'" << ElemType.getAsString() << "' element in ";
427  Out << RegName << " at ";
428  if (AlsoMentionUnderflow) {
429  Out << "a negative or overflowing " << OffsetOrIndex;
430  } else if (OffsetN) {
431  Out << OffsetOrIndex << " " << *OffsetN;
432  } else {
433  Out << "an overflowing " << OffsetOrIndex;
434  }
435  if (ExtentN) {
436  Out << ", while it holds only ";
437  if (*ExtentN != 1)
438  Out << *ExtentN;
439  else
440  Out << "a single";
441  if (UseByteOffsets)
442  Out << " byte";
443  else
444  Out << " '" << ElemType.getAsString() << "' element";
445 
446  if (*ExtentN > 1)
447  Out << "s";
448  }
449 
450  return {formatv("Out of bound access to memory {0} {1}",
451  AlsoMentionUnderflow ? "around" : "after the end of",
452  RegName),
453  std::string(Buf)};
454 }
455 
456 static Messages getTaintMsgs(const SubRegion *Region, const char *OffsetName,
457  bool AlsoMentionUnderflow) {
458  std::string RegName = getRegionName(Region);
459  return {formatv("Potential out of bound access to {0} with tainted {1}",
460  RegName, OffsetName),
461  formatv("Access of {0} with a tainted {1} that may be {2}too large",
462  RegName, OffsetName,
463  AlsoMentionUnderflow ? "negative or " : "")};
464 }
465 
466 const NoteTag *StateUpdateReporter::createNoteTag(CheckerContext &C) const {
467  // Don't create a note tag if we didn't assume anything:
468  if (!AssumedNonNegative && !AssumedUpperBound)
469  return nullptr;
470 
471  return C.getNoteTag([*this](PathSensitiveBugReport &BR) -> std::string {
472  return getMessage(BR);
473  });
474 }
475 
476 std::string StateUpdateReporter::getMessage(PathSensitiveBugReport &BR) const {
477  bool ShouldReportNonNegative = AssumedNonNegative;
478  if (!providesInformationAboutInteresting(ByteOffsetVal, BR)) {
479  if (AssumedUpperBound &&
480  providesInformationAboutInteresting(*AssumedUpperBound, BR)) {
481  // Even if the byte offset isn't interesting (e.g. it's a constant value),
482  // the assumption can still be interesting if it provides information
483  // about an interesting symbolic upper bound.
484  ShouldReportNonNegative = false;
485  } else {
486  // We don't have anything interesting, don't report the assumption.
487  return "";
488  }
489  }
490 
491  std::optional<int64_t> OffsetN = getConcreteValue(ByteOffsetVal);
492  std::optional<int64_t> ExtentN = getConcreteValue(AssumedUpperBound);
493 
494  const bool UseIndex =
495  ElementSize && tryDividePair(OffsetN, ExtentN, *ElementSize);
496 
497  SmallString<256> Buf;
498  llvm::raw_svector_ostream Out(Buf);
499  Out << "Assuming ";
500  if (UseIndex) {
501  Out << "index ";
502  if (OffsetN)
503  Out << "'" << OffsetN << "' ";
504  } else if (AssumedUpperBound) {
505  Out << "byte offset ";
506  if (OffsetN)
507  Out << "'" << OffsetN << "' ";
508  } else {
509  Out << "offset ";
510  }
511 
512  Out << "is";
513  if (ShouldReportNonNegative) {
514  Out << " non-negative";
515  }
516  if (AssumedUpperBound) {
517  if (ShouldReportNonNegative)
518  Out << " and";
519  Out << " less than ";
520  if (ExtentN)
521  Out << *ExtentN << ", ";
522  if (UseIndex && ElementType)
523  Out << "the number of '" << ElementType->getAsString()
524  << "' elements in ";
525  else
526  Out << "the extent of ";
527  Out << getRegionName(Reg);
528  }
529  return std::string(Out.str());
530 }
531 
532 bool StateUpdateReporter::providesInformationAboutInteresting(
534  if (!Sym)
535  return false;
536  for (SymbolRef PartSym : Sym->symbols()) {
537  // The interestingess mark may appear on any layer as we're stripping off
538  // the SymIntExpr, UnarySymExpr etc. layers...
539  if (BR.isInteresting(PartSym))
540  return true;
541  // ...but if both sides of the expression are symbolic, then there is no
542  // practical algorithm to produce separate constraints for the two
543  // operands (from the single combined result).
544  if (isa<SymSymExpr>(PartSym))
545  return false;
546  }
547  return false;
548 }
549 
550 void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const {
551  const SVal Location = C.getSVal(E);
552 
553  // The header ctype.h (from e.g. glibc) implements the isXXXXX() macros as
554  // #define isXXXXX(arg) (LOOKUP_TABLE[arg] & BITMASK_FOR_XXXXX)
555  // and incomplete analysis of these leads to false positives. As even
556  // accurate reports would be confusing for the users, just disable reports
557  // from these macros:
558  if (isFromCtypeMacro(E, C.getASTContext()))
559  return;
560 
561  ProgramStateRef State = C.getState();
562  SValBuilder &SVB = C.getSValBuilder();
563 
564  const std::optional<std::pair<const SubRegion *, NonLoc>> &RawOffset =
565  computeOffset(State, SVB, Location);
566 
567  if (!RawOffset)
568  return;
569 
570  auto [Reg, ByteOffset] = *RawOffset;
571 
572  // The state updates will be reported as a single note tag, which will be
573  // composed by this helper class.
574  StateUpdateReporter SUR(Reg, ByteOffset, E, C);
575 
576  // CHECK LOWER BOUND
577  const MemSpaceRegion *Space = Reg->getMemorySpace();
578  if (!(isa<SymbolicRegion>(Reg) && isa<UnknownSpaceRegion>(Space))) {
579  // A symbolic region in unknown space represents an unknown pointer that
580  // may point into the middle of an array, so we don't look for underflows.
581  // Both conditions are significant because we want to check underflows in
582  // symbolic regions on the heap (which may be introduced by checkers like
583  // MallocChecker that call SValBuilder::getConjuredHeapSymbolVal()) and
584  // non-symbolic regions (e.g. a field subregion of a symbolic region) in
585  // unknown space.
586  auto [PrecedesLowerBound, WithinLowerBound] = compareValueToThreshold(
587  State, ByteOffset, SVB.makeZeroArrayIndex(), SVB);
588 
589  if (PrecedesLowerBound) {
590  // The offset may be invalid (negative)...
591  if (!WithinLowerBound) {
592  // ...and it cannot be valid (>= 0), so report an error.
593  Messages Msgs = getPrecedesMsgs(Reg, ByteOffset);
594  reportOOB(C, PrecedesLowerBound, Msgs, ByteOffset, std::nullopt);
595  return;
596  }
597  // ...but it can be valid as well, so the checker will (optimistically)
598  // assume that it's valid and mention this in the note tag.
599  SUR.recordNonNegativeAssumption();
600  }
601 
602  // Actually update the state. The "if" only fails in the extremely unlikely
603  // case when compareValueToThreshold returns {nullptr, nullptr} becasue
604  // evalBinOpNN fails to evaluate the less-than operator.
605  if (WithinLowerBound)
606  State = WithinLowerBound;
607  }
608 
609  // CHECK UPPER BOUND
611  if (auto KnownSize = Size.getAs<NonLoc>()) {
612  // In a situation where both overflow and overflow are possible (but the
613  // index is either tainted or known to be invalid), the logic of this
614  // checker will first assume that the offset is non-negative, and then
615  // (with this additional assumption) it will detect an overflow error.
616  // In this situation the warning message should mention both possibilities.
617  bool AlsoMentionUnderflow = SUR.assumedNonNegative();
618 
619  auto [WithinUpperBound, ExceedsUpperBound] =
620  compareValueToThreshold(State, ByteOffset, *KnownSize, SVB);
621 
622  if (ExceedsUpperBound) {
623  // The offset may be invalid (>= Size)...
624  if (!WithinUpperBound) {
625  // ...and it cannot be within bounds, so report an error, unless we can
626  // definitely determine that this is an idiomatic `&array[size]`
627  // expression that calculates the past-the-end pointer.
628  if (isIdiomaticPastTheEndPtr(E, ExceedsUpperBound, ByteOffset,
629  *KnownSize, C)) {
630  C.addTransition(ExceedsUpperBound, SUR.createNoteTag(C));
631  return;
632  }
633 
634  Messages Msgs =
635  getExceedsMsgs(C.getASTContext(), Reg, ByteOffset, *KnownSize,
636  Location, AlsoMentionUnderflow);
637  reportOOB(C, ExceedsUpperBound, Msgs, ByteOffset, KnownSize);
638  return;
639  }
640  // ...and it can be valid as well...
641  if (isTainted(State, ByteOffset)) {
642  // ...but it's tainted, so report an error.
643 
644  // Diagnostic detail: saying "tainted offset" is always correct, but
645  // the common case is that 'idx' is tainted in 'arr[idx]' and then it's
646  // nicer to say "tainted index".
647  const char *OffsetName = "offset";
648  if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(E))
649  if (isTainted(State, ASE->getIdx(), C.getLocationContext()))
650  OffsetName = "index";
651 
652  Messages Msgs = getTaintMsgs(Reg, OffsetName, AlsoMentionUnderflow);
653  reportOOB(C, ExceedsUpperBound, Msgs, ByteOffset, KnownSize,
654  /*IsTaintBug=*/true);
655  return;
656  }
657  // ...and it isn't tainted, so the checker will (optimistically) assume
658  // that the offset is in bounds and mention this in the note tag.
659  SUR.recordUpperBoundAssumption(*KnownSize);
660  }
661 
662  // Actually update the state. The "if" only fails in the extremely unlikely
663  // case when compareValueToThreshold returns {nullptr, nullptr} becasue
664  // evalBinOpNN fails to evaluate the less-than operator.
665  if (WithinUpperBound)
666  State = WithinUpperBound;
667  }
668 
669  // Add a transition, reporting the state updates that we accumulated.
670  C.addTransition(State, SUR.createNoteTag(C));
671 }
672 
673 void ArrayBoundCheckerV2::markPartsInteresting(PathSensitiveBugReport &BR,
674  ProgramStateRef ErrorState,
675  NonLoc Val, bool MarkTaint) {
676  if (SymbolRef Sym = Val.getAsSymbol()) {
677  // If the offset is a symbolic value, iterate over its "parts" with
678  // `SymExpr::symbols()` and mark each of them as interesting.
679  // For example, if the offset is `x*4 + y` then we put interestingness onto
680  // the SymSymExpr `x*4 + y`, the SymIntExpr `x*4` and the two data symbols
681  // `x` and `y`.
682  for (SymbolRef PartSym : Sym->symbols())
683  BR.markInteresting(PartSym);
684  }
685 
686  if (MarkTaint) {
687  // If the issue that we're reporting depends on the taintedness of the
688  // offset, then put interestingness onto symbols that could be the origin
689  // of the taint. Note that this may find symbols that did not appear in
690  // `Sym->symbols()` (because they're only loosely connected to `Val`).
691  for (SymbolRef Sym : getTaintedSymbols(ErrorState, Val))
692  BR.markInteresting(Sym);
693  }
694 }
695 
696 void ArrayBoundCheckerV2::reportOOB(CheckerContext &C,
697  ProgramStateRef ErrorState, Messages Msgs,
698  NonLoc Offset, std::optional<NonLoc> Extent,
699  bool IsTaintBug /*=false*/) const {
700 
701  ExplodedNode *ErrorNode = C.generateErrorNode(ErrorState);
702  if (!ErrorNode)
703  return;
704 
705  auto BR = std::make_unique<PathSensitiveBugReport>(
706  IsTaintBug ? TaintBT : BT, Msgs.Short, Msgs.Full, ErrorNode);
707 
708  // FIXME: ideally we would just call trackExpressionValue() and that would
709  // "do the right thing": mark the relevant symbols as interesting, track the
710  // control dependencies and statements storing the relevant values and add
711  // helpful diagnostic pieces. However, right now trackExpressionValue() is
712  // a heap of unreliable heuristics, so it would cause several issues:
713  // - Interestingness is not applied consistently, e.g. if `array[x+10]`
714  // causes an overflow, then `x` is not marked as interesting.
715  // - We get irrelevant diagnostic pieces, e.g. in the code
716  // `int *p = (int*)malloc(2*sizeof(int)); p[3] = 0;`
717  // it places a "Storing uninitialized value" note on the `malloc` call
718  // (which is technically true, but irrelevant).
719  // If trackExpressionValue() becomes reliable, it should be applied instead
720  // of this custom markPartsInteresting().
721  markPartsInteresting(*BR, ErrorState, Offset, IsTaintBug);
722  if (Extent)
723  markPartsInteresting(*BR, ErrorState, *Extent, IsTaintBug);
724 
725  C.emitReport(std::move(BR));
726 }
727 
728 bool ArrayBoundCheckerV2::isFromCtypeMacro(const Stmt *S, ASTContext &ACtx) {
729  SourceLocation Loc = S->getBeginLoc();
730  if (!Loc.isMacroID())
731  return false;
732 
733  StringRef MacroName = Lexer::getImmediateMacroName(
734  Loc, ACtx.getSourceManager(), ACtx.getLangOpts());
735 
736  if (MacroName.size() < 7 || MacroName[0] != 'i' || MacroName[1] != 's')
737  return false;
738 
739  return ((MacroName == "isalnum") || (MacroName == "isalpha") ||
740  (MacroName == "isblank") || (MacroName == "isdigit") ||
741  (MacroName == "isgraph") || (MacroName == "islower") ||
742  (MacroName == "isnctrl") || (MacroName == "isprint") ||
743  (MacroName == "ispunct") || (MacroName == "isspace") ||
744  (MacroName == "isupper") || (MacroName == "isxdigit"));
745 }
746 
747 bool ArrayBoundCheckerV2::isInAddressOf(const Stmt *S, ASTContext &ACtx) {
748  ParentMapContext &ParentCtx = ACtx.getParentMapContext();
749  do {
750  const DynTypedNodeList Parents = ParentCtx.getParents(*S);
751  if (Parents.empty())
752  return false;
753  S = Parents[0].get<Stmt>();
754  } while (isa_and_nonnull<ParenExpr, ImplicitCastExpr>(S));
755  const auto *UnaryOp = dyn_cast_or_null<UnaryOperator>(S);
756  return UnaryOp && UnaryOp->getOpcode() == UO_AddrOf;
757 }
758 
759 bool ArrayBoundCheckerV2::isIdiomaticPastTheEndPtr(const Expr *E,
761  NonLoc Offset, NonLoc Limit,
762  CheckerContext &C) {
763  if (isa<ArraySubscriptExpr>(E) && isInAddressOf(E, C.getASTContext())) {
764  auto [EqualsToThreshold, NotEqualToThreshold] = compareValueToThreshold(
765  State, Offset, Limit, C.getSValBuilder(), /*CheckEquality=*/true);
766  return EqualsToThreshold && !NotEqualToThreshold;
767  }
768  return false;
769 }
770 
771 void ento::registerArrayBoundCheckerV2(CheckerManager &mgr) {
772  mgr.registerChecker<ArrayBoundCheckerV2>();
773 }
774 
775 bool ento::shouldRegisterArrayBoundCheckerV2(const CheckerManager &mgr) {
776  return true;
777 }
static Messages getExceedsMsgs(ASTContext &ACtx, const SubRegion *Region, NonLoc Offset, NonLoc Extent, SVal Location, bool AlsoMentionUnderflow)
static Messages getTaintMsgs(const SubRegion *Region, const char *OffsetName, bool AlsoMentionUnderflow)
static bool isNegative(SValBuilder &SVB, ProgramStateRef State, NonLoc Value)
static std::string getRegionName(const SubRegion *Region)
static Messages getPrecedesMsgs(const SubRegion *Region, NonLoc Offset)
static bool isUnsigned(SValBuilder &SVB, NonLoc Value)
static std::optional< std::pair< const SubRegion *, NonLoc > > computeOffset(ProgramStateRef State, SValBuilder &SVB, SVal Location)
For a given Location that can be represented as a symbolic expression ArrIdx, return the parent memor...
static std::pair< NonLoc, nonloc::ConcreteInt > getSimplifiedOffsets(NonLoc offset, nonloc::ConcreteInt extent, SValBuilder &svalBuilder)
static std::pair< ProgramStateRef, ProgramStateRef > compareValueToThreshold(ProgramStateRef State, NonLoc Value, NonLoc Threshold, SValBuilder &SVB, bool CheckEquality=false)
static bool tryDividePair(std::optional< int64_t > &Val1, std::optional< int64_t > &Val2, int64_t Divisor)
Try to divide Val1 and Val2 (in place) by Divisor and return true if it can be performed (Divisor is ...
static std::optional< int64_t > getConcreteValue(NonLoc SV)
llvm::APSInt APSInt
unsigned Offset
Definition: Format.cpp:2978
LineState State
Holds long-lived AST nodes (such as types and decls) that can be referred to throughout the semantic ...
Definition: ASTContext.h:185
ParentMapContext & getParentMapContext()
Returns the dynamic AST node parent map context.
Definition: ASTContext.cpp:851
SourceManager & getSourceManager()
Definition: ASTContext.h:708
const LangOptions & getLangOpts() const
Definition: ASTContext.h:778
CharUnits getTypeSizeInChars(QualType T) const
Return the size of the specified (complete) type T, in characters.
ArraySubscriptExpr - [C99 6.5.2.1] Array Subscripting.
Definition: Expr.h:2716
QuantityType getQuantity() const
getQuantity - Get the raw integer representation of this quantity.
Definition: CharUnits.h:185
Container for either a single DynTypedNode or for an ArrayRef to DynTypedNode.
This represents one expression.
Definition: Expr.h:110
static StringRef getImmediateMacroName(SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts)
Retrieve the name of the immediate macro expansion.
Definition: Lexer.cpp:1060
MemberExpr - [C99 6.5.2.3] Structure and Union Members.
Definition: Expr.h:3224
Expr * getBase() const
Definition: Expr.h:3301
bool isArrow() const
Definition: Expr.h:3408
DynTypedNodeList getParents(const NodeT &Node)
Returns the parents of the given node (within the traversal scope).
A (possibly-)qualified type.
Definition: Type.h:940
static std::string getAsString(SplitQualType split, const PrintingPolicy &Policy)
Definition: Type.h:1327
Encodes a location in the source.
Stmt - This represents one statement.
Definition: Stmt.h:84
bool isIncompleteType(NamedDecl **Def=nullptr) const
Types are partitioned into 3 broad categories (C99 6.2.5p1): object types, function types,...
Definition: Type.cpp:2361
bool isUnsignedIntegerType() const
Return true if this is an integer type that is unsigned, according to C99 6.2.5p6 [which returns true...
Definition: Type.cpp:2195
UnaryOperator - This represents the unary-expression's (except sizeof and alignof),...
Definition: Expr.h:2235
Opcode getOpcode() const
Definition: Expr.h:2275
QualType getType() const
Definition: Value.cpp:234
A record of the "type" of an APSInt, used for conversions.
Definition: APSIntType.h:19
llvm::APSInt convert(const llvm::APSInt &Value) const LLVM_READONLY
Convert and return a new APSInt with the given value, but this type's bit width and signedness.
Definition: APSIntType.h:48
Template implementation for all binary symbolic expressions.
CHECKER * registerChecker(AT &&... Args)
Used to register checkers.
ElementRegion is used to represent both array elements and casts.
Definition: MemRegion.h:1194
QualType getElementType() const
Definition: MemRegion.h:1218
NonLoc getIndex() const
Definition: MemRegion.h:1214
MemRegion - The root abstract class for all memory regions.
Definition: MemRegion.h:96
LLVM_ATTRIBUTE_RETURNS_NONNULL const MemSpaceRegion * getMemorySpace() const
Definition: MemRegion.cpp:1317
LLVM_ATTRIBUTE_RETURNS_NONNULL const MemRegion * StripCasts(bool StripBaseAndDerivedCasts=true) const
Definition: MemRegion.cpp:1378
std::string getDescriptiveName(bool UseQuotes=true) const
Get descriptive name for memory region.
Definition: MemRegion.cpp:707
const RegionTy * getAs() const
Definition: MemRegion.h:1383
MemSpaceRegion - A memory region that represents a "memory space"; for example, the set of global var...
Definition: MemRegion.h:203
The tag upon which the TagVisitor reacts.
Definition: BugReporter.h:779
void markInteresting(SymbolRef sym, bugreporter::TrackingKind TKind=bugreporter::TrackingKind::Thorough)
Marks a symbol as interesting.
bool isInteresting(SymbolRef sym) const
virtual const llvm::APSInt * getMaxValue(ProgramStateRef state, SVal val)=0
Tries to get the maximal possible (integer) value of a given SVal.
ASTContext & getContext()
Definition: SValBuilder.h:148
NonLoc makeArrayIndex(uint64_t idx)
Definition: SValBuilder.h:284
nonloc::ConcreteInt makeIntVal(const IntegerLiteral *integer)
Definition: SValBuilder.h:290
QualType getArrayIndexType() const
Definition: SValBuilder.h:157
virtual SVal evalBinOpNN(ProgramStateRef state, BinaryOperator::Opcode op, NonLoc lhs, NonLoc rhs, QualType resultTy)=0
Create a new value which represents a binary expression with two non- location operands.
QualType getConditionType() const
Definition: SValBuilder.h:153
SVal - This represents a symbolic expression, which can be either an L-value or an R-value.
Definition: SVals.h:55
SymbolRef getAsSymbol(bool IncludeBaseRegions=false) const
If this SVal wraps a symbol return that SymbolRef.
Definition: SVals.cpp:104
const MemRegion * getAsRegion() const
Definition: SVals.cpp:120
std::optional< T > getAs() const
Convert to the specified SVal type, returning std::nullopt if this SVal is not of the desired type.
Definition: SVals.h:86
SubRegion - A region that subsets another larger region.
Definition: MemRegion.h:441
LLVM_ATTRIBUTE_RETURNS_NONNULL const MemRegion * getSuperRegion() const
Definition: MemRegion.h:454
Symbolic value.
Definition: SymExpr.h:30
llvm::iterator_range< symbol_iterator > symbols() const
Definition: SymExpr.h:87
Value representing integer constant.
Definition: SVals.h:297
const llvm::APSInt & getValue() const
Definition: SVals.h:301
Represents symbolic expression that isn't a location.
Definition: SVals.h:276
std::vector< SymbolRef > getTaintedSymbols(ProgramStateRef State, const Stmt *S, const LocationContext *LCtx, TaintTagType Kind=TaintTagGeneric)
Returns the tainted Symbols for a given Statement and state.
Definition: Taint.cpp:169
bool isTainted(ProgramStateRef State, const Stmt *S, const LocationContext *LCtx, TaintTagType Kind=TaintTagGeneric)
Check if the statement has a tainted value in the given state.
Definition: Taint.cpp:147
DefinedOrUnknownSVal getDynamicExtent(ProgramStateRef State, const MemRegion *MR, SValBuilder &SVB)
@ Full
This outputs the full clang module dependency graph suitable for use for explicitly building modules.
The JSON file list parser is used to communicate input to InstallAPI.
BinaryOperatorKind
const FunctionProtoType * T
long int64_t