[analyzer] Switch to PostStmt callbacks in ArrayBoundV2 (#72107)

...instead of the currently used, more abstract Location callback. The
main advantage of this change is that after it the checker will check
`array[index].field` while the previous implementation ignored this
situation (because here the ElementRegion is wrapped in a FieldRegion
object). This improvement fixes PR #70187.

Note that after this change `&array[idx]` will be handled as an access
to the `idx`th element of `array`, which is technically incorrect but
matches the programmer intuitions. In my opinion it's more helpful if
the report points to the source location where the indexing happens
(instead of the location where a pointer is finally dereferenced).

As a special case, this change allows code that forms the past-the-end
pointer of an array as `&arr[size]` (but still rejects code like
`if (idx >= size) return &array[idx];` and code that dereferences a
past-the-end pointer).

In addition to this primary improvement, this change tweaks the message
for the tainted index/offset case (using the more concrete information
that's available now) and clarifies/improves a few testcases.

The main change of this commit (replacing `check::Location` with
`check::PostStmt<...>` callbacks) was already proposed in my change
https://reviews.llvm.org/D150446 and https://reviews.llvm.org/D159107 by
steakhal. Those reviews were both abandoned, but the problems that led
to abandonment were unrelated to the change that is introduced in this
PR.
This commit is contained in:
DonatNagyE 2023-12-05 16:17:35 +01:00 committed by GitHub
parent 9e4210faf2
commit dfdedaf6da
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 235 additions and 47 deletions

View File

@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "clang/AST/CharUnits.h"
#include "clang/AST/ParentMapContext.h"
#include "clang/StaticAnalyzer/Checkers/BuiltinCheckerRegistration.h"
#include "clang/StaticAnalyzer/Checkers/Taint.h"
#include "clang/StaticAnalyzer/Core/BugReporter/BugType.h"
@ -34,20 +35,46 @@ using llvm::formatv;
namespace {
enum OOB_Kind { OOB_Precedes, OOB_Exceeds, OOB_Taint };
class ArrayBoundCheckerV2 :
public Checker<check::Location> {
struct Messages {
std::string Short, Full;
};
// NOTE: The `ArraySubscriptExpr` and `UnaryOperator` callbacks are `PostStmt`
// instead of `PreStmt` because the current implementation passes the whole
// expression to `CheckerContext::getSVal()` which only works after the
// symbolic evaluation of the expression. (To turn them into `PreStmt`
// callbacks, we'd need to duplicate the logic that evaluates these
// expressions.) The `MemberExpr` callback would work as `PreStmt` but it's
// defined as `PostStmt` for the sake of consistency with the other callbacks.
class ArrayBoundCheckerV2 : public Checker<check::PostStmt<ArraySubscriptExpr>,
check::PostStmt<UnaryOperator>,
check::PostStmt<MemberExpr>> {
BugType BT{this, "Out-of-bound access"};
BugType TaintBT{this, "Out-of-bound access", categories::TaintedData};
void performCheck(const Expr *E, CheckerContext &C) const;
void reportOOB(CheckerContext &C, ProgramStateRef ErrorState, OOB_Kind Kind,
NonLoc Offset, std::string RegName, std::string Msg) const;
NonLoc Offset, Messages Msgs) const;
static bool isFromCtypeMacro(const Stmt *S, ASTContext &AC);
static bool isInAddressOf(const Stmt *S, ASTContext &AC);
public:
void checkLocation(SVal l, bool isLoad, const Stmt *S,
CheckerContext &C) const;
void checkPostStmt(const ArraySubscriptExpr *E, CheckerContext &C) const {
performCheck(E, C);
}
void checkPostStmt(const UnaryOperator *E, CheckerContext &C) const {
if (E->getOpcode() == UO_Deref)
performCheck(E, C);
}
void checkPostStmt(const MemberExpr *E, CheckerContext &C) const {
if (E->isArrow())
performCheck(E->getBase(), C);
}
};
} // anonymous namespace
/// For a given Location that can be represented as a symbolic expression
@ -149,9 +176,11 @@ getSimplifiedOffsets(NonLoc offset, nonloc::ConcreteInt extent,
// where the first one corresponds to "value below threshold" and the second
// corresponds to "value at or above threshold". Returns {nullptr, nullptr} in
// the case when the evaluation fails.
// If the optional argument CheckEquality is true, then use BO_EQ instead of
// the default BO_LT after consistently applying the same simplification steps.
static std::pair<ProgramStateRef, ProgramStateRef>
compareValueToThreshold(ProgramStateRef State, NonLoc Value, NonLoc Threshold,
SValBuilder &SVB) {
SValBuilder &SVB, bool CheckEquality = false) {
if (auto ConcreteThreshold = Threshold.getAs<nonloc::ConcreteInt>()) {
std::tie(Value, Threshold) = getSimplifiedOffsets(Value, *ConcreteThreshold, SVB);
}
@ -167,8 +196,10 @@ compareValueToThreshold(ProgramStateRef State, NonLoc Value, NonLoc Threshold,
return {nullptr, State};
}
}
const BinaryOperatorKind OpKind = CheckEquality ? BO_EQ : BO_LT;
auto BelowThreshold =
SVB.evalBinOpNN(State, BO_LT, Value, Threshold, SVB.getConditionType()).getAs<NonLoc>();
SVB.evalBinOpNN(State, OpKind, Value, Threshold, SVB.getConditionType())
.getAs<NonLoc>();
if (BelowThreshold)
return State->assume(*BelowThreshold);
@ -217,16 +248,19 @@ static std::string getShortMsg(OOB_Kind Kind, std::string RegName) {
return formatv(ShortMsgTemplates[Kind], RegName);
}
static std::string getPrecedesMsg(std::string RegName, NonLoc Offset) {
static Messages getPrecedesMsgs(const SubRegion *Region, NonLoc Offset) {
std::string RegName = getRegionName(Region);
SmallString<128> Buf;
llvm::raw_svector_ostream Out(Buf);
Out << "Access of " << RegName << " at negative byte offset";
if (auto ConcreteIdx = Offset.getAs<nonloc::ConcreteInt>())
Out << ' ' << ConcreteIdx->getValue();
return std::string(Buf);
return {getShortMsg(OOB_Precedes, RegName), std::string(Buf)};
}
static std::string getExceedsMsg(ASTContext &ACtx, std::string RegName,
NonLoc Offset, NonLoc Extent, SVal Location) {
static Messages getExceedsMsgs(ASTContext &ACtx, const SubRegion *Region,
NonLoc Offset, NonLoc Extent, SVal Location) {
std::string RegName = getRegionName(Region);
const auto *EReg = Location.getAsRegion()->getAs<ElementRegion>();
assert(EReg && "this checker only handles element access");
QualType ElemType = EReg->getElementType();
@ -273,20 +307,18 @@ static std::string getExceedsMsg(ASTContext &ACtx, std::string RegName,
Out << "s";
}
return std::string(Buf);
}
static std::string getTaintMsg(std::string RegName) {
SmallString<128> Buf;
llvm::raw_svector_ostream Out(Buf);
Out << "Access of " << RegName
<< " with a tainted offset that may be too large";
return std::string(Buf);
return {getShortMsg(OOB_Exceeds, RegName), std::string(Buf)};
}
void ArrayBoundCheckerV2::checkLocation(SVal Location, bool IsLoad,
const Stmt *LoadS,
CheckerContext &C) const {
static Messages getTaintMsgs(const SubRegion *Region, const char *OffsetName) {
std::string RegName = getRegionName(Region);
return {formatv("Potential out of bound access to {0} with tainted {1}",
RegName, OffsetName),
formatv("Access of {0} with a tainted {1} that may be too large",
RegName, OffsetName)};
}
void ArrayBoundCheckerV2::performCheck(const Expr *E, CheckerContext &C) const {
// NOTE: Instead of using ProgramState::assumeInBound(), we are prototyping
// some new logic here that reasons directly about memory region extents.
// Once that logic is more mature, we can bring it back to assumeInBound()
@ -297,12 +329,14 @@ void ArrayBoundCheckerV2::checkLocation(SVal Location, bool IsLoad,
// have some flexibility in defining the base region, we can achieve
// various levels of conservatism in our buffer overflow checking.
const SVal Location = C.getSVal(E);
// The header ctype.h (from e.g. glibc) implements the isXXXXX() macros as
// #define isXXXXX(arg) (LOOKUP_TABLE[arg] & BITMASK_FOR_XXXXX)
// and incomplete analysis of these leads to false positives. As even
// accurate reports would be confusing for the users, just disable reports
// from these macros:
if (isFromCtypeMacro(LoadS, C.getASTContext()))
if (isFromCtypeMacro(E, C.getASTContext()))
return;
ProgramStateRef State = C.getState();
@ -331,9 +365,8 @@ void ArrayBoundCheckerV2::checkLocation(SVal Location, bool IsLoad,
if (PrecedesLowerBound && !WithinLowerBound) {
// We know that the index definitely precedes the lower bound.
std::string RegName = getRegionName(Reg);
std::string Msg = getPrecedesMsg(RegName, ByteOffset);
reportOOB(C, PrecedesLowerBound, OOB_Precedes, ByteOffset, RegName, Msg);
Messages Msgs = getPrecedesMsgs(Reg, ByteOffset);
reportOOB(C, PrecedesLowerBound, OOB_Precedes, ByteOffset, Msgs);
return;
}
@ -350,17 +383,38 @@ void ArrayBoundCheckerV2::checkLocation(SVal Location, bool IsLoad,
if (ExceedsUpperBound) {
if (!WithinUpperBound) {
// We know that the index definitely exceeds the upper bound.
std::string RegName = getRegionName(Reg);
std::string Msg = getExceedsMsg(C.getASTContext(), RegName, ByteOffset,
*KnownSize, Location);
reportOOB(C, ExceedsUpperBound, OOB_Exceeds, ByteOffset, RegName, Msg);
if (isa<ArraySubscriptExpr>(E) && isInAddressOf(E, C.getASTContext())) {
// ...but this is within an addressof expression, so we need to check
// for the exceptional case that `&array[size]` is valid.
auto [EqualsToThreshold, NotEqualToThreshold] =
compareValueToThreshold(ExceedsUpperBound, ByteOffset, *KnownSize,
SVB, /*CheckEquality=*/true);
if (EqualsToThreshold && !NotEqualToThreshold) {
// We are definitely in the exceptional case, so return early
// instead of reporting a bug.
C.addTransition(EqualsToThreshold);
return;
}
}
Messages Msgs = getExceedsMsgs(C.getASTContext(), Reg, ByteOffset,
*KnownSize, Location);
reportOOB(C, ExceedsUpperBound, OOB_Exceeds, ByteOffset, Msgs);
return;
}
if (isTainted(State, ByteOffset)) {
// Both cases are possible, but the index is tainted, so report.
// Both cases are possible, but the offset is tainted, so report.
std::string RegName = getRegionName(Reg);
std::string Msg = getTaintMsg(RegName);
reportOOB(C, ExceedsUpperBound, OOB_Taint, ByteOffset, RegName, Msg);
// Diagnostic detail: "tainted offset" is always correct, but the
// common case is that 'idx' is tainted in 'arr[idx]' and then it's
// nicer to say "tainted index".
const char *OffsetName = "offset";
if (const auto *ASE = dyn_cast<ArraySubscriptExpr>(E))
if (isTainted(State, ASE->getIdx(), C.getLocationContext()))
OffsetName = "index";
Messages Msgs = getTaintMsgs(Reg, OffsetName);
reportOOB(C, ExceedsUpperBound, OOB_Taint, ByteOffset, Msgs);
return;
}
}
@ -374,17 +428,14 @@ void ArrayBoundCheckerV2::checkLocation(SVal Location, bool IsLoad,
void ArrayBoundCheckerV2::reportOOB(CheckerContext &C,
ProgramStateRef ErrorState, OOB_Kind Kind,
NonLoc Offset, std::string RegName,
std::string Msg) const {
NonLoc Offset, Messages Msgs) const {
ExplodedNode *ErrorNode = C.generateErrorNode(ErrorState);
if (!ErrorNode)
return;
std::string ShortMsg = getShortMsg(Kind, RegName);
auto BR = std::make_unique<PathSensitiveBugReport>(
Kind == OOB_Taint ? TaintBT : BT, ShortMsg, Msg, ErrorNode);
Kind == OOB_Taint ? TaintBT : BT, Msgs.Short, Msgs.Full, ErrorNode);
// Track back the propagation of taintedness.
if (Kind == OOB_Taint)
@ -413,6 +464,18 @@ bool ArrayBoundCheckerV2::isFromCtypeMacro(const Stmt *S, ASTContext &ACtx) {
(MacroName == "isupper") || (MacroName == "isxdigit"));
}
bool ArrayBoundCheckerV2::isInAddressOf(const Stmt *S, ASTContext &ACtx) {
ParentMapContext &ParentCtx = ACtx.getParentMapContext();
do {
const DynTypedNodeList Parents = ParentCtx.getParents(*S);
if (Parents.empty())
return false;
S = Parents[0].get<Stmt>();
} while (isa_and_nonnull<ParenExpr, ImplicitCastExpr>(S));
const auto *UnaryOp = dyn_cast_or_null<UnaryOperator>(S);
return UnaryOp && UnaryOp->getOpcode() == UO_AddrOf;
}
void ento::registerArrayBoundCheckerV2(CheckerManager &mgr) {
mgr.registerChecker<ArrayBoundCheckerV2>();
}

View File

@ -9,6 +9,14 @@ void arrayUnderflow(void) {
// expected-note@-2 {{Access of 'array' at negative byte offset -12}}
}
int underflowWithDeref(void) {
int *p = array;
--p;
return *p;
// expected-warning@-1 {{Out of bound access to memory preceding 'array'}}
// expected-note@-2 {{Access of 'array' at negative byte offset -4}}
}
int scanf(const char *restrict fmt, ...);
void taintedIndex(void) {
@ -17,6 +25,38 @@ void taintedIndex(void) {
// expected-note@-1 {{Taint originated here}}
// expected-note@-2 {{Taint propagated to the 2nd argument}}
array[index] = 5;
// expected-warning@-1 {{Potential out of bound access to 'array' with tainted index}}
// expected-note@-2 {{Access of 'array' with a tainted index that may be too large}}
}
int *taintedIndexAfterTheEndPtr(void) {
// NOTE: Technically speaking, this testcase does not trigger any UB because
// &array[10] is the after-the-end pointer which is well-defined; but this is
// a bug-prone situation and far from the idiomatic use of `&array[size]`, so
// it's better to report an error. This report can be easily silenced by
// writing array+index instead of &array[index].
int index;
scanf("%d", &index);
// expected-note@-1 {{Taint originated here}}
// expected-note@-2 {{Taint propagated to the 2nd argument}}
if (index < 0 || index > 10)
return array;
// expected-note@-2 {{Assuming 'index' is >= 0}}
// expected-note@-3 {{Left side of '||' is false}}
// expected-note@-4 {{Assuming 'index' is <= 10}}
// expected-note@-5 {{Taking false branch}}
return &array[index];
// expected-warning@-1 {{Potential out of bound access to 'array' with tainted index}}
// expected-note@-2 {{Access of 'array' with a tainted index that may be too large}}
}
void taintedOffset(void) {
int index;
scanf("%d", &index);
// expected-note@-1 {{Taint originated here}}
// expected-note@-2 {{Taint propagated to the 2nd argument}}
int *p = array + index;
p[0] = 5;
// expected-warning@-1 {{Potential out of bound access to 'array' with tainted offset}}
// expected-note@-2 {{Access of 'array' with a tainted offset that may be too large}}
}
@ -27,6 +67,47 @@ void arrayOverflow(void) {
// expected-note@-2 {{Access of 'array' at index 12, while it holds only 10 'int' elements}}
}
void flippedOverflow(void) {
12[array] = 5;
// expected-warning@-1 {{Out of bound access to memory after the end of 'array'}}
// expected-note@-2 {{Access of 'array' at index 12, while it holds only 10 'int' elements}}
}
int *afterTheEndPtr(void) {
// This is an unusual but standard-compliant way of writing (array + 10).
return &array[10]; // no-warning
}
int useAfterTheEndPtr(void) {
// ... but dereferencing the after-the-end pointer is still invalid.
return *afterTheEndPtr();
// expected-warning@-1 {{Out of bound access to memory after the end of 'array'}}
// expected-note@-2 {{Access of 'array' at index 10, while it holds only 10 'int' elements}}
}
int *afterAfterTheEndPtr(void) {
// This is UB, it's invalid to form an after-after-the-end pointer.
return &array[11];
// expected-warning@-1 {{Out of bound access to memory after the end of 'array'}}
// expected-note@-2 {{Access of 'array' at index 11, while it holds only 10 'int' elements}}
}
int *potentialAfterTheEndPtr(int idx) {
if (idx < 10) { /* ...do something... */ }
// expected-note@-1 {{Assuming 'idx' is >= 10}}
// expected-note@-2 {{Taking false branch}}
return &array[idx];
// expected-warning@-1 {{Out of bound access to memory after the end of 'array'}}
// expected-note@-2 {{Access of 'array' at an overflowing index, while it holds only 10 'int' elements}}
// NOTE: On the idx >= 10 branch the normal "optimistic" behavior would've
// been continuing with the assumption that idx == 10 and the return value is
// a legitimate after-the-end pointer. The checker deviates from this by
// reporting an error because this situation is very suspicious and far from
// the idiomatic `&array[size]` expressions. If the report is FP, the
// developer can easily silence it by writing array+idx instead of
// &array[idx].
}
int scalar;
int scalarOverflow(void) {
return (&scalar)[1];
@ -41,12 +122,6 @@ int oneElementArrayOverflow(void) {
// expected-note@-2 {{Access of 'oneElementArray' at index 1, while it holds only a single 'int' element}}
}
short convertedArray(void) {
return ((short*)array)[47];
// expected-warning@-1 {{Out of bound access to memory after the end of 'array'}}
// expected-note@-2 {{Access of 'array' at index 47, while it holds only 20 'short' elements}}
}
struct vec {
int len;
double elems[64];
@ -64,6 +139,28 @@ double arrayInStructPtr(struct vec *pv) {
// expected-note@-2 {{Access of the field 'elems' at index 64, while it holds only 64 'double' elements}}
}
struct item {
int a, b;
} itemArray[20] = {0};
int arrayOfStructs(void) {
return itemArray[35].a;
// expected-warning@-1 {{Out of bound access to memory after the end of 'itemArray'}}
// expected-note@-2 {{Access of 'itemArray' at index 35, while it holds only 20 'struct item' elements}}
}
int arrayOfStructsArrow(void) {
return (itemArray + 35)->b;
// expected-warning@-1 {{Out of bound access to memory after the end of 'itemArray'}}
// expected-note@-2 {{Access of 'itemArray' at index 35, while it holds only 20 'struct item' elements}}
}
short convertedArray(void) {
return ((short*)array)[47];
// expected-warning@-1 {{Out of bound access to memory after the end of 'array'}}
// expected-note@-2 {{Access of 'array' at index 47, while it holds only 20 'short' elements}}
}
struct two_bytes {
char lo, hi;
};
@ -85,7 +182,9 @@ int intFromString(void) {
}
int intFromStringDivisible(void) {
// However, this is reported with indices/elements, because the extent happens to be a multiple of 4.
// However, this is reported with indices/elements, because the extent
// (of the string that consists of 'a', 'b', 'c' and '\0') happens to be a
// multiple of 4 bytes (= sizeof(int)).
return ((const int*)"abc")[20];
// expected-warning@-1 {{Out of bound access to memory after the end of the string literal}}
// expected-note@-2 {{Access of the string literal at index 20, while it holds only a single 'int' element}}

View File

@ -154,3 +154,29 @@ void test_dynamic_size2(unsigned m,unsigned n){
unsigned *U = nullptr;
U = new unsigned[m + n + 1];
}
//Test creating invalid references, which break the invariant that a reference
//is always holding a value, and could lead to nasty runtime errors.
//(This is not related to operator new, but placed in this file because the
//other test files are not C++.)
int array[10] = {0};
void test_after_the_end_reference() {
int &ref = array[10]; // expected-warning{{Out of bound access to memory}}
}
void test_after_after_the_end_reference() {
int &ref = array[11]; // expected-warning{{Out of bound access to memory}}
}
int test_reference_that_might_be_after_the_end(int idx) {
// This TC produces no warning because separate analysis of (idx == 10) is
// only introduced _after_ the creation of the reference ref.
if (idx < 0 || idx > 10)
return -2;
int &ref = array[idx];
if (idx == 10)
return -1;
return ref;
}

View File

@ -29,8 +29,8 @@ int taintDiagnosticOutOfBound(void) {
int Array[] = {1, 2, 3, 4, 5};
scanf("%d", &index); // expected-note {{Taint originated here}}
// expected-note@-1 {{Taint propagated to the 2nd argument}}
return Array[index]; // expected-warning {{Potential out of bound access to 'Array' with tainted offset}}
// expected-note@-1 {{Access of 'Array' with a tainted offset that may be too large}}
return Array[index]; // expected-warning {{Potential out of bound access to 'Array' with tainted index}}
// expected-note@-1 {{Access of 'Array' with a tainted index that may be too large}}
}
int taintDiagnosticDivZero(int operand) {