Skip to content

sarif: initial implementation of csdiff fingerprints #168

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
May 3, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 34 additions & 1 deletion src/csgrep.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

#include "abstract-filter.hh"
#include "filter.hh"
#include "finger-print.hh"
#include "msg-filter.hh"
#include "parser.hh"
#include "parser-common.hh"
Expand Down Expand Up @@ -232,6 +233,36 @@ class ImpLevelFilter: public AbstractFilter {
}
};

class FingerPrintFilter: public AbstractFilter {
private:
const std::string hashPrefix_;

public:
FingerPrintFilter(AbstractWriter *agent, const std::string &hashPrefix):
AbstractFilter(agent),
hashPrefix_(hashPrefix)
{
}

protected:
bool matchDef(const Defect &def) override {
const FingerPrinter fp(def);
std::string hash = fp.getHash(FPV_CSDIFF_WITH_LINE_CONTENT);
if (hash.empty())
// fingerprint not available for this finding
return false;

const size_t prefixLen = hashPrefix_.size();
if (hash.size() < prefixLen)
// the prefix we are looking for is longer than the hash itself
return false;

// make size of the hash equal to size of the prefix and compare
hash.resize(prefixLen);
return (hashPrefix_ == hash);
}
};

class KeyEventPredicate: public IPredicate {
private:
const RE re_;
Expand Down Expand Up @@ -543,7 +574,8 @@ bool chainFilters(
return false;
}

return chainDecoratorIntArg<ImpLevelFilter>(pEng, vm, "imp-level");
return chainDecoratorGeneric<FingerPrintFilter>(pEng, vm, "hash-v1")
&& chainDecoratorIntArg<ImpLevelFilter> (pEng, vm, "imp-level");
}

int main(int argc, char *argv[])
Expand All @@ -565,6 +597,7 @@ int main(int argc, char *argv[])
("path", po::value<string>(), "defect matches if the path of its key event matches the given regex")
("event", po::value<string>(), "defect matches if its key event matches the given regex (each defect has exactly one key event, which determines its location in the code)")
("error", po::value<string>(), "defect matches if the message of its key event matches the given regex")
("hash-v1", po::value<string>(), "defect matches if its csdiff/v1 fingerprint starts with the given prefix")
("msg", po::value<string>(), "defect matches if any of its messages matches the given regex")
("tool", po::value<string>(), "defect matches if it was detected by tool that matches the given regex")
("annot", po::value<string>(), "defect matches if its annotation matches the given regex")
Expand Down
1 change: 1 addition & 0 deletions src/lib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ add_library(cs STATIC
cwe-name-lookup.cc
deflookup.cc
filter.cc
finger-print.cc
instream.cc
msg-filter.cc
parser.cc
Expand Down
105 changes: 77 additions & 28 deletions src/lib/deflookup.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,11 @@

#include "deflookup.hh"

#include "finger-print.hh"
#include "msg-filter.hh"
#include "parser.hh"

#include <cassert>
#include <map>

typedef std::vector<Defect> TDefList;
Expand All @@ -31,7 +33,7 @@ typedef std::map<std::string, TDefByEvt> TDefByFile;
typedef std::map<std::string, TDefByFile> TDefByChecker;

struct DefLookup::Private {
TDefByChecker stor;
TDefByChecker byChecker;
bool usePartialResults;
};

Expand Down Expand Up @@ -63,64 +65,111 @@ DefLookup::~DefLookup()

/// store a copy of `def` in the lookup structure, categorized by its checker
/// and by the filtered path, event name and filtered message of its key event
void DefLookup::hashDefect(const Defect &def)
{
TDefByFile &row = d->stor[def.checker];
// categorize by checker
TDefByFile &byPath = d->byChecker[def.checker];

// categorize by path
const DefEvent &evt = def.events[def.keyEventIdx];
const MsgFilter &filter = MsgFilter::inst();
TDefByEvt &col = row[filter.filterPath(evt.fileName)];
TDefByMsg &zCol = col[evt.event];
TDefList &cell = zCol[filter.filterMsg(evt.msg, def.checker)];
TDefByEvt &byEvt = byPath[filter.filterPath(evt.fileName)];

cell.push_back(def);
// categorize by key event and msg
TDefByMsg &byMsg = byEvt[evt.event];
TDefList &defList = byMsg[filter.filterMsg(evt.msg, def.checker)];

defList.push_back(def);
}

/// Match `lookFor` against the defects in `defList` and remove the match.
///
/// If line content is available for `lookFor`, the first defect in `defList`
/// with equal line content (white-spaces removed) is erased.  If every defect
/// in `defList` has line content and none of them equals, there is no match.
/// In all other cases the last defect of the list is removed as an
/// approximation.
///
/// @param defList  candidate defects; one element is removed on success
/// @param lookFor  the defect we are looking for
/// @return true if a defect was matched and removed, false otherwise
static bool defLookupCore(TDefList &defList, const Defect &lookFor)
{
    if (defList.empty())
        // nothing to match against; without this guard the fallback below
        // would operate on an empty list if the caller's assert() is disabled
        return false;

    // look by line content without spaces if available
    const std::string lineCont = FingerPrinter(lookFor).getLineContent();
    if (!lineCont.empty()) {
        bool fullLineContCoverage = true;

        for (auto it = defList.begin(); it != defList.end(); ++it) {
            const std::string lineContNow = FingerPrinter(*it).getLineContent();
            if (lineContNow.empty()) {
                // this candidate cannot be compared by line content
                fullLineContCoverage = false;
                continue;
            }

            if (lineCont == lineContNow) {
                // matched by line content without spaces
                defList.erase(it);
                return true;
            }
        }

        if (fullLineContCoverage)
            // we had line content for all lines but none of them matched
            return false;
    }

    // just remove an arbitrary one
    // TODO: add some other criteria in order to make the match more precise
    defList.pop_back();

    return true;
}

/// look for a stored defect that matches `def` by checker, filtered path,
/// key event and filtered message of the key event
///
/// On a match, one stored defect is removed from the store and true is
/// returned.  If partial results are not allowed and an "internal warning"
/// event is stored for the same checker and path, true is returned without
/// removing anything.  Otherwise false is returned.
bool DefLookup::lookup(const Defect &def)
{
// look for defect class
TDefByChecker::iterator iRow = d->stor.find(def.checker);
if (d->stor.end() == iRow)
TDefByChecker::iterator itByChecker = d->byChecker.find(def.checker);
if (d->byChecker.end() == itByChecker)
return false;

// simplify path
const MsgFilter &filter = MsgFilter::inst();
const DefEvent &evt = def.events[def.keyEventIdx];
const std::string path(filter.filterPath(evt.fileName));
const std::string path = filter.filterPath(evt.fileName);

// look for file name
TDefByFile &row = iRow->second;
TDefByFile::iterator iCol = row.find(path);
if (row.end() == iCol)
TDefByFile &byPath = itByChecker->second;
assert(!byPath.empty());
TDefByFile::iterator itByPath = byPath.find(path);
if (byPath.end() == itByPath)
return false;

TDefByEvt &col = iCol->second;
if (!d->usePartialResults && col.end() != col.find("internal warning"))
TDefByEvt &byEvt = itByPath->second;
assert(!byEvt.empty());
if (!d->usePartialResults && byEvt.end() != byEvt.find("internal warning"))
// if the analyzer produced an "internal warning" diagnostic message,
// we assume partial results, which cannot be reliably used for
// differential scan ==> pretend we found what we had been looking
// for, but do not remove anything from the store
return true;

// look by key event
TDefByEvt::iterator iZCol = col.find(evt.event);
if (col.end() == iZCol)
TDefByEvt::iterator itByEvent = byEvt.find(evt.event);
if (byEvt.end() == itByEvent)
return false;

// look by msg
TDefByMsg &zCol = iZCol->second;
TDefByMsg::iterator iCell = zCol.find(
filter.filterMsg(evt.msg, def.checker));
if (zCol.end() == iCell)
TDefByMsg &byMsg = itByEvent->second;
assert(!byMsg.empty());
const std::string msg = filter.filterMsg(evt.msg, def.checker);
TDefByMsg::iterator itByMsg = byMsg.find(msg);
if (byMsg.end() == itByMsg)
return false;

// FIXME: nasty over-approximation
TDefList &defs = iCell->second;
unsigned cnt = defs.size();
if (cnt)
// just remove an arbitrary one
defs.resize(cnt - 1);
else
// process the resulting list of defects sequentially
TDefList &defList = itByMsg->second;
assert(!defList.empty());
if (!defLookupCore(defList, def))
return false;

// TODO: add some other criteria in order to make the match more precise
// remove empty maps to speed up subsequent lookups
if (defList.empty()) {
byMsg.erase(itByMsg);
if (byMsg.empty()) {
byEvt.erase(itByEvent);
if (byEvt.empty()) {
byPath.erase(itByPath);
if (byPath.empty())
d->byChecker.erase(itByChecker);
}
}
}

// found!
return true;
}
139 changes: 139 additions & 0 deletions src/lib/finger-print.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
/*
* Copyright (C) 2024 Red Hat, Inc.
*
* This file is part of csdiff.
*
* csdiff is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* any later version.
*
* csdiff is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with csdiff. If not, see <http://www.gnu.org/licenses/>.
*/

#include "finger-print.hh"

#include "hash-util.hh"
#include "msg-filter.hh"
#include "parser-common.hh" // for parseInt()
#include "regex.hh"

#include <cassert>

#include <boost/uuid/name_generator.hpp> // for boost::uuids::detail::sha1

/// return the content of line `lineNumber` as embedded in the comment events
/// of `evts`, or an empty string if no such line is embedded
static std::string findLineContent(const int lineNumber, const TEvtList &evts)
{
    // line content for the key event as produced by `csgrep --embed-context`
    static const RE reLineCont = RE("^ *([0-9]+) *\\|-> (.*)$");
    assert(0 < lineNumber);

    for (const DefEvent &evt : evts) {
        // only comment events are examined
        if (evt.event != "#")
            continue;

        // the message has to look like a line content with a line number
        boost::smatch match;
        const bool isLineCont = boost::regex_match(evt.msg, match, reLineCont);
        if (isLineCont && parseInt(match[/* line number */ 1]) == lineNumber)
            // found! --> hand over the captured content
            return match[/* line content */ 2];
    }

    // not found
    return "";
}

/// store the line content of the key event of `def`, stripped of all
/// white-spaces, to `*pDst`; `*pDst` stays untouched if the key event has no
/// valid line number or if no line content was found for it
static void readLineContent(std::string *pDst, const Defect &def)
{
    const DefEvent &keyEvt = def.events[def.keyEventIdx];
    const bool hasValidLine = (0 < keyEvt.line);
    if (!hasValidLine)
        return;

    const std::string rawLine = findLineContent(keyEvt.line, def.events);
    if (!rawLine.empty()) {
        // drop every run of white-space characters
        static const RE reSpace = RE("\\s+");
        *pDst = boost::regex_replace(rawLine, reSpace, "");
    }
}

/// return SHA1 hash of `str` as hex-encoded string
static inline std::string computeHexSHA1(const std::string &str)
{
using boost::uuids::detail::sha1;
return hexHashStr<sha1>(str);
}

/// private data of FingerPrinter, filled by its constructor
struct FingerPrinter::Private {
// checker, file path, key event and filtered message joined by `sep`
std::string basicData;
// line content of the key event with white-spaces removed (empty if the
// key event has no valid line number or no line content was found)
std::string lineContent;
};

// separator used for hashing of data composed from multiple strings
static const std::string sep = "\n";

// TODO: consider lazy evaluation of basicData/lineContent
/// collect the data that fingerprints of `def` are computed from
FingerPrinter::FingerPrinter(const Defect &def):
    d(new Private)
{
    const DefEvent &keyEvt = def.events[def.keyEventIdx];

    // filter that csdiff uses to drop details insignificant for matching
    const MsgFilter &msgFilter = MsgFilter::inst();

    // basicData takes all that DefLookup::lookup() looks at, in this order:
    // checker, file path, key event, message (joined by the separator)
    std::string &data = d->basicData;
    data  = def.checker;
    data += sep;
    data += msgFilter.filterPath(keyEvt.fileName, /* forceFullPath */ true);
    data += sep;
    data += keyEvt.event;
    data += sep;
    data += msgFilter.filterMsg(keyEvt.msg, def.checker);

    // try to read line content without white-spaces
    readLineContent(&d->lineContent, def);
}

FingerPrinter::~FingerPrinter() = default;

// TODO: consider caching of SHA1 hashes for subsequent calls
/// return the hex-encoded SHA1 fingerprint in the requested version, or an
/// empty string if the data needed for that version is not available
std::string FingerPrinter::getHash(const EFingerPrintVer fpv) const
{
    // NOTE(review): the constructor in this file always puts the separators
    // into basicData, so this check does not seem to trigger -- confirm
    const std::string &basic = d->basicData;
    if (basic.empty())
        return "";

    if (FPV_CSDIFF != fpv) {
        assert(fpv == FPV_CSDIFF_WITH_LINE_CONTENT);

        // this version needs the line content on top of the basic data
        const std::string &line = d->lineContent;
        if (line.empty())
            return "";

        return computeHexSHA1(basic + sep + line);
    }

    // plain csdiff fingerprint: hash of the basic data only
    return computeHexSHA1(basic);
}

/// return a copy of the line content stored by the constructor (white-spaces
/// removed); empty if no line content was read for the defect
std::string FingerPrinter::getLineContent() const
{
return d->lineContent;
}
Loading