From 22b4226078b041a16bf05163347a66ab4dbcf3a5 Mon Sep 17 00:00:00 2001
From: Eric Brown <ericwb@users.noreply.github.com>
Date: Fri, 8 Mar 2024 07:19:40 -0800
Subject: [PATCH] Add a SARIF output formatter (#1113)

This commit adds a formatter that outputs JSON in a specific
SARIF format according to spec at [1].

This code is largely leveraged from an existing implementation
found here [2].

The SARIF format is very useful for integration into ecosystems such
as GitHub Actions.

[1] https://docs.oasis-open.org/sarif/sarif/v2.1.0/cs01/sarif-v2.1.0-cs01.html
[2] https://github.com/microsoft/bandit-sarif-formatter

Closes #646

Signed-off-by: Eric Brown <eric_wade_brown@yahoo.com>
---
 bandit/__init__.py                  |   1 +
 bandit/formatters/sarif.py          | 372 ++++++++++++++++++++++++++++
 doc/source/formatters/sarif.rst     |   5 +
 doc/source/man/bandit.rst           |   2 +-
 doc/source/start.rst                |   7 +
 setup.cfg                           |   4 +
 tests/unit/formatters/test_sarif.py | 139 +++++++++++
 tox.ini                             |   1 +
 8 files changed, 530 insertions(+), 1 deletion(-)
 create mode 100644 bandit/formatters/sarif.py
 create mode 100644 doc/source/formatters/sarif.rst
 create mode 100644 tests/unit/formatters/test_sarif.py

diff --git a/bandit/__init__.py b/bandit/__init__.py
index 75f863db2..7c7bf00a8 100644
--- a/bandit/__init__.py
+++ b/bandit/__init__.py
@@ -16,4 +16,5 @@
 from bandit.core.issue import *  # noqa
 from bandit.core.test_properties import *  # noqa
 
+__author__ = metadata.metadata("bandit")["Author"]
 __version__ = metadata.version("bandit")
diff --git a/bandit/formatters/sarif.py b/bandit/formatters/sarif.py
new file mode 100644
index 000000000..ce2f03b7b
--- /dev/null
+++ b/bandit/formatters/sarif.py
@@ -0,0 +1,372 @@
+# Copyright (c) Microsoft.  All Rights Reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Note: this code mostly incorporated from
+# https://github.com/microsoft/bandit-sarif-formatter
+#
+r"""
+===============
+SARIF formatter
+===============
+
+This formatter outputs the issues in SARIF formatted JSON.
+
+:Example:
+
+.. code-block:: javascript
+
+    {
+      "runs": [
+        {
+          "tool": {
+            "driver": {
+              "name": "Bandit",
+              "organization": "PyCQA",
+              "rules": [
+                {
+                  "id": "B101",
+                  "name": "assert_used",
+                  "properties": {
+                    "tags": [
+                      "security",
+                      "external/cwe/cwe-703"
+                    ],
+                    "precision": "high"
+                  },
+                  "helpUri": "https://bandit.readthedocs.io/en/1.7.8/plugins/b101_assert_used.html"
+                }
+              ],
+              "version": "1.7.8",
+              "semanticVersion": "1.7.8"
+            }
+          },
+          "invocations": [
+            {
+              "executionSuccessful": true,
+              "endTimeUtc": "2024-03-05T03:28:48Z"
+            }
+          ],
+          "properties": {
+            "metrics": {
+              "_totals": {
+                "loc": 1,
+                "nosec": 0,
+                "skipped_tests": 0,
+                "SEVERITY.UNDEFINED": 0,
+                "CONFIDENCE.UNDEFINED": 0,
+                "SEVERITY.LOW": 1,
+                "CONFIDENCE.LOW": 0,
+                "SEVERITY.MEDIUM": 0,
+                "CONFIDENCE.MEDIUM": 0,
+                "SEVERITY.HIGH": 0,
+                "CONFIDENCE.HIGH": 1
+              },
+              "./examples/assert.py": {
+                "loc": 1,
+                "nosec": 0,
+                "skipped_tests": 0,
+                "SEVERITY.UNDEFINED": 0,
+                "SEVERITY.LOW": 1,
+                "SEVERITY.MEDIUM": 0,
+                "SEVERITY.HIGH": 0,
+                "CONFIDENCE.UNDEFINED": 0,
+                "CONFIDENCE.LOW": 0,
+                "CONFIDENCE.MEDIUM": 0,
+                "CONFIDENCE.HIGH": 1
+              }
+            }
+          },
+          "results": [
+            {
+              "message": {
+                "text": "Use of assert detected. The enclosed code will be removed when compiling to optimised byte code."
+              },
+              "level": "note",
+              "locations": [
+                {
+                  "physicalLocation": {
+                    "region": {
+                      "snippet": {
+                        "text": "assert True\n"
+                      },
+                      "endColumn": 12,
+                      "endLine": 1,
+                      "startColumn": 1,
+                      "startLine": 1
+                    },
+                    "artifactLocation": {
+                      "uri": "examples/assert.py"
+                    },
+                    "contextRegion": {
+                      "snippet": {
+                        "text": "assert True\n"
+                      },
+                      "endLine": 1,
+                      "startLine": 1
+                    }
+                  }
+                }
+              ],
+              "properties": {
+                "issue_confidence": "HIGH",
+                "issue_severity": "LOW"
+              },
+              "ruleId": "B101",
+              "ruleIndex": 0
+            }
+          ]
+        }
+      ],
+      "version": "2.1.0",
+      "$schema": "https://json.schemastore.org/sarif-2.1.0.json"
+    }
+
+.. versionadded:: 1.7.8
+
+"""  # noqa: E501
+import logging
+import pathlib
+import sys
+import urllib.parse as urlparse
+from datetime import datetime
+
+import sarif_om as om
+from jschema_to_python.to_json import to_json
+
+import bandit
+from bandit.core import docs_utils
+
+LOG = logging.getLogger(__name__)
+SCHEMA_URI = "https://json.schemastore.org/sarif-2.1.0.json"
+SCHEMA_VER = "2.1.0"
+TS_FORMAT = "%Y-%m-%dT%H:%M:%SZ"
+
+
+def report(manager, fileobj, sev_level, conf_level, lines=-1):
+    """Prints issues in SARIF format
+
+    :param manager: the bandit manager object
+    :param fileobj: The output file object, which may be sys.stdout
+    :param sev_level: Filtering severity level
+    :param conf_level: Filtering confidence level
+    :param lines: Number of lines to report, -1 for all (currently unused)
+    """
+    from datetime import timezone
+
+    # datetime.utcnow() is deprecated; take an aware UTC timestamp instead.
+    end_time = datetime.now(timezone.utc).strftime(TS_FORMAT)
+    log = om.SarifLog(
+        schema_uri=SCHEMA_URI,
+        version=SCHEMA_VER,
+        runs=[
+            om.Run(
+                tool=om.Tool(
+                    driver=om.ToolComponent(
+                        name="Bandit",
+                        organization=bandit.__author__,
+                        semantic_version=bandit.__version__,
+                        version=bandit.__version__,
+                    )
+                ),
+                invocations=[
+                    om.Invocation(
+                        end_time_utc=end_time,
+                        execution_successful=True,
+                    )
+                ],
+                properties={"metrics": manager.metrics.data},
+            )
+        ],
+    )
+
+    run = log.runs[0]
+    invocation = run.invocations[0]
+    add_skipped_file_notifications(manager.get_skipped(), invocation)
+
+    issues = manager.get_issue_list(sev_level=sev_level, conf_level=conf_level)
+    add_results(issues, run)
+
+    serialized_log = to_json(log)
+    with fileobj:
+        fileobj.write(serialized_log)
+
+    # The name attribute is still available on the closed file object.
+    if fileobj.name != sys.stdout.name:
+        LOG.info("SARIF output written to file: %s", fileobj.name)
+
+
+def add_skipped_file_notifications(skips, invocation):
+    """Record each skipped file as an error notification on the invocation.
+
+    :param skips: sequence of (file name, reason) pairs, may be None
+    :param invocation: the om.Invocation receiving the notifications
+    """
+    if not skips:
+        return
+
+    if invocation.tool_configuration_notifications is None:
+        invocation.tool_configuration_notifications = []
+    notifications = invocation.tool_configuration_notifications
+
+    for file_name, reason in skips:
+        artifact = om.ArtifactLocation(uri=to_uri(file_name))
+        location = om.Location(
+            physical_location=om.PhysicalLocation(artifact_location=artifact)
+        )
+        notifications.append(
+            om.Notification(
+                level="error",
+                message=om.Message(text=reason),
+                locations=[location],
+            )
+        )
+
+
+def add_results(issues, run):
+    """Append a SARIF result per issue and register the rules they use."""
+    if run.results is None:
+        run.results = []
+
+    rules = {}
+    rule_indices = {}
+    run.results.extend(
+        create_result(issue, rules, rule_indices) for issue in issues
+    )
+    if rules:
+        run.tool.driver.rules = list(rules.values())
+
+
+def create_result(issue, rules, rule_indices):
+    """Build the om.Result for one issue, registering its rule on first use.
+
+    Both rules and rule_indices are updated in place for unseen rule ids.
+    """
+    info = issue.as_dict()
+    rule, rule_index = create_or_find_rule(info, rules, rule_indices)
+
+    artifact = om.ArtifactLocation(uri=to_uri(info["filename"]))
+    physical_location = om.PhysicalLocation(artifact_location=artifact)
+    add_region_and_context_region(
+        physical_location,
+        info["line_range"],
+        info["col_offset"],
+        info["end_col_offset"],
+        info["code"],
+    )
+
+    severity = info["issue_severity"]
+    return om.Result(
+        rule_id=rule.id,
+        rule_index=rule_index,
+        message=om.Message(text=info["issue_text"]),
+        level=level_from_severity(severity),
+        locations=[om.Location(physical_location=physical_location)],
+        properties={
+            "issue_confidence": info["issue_confidence"],
+            "issue_severity": severity,
+        },
+    )
+
+
+def level_from_severity(severity):
+    """Map a bandit severity name onto the matching SARIF result level."""
+    levels = {
+        "HIGH": "error",
+        "MEDIUM": "warning",
+        "LOW": "note",
+    }
+    # Anything unrecognised is reported as a plain warning.
+    return levels.get(severity, "warning")
+
+
+def add_region_and_context_region(
+    physical_location, line_range, col_offset, end_col_offset, code
+):
+    """Attach the issue region, plus a context region when code is given."""
+    snippet = None
+    if code:
+        first_line_number, snippet_lines = parse_code(code)
+        snippet_line = snippet_lines[line_range[0] - first_line_number]
+        snippet = om.ArtifactContent(text=snippet_line)
+
+    physical_location.region = om.Region(
+        start_line=line_range[0],
+        # line_range may list more than two lines; end on the last one.
+        end_line=line_range[-1],
+        start_column=col_offset + 1,
+        end_column=end_col_offset + 1,
+        snippet=snippet,
+    )
+    if code:
+        physical_location.context_region = om.Region(
+            start_line=first_line_number,
+            end_line=first_line_number + len(snippet_lines) - 1,
+            snippet=om.ArtifactContent(text="".join(snippet_lines)),
+        )
+
+
+def parse_code(code):
+    """Split numbered code text into its first line number and its lines.
+
+    Each line is split at its first space into a line number and its text.
+
+    :param code: the numbered code text
+    :returns: tuple of the first line number and the list of text lines
+    """
+    numbered_lines = code.split("\n")
+
+    # A trailing empty piece only means the text ended with a newline.
+    ends_with_newline = len(numbered_lines[-1]) == 0
+    if ends_with_newline:
+        del numbered_lines[-1]
+
+    first_line_number = 0
+    snippet_lines = []
+    for position, numbered_line in enumerate(numbered_lines):
+        pieces = numbered_line.split(" ", 1)
+        # Only the first line's number is kept.
+        if position == 0:
+            first_line_number = int(pieces[0])
+        snippet_lines.append(pieces[1] + "\n")
+
+    # Undo the newline added above when the text did not end with one.
+    if not ends_with_newline:
+        snippet_lines[-1] = snippet_lines[-1][:-1]
+
+    return first_line_number, snippet_lines
+
+
+def create_or_find_rule(issue_dict, rules, rule_indices):
+    """Return the rule and rule index for an issue, creating them if new.
+
+    New rules are stored in rules and rule_indices under the test id.
+    """
+    rule_id = issue_dict["test_id"]
+    if rule_id not in rules:
+        cwe_id = issue_dict["issue_cwe"].get("id")
+        rule = om.ReportingDescriptor(
+            id=rule_id,
+            name=issue_dict["test_name"],
+            help_uri=docs_utils.get_url(rule_id),
+            properties={
+                "tags": ["security", f"external/cwe/cwe-{cwe_id}"],
+                "precision": issue_dict["issue_confidence"].lower(),
+            },
+        )
+        # The index is the number of rules registered so far.
+        rule_indices[rule_id] = len(rules)
+        rules[rule_id] = rule
+
+    return rules[rule_id], rule_indices[rule_id]
+
+
+def to_uri(file_path):
+    """Return a file URI for absolute paths, else a %-encoded POSIX path."""
+    path = pathlib.PurePath(file_path)
+    if path.is_absolute():
+        return path.as_uri()
+
+    # Relative paths: backslashes become slashes, then special characters
+    # are %-encoded.
+    return urlparse.quote(path.as_posix())
diff --git a/doc/source/formatters/sarif.rst b/doc/source/formatters/sarif.rst
new file mode 100644
index 000000000..58b9633a7
--- /dev/null
+++ b/doc/source/formatters/sarif.rst
@@ -0,0 +1,5 @@
+-----
+sarif
+-----
+
+.. automodule:: bandit.formatters.sarif
diff --git a/doc/source/man/bandit.rst b/doc/source/man/bandit.rst
index 46125e613..eef10d271 100644
--- a/doc/source/man/bandit.rst
+++ b/doc/source/man/bandit.rst
@@ -44,7 +44,7 @@ OPTIONS
                         (-l for LOW, -ll for MEDIUM, -lll for HIGH)
   -i, --confidence      report only issues of a given confidence level or
                         higher (-i for LOW, -ii for MEDIUM, -iii for HIGH)
-  -f {csv,custom,html,json,screen,txt,xml,yaml}, --format {csv,custom,html,json,screen,txt,xml,yaml}
+  -f {csv,custom,html,json,sarif,screen,txt,xml,yaml}, --format {csv,custom,html,json,sarif,screen,txt,xml,yaml}
                         specify output format
   --msg-template MSG_TEMPLATE
                         specify output message template (only usable with
diff --git a/doc/source/start.rst b/doc/source/start.rst
index 069ec7108..cd8f3dadf 100644
--- a/doc/source/start.rst
+++ b/doc/source/start.rst
@@ -38,6 +38,13 @@ extras:
 
     pip install bandit[baseline]
 
+If you want to include SARIF output formatter support, install it with the
+`sarif` extras:
+
+.. code-block:: console
+
+    pip install bandit[sarif]
+
 Run Bandit:
 
 .. code-block:: console
diff --git a/setup.cfg b/setup.cfg
index 54d4096a2..2dbee597c 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -37,6 +37,9 @@ toml =
     tomli>=1.1.0; python_version < "3.11"
 baseline =
     GitPython>=3.1.30
+sarif =
+    sarif-om>=1.0.4
+    jschema-to-python>=1.2.3
 
 [entry_points]
 console_scripts =
@@ -52,6 +55,7 @@ bandit.formatters =
     txt = bandit.formatters.text:report
     xml = bandit.formatters.xml:report
     html = bandit.formatters.html:report
+    sarif = bandit.formatters.sarif:report
     screen = bandit.formatters.screen:report
     yaml = bandit.formatters.yaml:report
     custom = bandit.formatters.custom:report
diff --git a/tests/unit/formatters/test_sarif.py b/tests/unit/formatters/test_sarif.py
new file mode 100644
index 000000000..a5306fa81
--- /dev/null
+++ b/tests/unit/formatters/test_sarif.py
@@ -0,0 +1,139 @@
+# SPDX-License-Identifier: Apache-2.0
+import collections
+import json
+import tempfile
+from unittest import mock
+
+import testtools
+
+import bandit
+from bandit.core import config
+from bandit.core import constants
+from bandit.core import issue
+from bandit.core import manager
+from bandit.core import metrics
+from bandit.formatters import sarif
+
+
+class SarifFormatterTests(testtools.TestCase):
+    """Checks the SARIF formatter output for a single reported issue."""
+
+    def setUp(self):
+        import os
+
+        super().setUp()
+        conf = config.BanditConfig()
+        self.manager = manager.BanditManager(conf, "file")
+        # Close mkstemp()'s descriptor now; remove the file when the test ends.
+        tmp_fd, self.tmp_fname = tempfile.mkstemp()
+        os.close(tmp_fd)
+        self.addCleanup(os.remove, self.tmp_fname)
+        self.manager.out_file = self.tmp_fname
+        self.context = {
+            "filename": self.tmp_fname,
+            "lineno": 4,
+            "linerange": [4],
+            "code": (
+                "import socket\n\n"
+                "s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n"
+                "s.bind(('0.0.0.0', 31137))"
+            ),
+        }
+        self.check_name = "hardcoded_bind_all_interfaces"
+        self.issue = issue.Issue(
+            severity=bandit.MEDIUM,
+            cwe=issue.Cwe.MULTIPLE_BINDS,
+            confidence=bandit.MEDIUM,
+            text="Possible binding to all interfaces.",
+            test_id="B104",
+        )
+
+        # Positional arguments are severity, cwe, confidence, text.
+        self.candidates = [
+            issue.Issue(
+                bandit.LOW,
+                issue.Cwe.MULTIPLE_BINDS,
+                bandit.LOW,
+                "Candidate A",
+                lineno=1,
+            ),
+            issue.Issue(
+                bandit.HIGH,
+                issue.Cwe.MULTIPLE_BINDS,
+                bandit.HIGH,
+                "Candiate B",
+                lineno=2,
+            ),
+        ]
+
+        self.issue.fname = self.context["filename"]
+        self.issue.lineno = self.context["lineno"]
+        self.issue.linerange = self.context["linerange"]
+        self.issue.code = self.context["code"]
+        self.issue.test = self.check_name
+
+        self.manager.results.append(self.issue)
+        self.manager.metrics = metrics.Metrics()
+
+        # mock up the metrics
+        for key in ["_totals", "binding.py"]:
+            self.manager.metrics.data[key] = {"loc": 4, "nosec": 2}
+            for criteria, _default in constants.CRITERIA:
+                for rank in constants.RANKING:
+                    self.manager.metrics.data[key][f"{criteria}.{rank}"] = 0
+
+    @mock.patch("bandit.core.manager.BanditManager.get_issue_list")
+    def test_report(self, get_issue_list):
+        """The written report carries the tool, rule and result details."""
+        self.manager.files_list = ["binding.py"]
+        self.manager.scores = [
+            {
+                "SEVERITY": [0] * len(constants.RANKING),
+                "CONFIDENCE": [0] * len(constants.RANKING),
+            }
+        ]
+
+        get_issue_list.return_value = collections.OrderedDict(
+            [(self.issue, self.candidates)]
+        )
+
+        with open(self.tmp_fname, "w") as tmp_file:
+            sarif.report(
+                self.manager,
+                tmp_file,
+                self.issue.severity,
+                self.issue.confidence,
+            )
+
+        with open(self.tmp_fname) as f:
+            data = json.loads(f.read())
+            run = data["runs"][0]
+            self.assertEqual(sarif.SCHEMA_URI, data["$schema"])
+            self.assertEqual(sarif.SCHEMA_VER, data["version"])
+            driver = run["tool"]["driver"]
+            self.assertEqual("Bandit", driver["name"])
+            self.assertEqual(bandit.__author__, driver["organization"])
+            self.assertEqual(bandit.__version__, driver["semanticVersion"])
+            rule = driver["rules"][0]
+            self.assertEqual("B104", rule["id"])
+            self.assertEqual(self.check_name, rule["name"])
+            self.assertIn("security", rule["properties"]["tags"])
+            self.assertIn("external/cwe/cwe-605", rule["properties"]["tags"])
+            self.assertEqual("medium", rule["properties"]["precision"])
+            invocation = run["invocations"][0]
+            self.assertTrue(invocation["executionSuccessful"])
+            self.assertIsNotNone(invocation["endTimeUtc"])
+            result = run["results"][0]
+            # A "warning" level, as in this case, is left out of the
+            # serialized output because "warning" is the default value.
+            self.assertIsNone(result.get("level"))
+            self.assertEqual(self.issue.text, result["message"]["text"])
+            physicalLocation = result["locations"][0]["physicalLocation"]
+            region = physicalLocation["region"]
+            first_line = self.context["linerange"][0]
+            self.assertEqual(first_line, region["startLine"])
+            self.assertEqual(first_line, region["endLine"])
+            self.assertIn(
+                self.tmp_fname,
+                physicalLocation["artifactLocation"]["uri"],
+            )
diff --git a/tox.ini b/tox.ini
index 27b3d75e7..13e3458de 100644
--- a/tox.ini
+++ b/tox.ini
@@ -14,6 +14,7 @@ extras =
     yaml
     toml
     baseline
+    sarif
 commands =
     find bandit -type f -name "*.pyc" -delete
     stestr run {posargs}