Skip to content

Scancode: Fix false positive reported by scancode output analyser script #13745

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Oct 16, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -70,18 +70,18 @@ matrix:
| ( grep -v '^tools/test/toolchains/api_test.py' || true ) \
| while read file; do cp --parents "${file}" SCANCODE; done
- scancode -l --json-pp scancode.json SCANCODE
- python ./tools/test/travis-ci/scancode-evaluate.py -f scancode.json || true
- python ./tools/test/travis-ci/scancode-evaluate.py scancode.json || true
# run the same but for new files. All new files must have SPDX
- >-
git diff --name-only --diff-filter=A FETCH_HEAD..HEAD \
| ( grep '.\(c\|cpp\|h\|hpp\|py\)$' || true ) \
| ( grep -v '^tools/test/toolchains/api_test.py' || true ) \
| while read file; do cp --parents "${file}" SCANCODE_NEW_FILES; done
- scancode -l --json-pp scancode_new_files.json SCANCODE_NEW_FILES
- python ./tools/test/travis-ci/scancode-evaluate.py -f scancode_new_files.json || true
- python ./tools/test/travis-ci/scancode-evaluate.py scancode_new_files.json || true
- cat scancode-evaluate.log
- COUNT=$(cat scancode-evaluate.log | grep 'File:' | wc -l) || true
- python ./tools/test/travis-ci/scancode-evaluate.py -f scancode_new_files.json
- python ./tools/test/travis-ci/scancode-evaluate.py scancode_new_files.json
- cat scancode-evaluate.log
- COUNT_NEW_FILES=$(cat scancode-evaluate.log | grep 'File:' | wc -l) || true
- |
Expand Down
197 changes: 114 additions & 83 deletions tools/test/travis-ci/scancode-evaluate.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,118 +16,149 @@
limitations
"""

# Assumptions for this script:
# 1. directory_name is scanned directory.
# Files are copied to this directory with full tree. As result, if we find
# license offender, we can have full path (just scrape directory_name). We do this
# magic because scancode allows to scan directories/one file.
# 2. SPDX and license text is a must for all code files

import json
import argparse
import sys
import os.path
import json
import logging
import os.path
import re

userlog = logging.getLogger("scancode-evaluate")
userlog.setLevel(logging.INFO)
logfile = os.path.join(os.getcwd(), 'scancode-evaluate.log')
log_file_handler = logging.FileHandler(logfile, mode='w')
userlog.addHandler(log_file_handler)
import sys
from enum import Enum

MISSING_LICENSE_TEXT = "Missing license header"
MISSING_PERMISIVE_LICENSE_TEXT = "Non-permissive license"
MISSING_PERMISSIVE_LICENSE_TEXT = "Non-permissive license"
MISSING_SPDX_TEXT = "Missing SPDX license identifier"

def license_check(directory_name, file):
""" Check licenses in the scancode json file for specified directory
userlog = logging.getLogger("scancode-evaluate")

class ReturnCode(Enum):
    """Exit statuses used by this script."""

    # The license check reported zero offending files.
    SUCCESS = 0
    # The scancode output was missing or unusable, or offenders were found.
    ERROR = -1


def init_logger():
    """Initialise the logger.

    Sets ``userlog`` to the INFO level and attaches a file handler writing to
    ``scancode-evaluate.log`` in the current working directory. The file is
    opened in write mode, so content from a previous run is discarded.
    """
    userlog.setLevel(logging.INFO)
    log_path = os.path.join(os.getcwd(), 'scancode-evaluate.log')
    userlog.addHandler(logging.FileHandler(log_path, mode='w'))


def path_leaf(path):
    """Return the leaf (last component) of a path.

    A trailing separator is tolerated: ``"dir/sub/"`` yields ``"sub"``.

    Args:
        path: the path to take the leaf of

    Returns:
        The last component of ``path``; an empty string for an empty path.
    """
    parent, leaf = os.path.split(path)
    if leaf:
        return leaf
    # The path ended with a separator, so the name is the last part of the head
    return os.path.basename(parent)


def has_permissive_text_in_scancode_output(scancode_output_data_file_licenses):
    """Return True if at least one license in the scancode output is permissive.

    Args:
        scancode_output_data_file_licenses: iterable of license entries for
            one scanned file; each entry must provide a 'category' key

    Returns:
        True if any entry has the category 'Permissive', False otherwise
        (including when there are no entries).
    """
    return any(
        license_entry['category'] == 'Permissive'
        for license_entry in scancode_output_data_file_licenses
    )


def has_spdx_text_in_scancode_output(scancode_output_data_file_licenses):
    """Return True if at least one license in the scancode output has the spdx identifier.

    Args:
        scancode_output_data_file_licenses: iterable of license entries for
            one scanned file; each entry must provide
            ['matched_rule']['identifier']

    Returns:
        True if 'spdx' occurs in the matched rule identifier of any entry,
        False otherwise (including when there are no entries).
    """
    return any(
        'spdx' in license_entry['matched_rule']['identifier']
        for license_entry in scancode_output_data_file_licenses
    )


def has_spdx_text_in_analysed_file(scanned_file_content):
    """Return True if the file analysed by ScanCode contains an SPDX identifier.

    Args:
        scanned_file_content: text content of the scanned file

    Returns:
        True if the tag ``SPDX-License-Identifier`` occurs anywhere in the
        content, with or without a trailing colon; False otherwise.
    """
    # A plain substring test is enough: the trailing colon is optional, so it
    # never decides the outcome, and there is no need to collect every match.
    return "SPDX-License-Identifier" in scanned_file_content


def license_check(scancode_output_path):
"""Check licenses in the scancode json file for specified directory.

This function does not verify that the file exists; that should be done prior to the call.

Args:
directory_name - where scancode was run, used to scrape this from paths
file - scancode json output file (output from scancode --license --json-pp)
Args:
scancode_output_path: path to the scancode json output file (output from scancode --license --json-pp)

Returns:
Returns:
0 if nothing found
>0 - count how many license issues found
-1 if any error in file licenses found
ReturnCode.ERROR.value if any error in file licenses found
"""

offenders = []
try:
# find all licenses in the files, must be licensed and permissive
with open(file, 'r') as scancode_output:
results = json.load(scancode_output)
except ValueError:
userlog.warning("JSON could not be decoded")
return -1

try:
for file in results['files']:
license_offender = {}
license_offender['file'] = file
# ignore directory, not relevant here
if license_offender['file']['type'] == 'directory':
continue
if not license_offender['file']['licenses']:
license_offender['reason'] = MISSING_LICENSE_TEXT
offenders.append(license_offender)
with open(scancode_output_path, 'r') as read_file:
scancode_output_data = json.load(read_file)
except json.JSONDecodeError as jex:
userlog.warning("JSON could not be decoded, Invalid JSON in body: %s", jex)
return ReturnCode.ERROR.value

if 'files' not in scancode_output_data:
userlog.warning("Missing `files` attribute in %s" % (scancode_output_path))
return ReturnCode.ERROR.value

for scancode_output_data_file in scancode_output_data['files']:
if scancode_output_data_file['type'] != 'file':
continue

if not scancode_output_data_file['licenses']:
scancode_output_data_file['fail_reason'] = MISSING_LICENSE_TEXT
offenders.append(scancode_output_data_file)
# check the next file in the scancode output
continue

if not has_permissive_text_in_scancode_output(scancode_output_data_file['licenses']):
scancode_output_data_file['fail_reason'] = MISSING_PERMISSIVE_LICENSE_TEXT
offenders.append(scancode_output_data_file)

if not has_spdx_text_in_scancode_output(scancode_output_data_file['licenses']):
# Scancode does not recognize license notice in Python file headers.
# Issue: https://github.com/nexB/scancode-toolkit/issues/1913
# Therefore check if the file tested by ScanCode actually has a licence notice.
file_path = os.path.abspath(scancode_output_data_file['path'])
try:
with open(file_path, 'r') as read_file:
scanned_file_content = read_file.read()
except UnicodeDecodeError:
userlog.warning("Unable to look for SPDX text in `{}`:".format(file_path))
# Ignore files that cannot be decoded
# check the next file in the scancode output
continue

found_spdx = False
for i in range(len(license_offender['file']['licenses'])):
if license_offender['file']['licenses'][i]['category'] != 'Permissive':
license_offender['reason'] = MISSING_PERMISIVE_LICENSE_TEXT
offenders.append(license_offender)
# find SPDX, it shall be one of licenses found
if license_offender['file']['licenses'][i]['matched_rule']['identifier'].find("spdx") != -1:
found_spdx = True

if not found_spdx:
try:
# Issue reported here https://github.com/nexB/scancode-toolkit/issues/1913
# We verify here if SPDX is not really there as SPDX is part of the license text
# scancode has some problems detecting it properly
with open(os.path.join(os.path.abspath(license_offender['file']['path'])), 'r') as spdx_file_check:
filetext = spdx_file_check.read()
matches = re.findall("SPDX-License-Identifier:?", filetext)
if matches:
continue
license_offender['reason'] = MISSING_SPDX_TEXT
offenders.append(license_offender)
except UnicodeDecodeError:
# not valid file for license check
continue
except KeyError:
userlog.warning("Invalid scancode json file")
return -1
if not has_spdx_text_in_analysed_file(scanned_file_content):
scancode_output_data_file['fail_reason'] = MISSING_SPDX_TEXT
offenders.append(scancode_output_data_file)

if offenders:
userlog.warning("Found files with missing license details, please review and fix")
for offender in offenders:
userlog.warning("File: " + offender['file']['path'][len(directory_name):] + " " + "reason: " + offender['reason'])
userlog.warning("File: %s reason: %s" % (path_leaf(offender['path']), offender['fail_reason']))
return len(offenders)


def parse_args():
parser = argparse.ArgumentParser(
description="License check.")
parser.add_argument('-f', '--file',
help="scancode-toolkit output json file")
parser.add_argument('-d', '--directory_name', default="SCANCODE",
help='Directory name where are files being checked')
"""Parse command line arguments."""
parser = argparse.ArgumentParser(description="License check.")
parser.add_argument(
'scancode_output_path',
help="scancode-toolkit output json file"
)
return parser.parse_args()

if __name__ == "__main__":

if __name__ == "__main__":
init_logger()
args = parse_args()
if args.file and os.path.isfile(args.file):
count = license_check(args.directory_name, args.file)
if count == 0:
sys.exit(0)
else:
sys.exit(-1)
if os.path.isfile(args.scancode_output_path):
sys.exit(
ReturnCode.SUCCESS.value
if license_check(args.scancode_output_path) == 0
else ReturnCode.ERROR.value
)
else:
userlog.warning("Could not find the scancode json file")
sys.exit(-1)
sys.exit(ReturnCode.ERROR.value)
95 changes: 95 additions & 0 deletions tools/test/travis-ci/scancode_evaluate_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
#!/usr/bin/env python
# Copyright (c) 2020 Arm Limited and Contributors. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
import importlib
import os
import pytest

license_check = importlib.import_module("scancode-evaluate").license_check

# Directory holding the scancode JSON stubs, located next to this test module.
STUBS_PATH = os.path.join(
    os.path.abspath(os.path.dirname(__file__)), "scancode_test"
)

HEADER_WITHOUT_SPDX = "/* Copyright (C) Arm Limited, Inc - All Rights Reserved\
* Unauthorized copying of this. file, via any medium is strictly prohibited\
* Proprietary and confidential\
*/"

HEADER_WITH_SPDX = "/* mbed Microcontroller Library\
* Copyright (c) 2006-2013 ARM Limited\
*\
* SPDX-License-Identifier: Apache-2.0\
* Licensed under the Apache License, Version 2.0 (the \"License\");\
* you may not use this file except in compliance with the License.\
* You may obtain a copy of the License at\
*\
* http://www.apache.org/licenses/LICENSE-2.0\
*\
* Unless required by applicable law or agreed to in writing, software\
* distributed under the License is distributed on an \"AS IS\" BASIS,\
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\
* See the License for the specific language governing permissions and\
* limitations under the License.\
*/"

@pytest.fixture()
def create_scanned_files():
    """Create stub files for the duration of a test, then remove them.

    test3.h missing license notice
    test4.h with license notice
    test5.h with license notice
    """
    # Map each stub file to the header written into it.
    stub_contents = {
        os.path.join(STUBS_PATH, "test3.h"): HEADER_WITHOUT_SPDX,
        os.path.join(STUBS_PATH, "test4.h"): HEADER_WITH_SPDX,
        os.path.join(STUBS_PATH, "test5.h"): HEADER_WITH_SPDX,
    }
    for stub_path, header in stub_contents.items():
        with open(stub_path, "w") as stub_file:
            stub_file.write(header)
    # Hand control to the test; the code below runs as teardown.
    yield
    for stub_path in stub_contents:
        os.remove(stub_path)


class TestScancodeEvaluate:
    """Tests of `license_check` against the scancode JSON stubs in STUBS_PATH."""

    def test_missing_files_attribute(self):
        """ Missing `files` attribute in JSON.
            @inputs scancode_test/scancode_test_1.json
            @outputs -1
        """
        assert license_check(os.path.join(STUBS_PATH, "scancode_test_1.json")) == -1

    def test_various_combinations_permissive_license_with_spdx(self):
        """ Various combinations where at least one license in
            a file is permissive and has spdx in the match.identifier
            attribute.
            @inputs scancode_test/scancode_test_2.json
            @outputs 0
        """
        assert license_check(os.path.join(STUBS_PATH, "scancode_test_2.json")) == 0

    def test_missing_license_permissive_license_and_spdx(self, create_scanned_files):
        """ Test four files scanned with various issues.
            test.h: Missing license text (error count += 1)
            test3.h: Missing `Permissive` license text and `spdx` in match.identifier and not in file tested by ScanCode (error count += 2)
            test4.h: Missing `Permissive` license text and `spdx` in match.identifier but found in file tested by ScanCode (error count += 1)
            test5.h: Missing `spdx` in match.identifier but found in file tested by ScanCode. (error count += 0)
            @inputs scancode_test/scancode_test_3.json
            @outputs 4
        """
        assert license_check(os.path.join(STUBS_PATH, "scancode_test_3.json")) == 4

    def test_permissive_license_no_spdx(self, create_scanned_files):
        """ Multiple `Permissive` licenses in one file but none with `spdx` in
            match.identifier and not in file tested by ScanCode (error count += 1)
            @inputs scancode_test/scancode_test_4.json
            @outputs 1
        """
        assert license_check(os.path.join(STUBS_PATH, "scancode_test_4.json")) == 1
7 changes: 7 additions & 0 deletions tools/test/travis-ci/scancode_test/scancode_test_1.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"headers": [
{
"tool_name": "scancode test fail"
}
]
}
Loading