Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 60 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,66 @@ License Clarity Scoring Update
- Scoring Weight = -20


License Clarity Scoring Update
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

- We are moving away from the license clarity scoring defined by ClearlyDefined
in the license clarity score plugin. The previous license clarity scoring
logic was misleading: its stringent scoring criteria frequently produced a low
score when scanning packages. We now use more general criteria to get a sense
of what provenance information has been provided and whether there is a
conflict in licensing between the licenses declared in the top-level key files
and the licenses detected in the files under the top-level.

- The license clarity score is a value from 0-100 calculated by combining the
weighted values determined for each of the scoring elements:

- Declared license:

- When true, indicates that the software package licensing is documented at
top-level or well-known locations in the software project, typically in a
package manifest, NOTICE, LICENSE, COPYING or README file.
- Scoring Weight = 40

- Identification precision:

- Indicates how well the license statement(s) of the software identify known
licenses that can be designated by precise keys (identifiers) as provided in
a publicly available license list, such as the ScanCode LicenseDB, the SPDX
license list, the OSI license list, or a URL pointing to a specific license
text in a project or organization website.
- Scoring Weight = 40

- License texts:

- License texts are provided to support the declared license expression in
files such as a package manifest, NOTICE, LICENSE, COPYING or README.
- Scoring Weight = 10

- Declared copyright:

- When true, indicates that the software package copyright is documented at
top-level or well-known locations in the software project, typically in a
package manifest, NOTICE, LICENSE, COPYING or README file.
- Scoring Weight = 10

- Ambiguous compound licensing:

- When true, indicates that the software has a license declaration that
makes it difficult to construct a reliable license expression, such as in
the case of multiple licenses where the conjunctive versus disjunctive
relationship is not well defined.
- Scoring Weight = -10

- Conflicting license categories:

- When true, indicates the declared license expression of the software is in
the permissive category, but that other potentially conflicting categories,
such as copyleft and proprietary, have been detected in lower level code.
- Scoring Weight = -20


Outputs:
~~~~~~~~

Expand Down
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,7 @@ scancode_scan =
# module for details and doc.
scancode_post_scan =
summary = summarycode.summarizer:ScanSummary
summary2 = summarycode.summarizer2:ScanSummary
summary-keeping-details = summarycode.summarizer:ScanSummaryWithDetails
summary-key-files = summarycode.summarizer:ScanKeyFilesSummary
summary-by-facet = summarycode.summarizer:ScanByFacetSummary
Expand Down
5 changes: 1 addition & 4 deletions src/summarycode/classify.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,12 +110,9 @@ class FileClassifier(PreScanPlugin):
]

def is_enabled(self, classify, **kwargs):
return classify
return True

def process_codebase(self, codebase, classify, **kwargs):
if not classify:
return

# find the real root directory
real_root = codebase.lowest_common_parent()
if not real_root:
Expand Down
58 changes: 29 additions & 29 deletions src/summarycode/score.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,11 +48,9 @@ class LicenseClarityScore(PostScanPlugin):
"""
Compute a License clarity score at the codebase level.
"""
codebase_attributes = dict(license_clarity_score=Mapping(
help='Computed license clarity score as mapping containing the score '
'proper and each scoring elements.'))
codebase_attributes = dict(summary=attr.ib(default=attr.Factory(dict)))

sort_order = 110
sort_order = 5

options = [
PluggableCommandLineOption(('--license-clarity-score',),
Expand All @@ -66,14 +64,13 @@ class LicenseClarityScore(PostScanPlugin):
)
]

def is_enabled(self, license_clarity_score, **kwargs):
return license_clarity_score
def is_enabled(self, license_clarity_score, summary2, **kwargs):
return license_clarity_score or summary2

def process_codebase(self, codebase, license_clarity_score, **kwargs):
if TRACE:
logger_debug('LicenseClarityScore:process_codebase')
scoring_elements = compute_license_score(codebase)
codebase.attributes.license_clarity_score.update(scoring_elements)
compute_license_score(codebase)


def compute_license_score(codebase):
Expand Down Expand Up @@ -126,6 +123,7 @@ def compute_license_score(codebase):
scoring_elements = ScoringElements()
declared_licenses = get_field_values_from_codebase_resources(codebase, 'licenses', key_files_only=True)
declared_license_expressions = get_field_values_from_codebase_resources(codebase, 'license_expressions', key_files_only=True)
unique_declared_license_expressions = list(set(declared_license_expressions))
declared_license_categories = get_license_categories(declared_licenses)
copyrights = get_field_values_from_codebase_resources(codebase, 'copyrights', key_files_only=True)
other_licenses = get_field_values_from_codebase_resources(codebase, 'licenses', key_files_only=False)
Expand Down Expand Up @@ -155,14 +153,17 @@ def compute_license_score(codebase):
):
scoring_elements.score -= 20

scoring_elements.ambigous_compound_licensing = check_for_license_ambiguity(declared_license_expressions)
primary_license = get_primary_license(declared_license_expressions)
if (
scoring_elements.ambigous_compound_licensing
not primary_license
and scoring_elements.score > 0
):
scoring_elements.ambigous_compound_licensing = True
scoring_elements.score -= 10

return scoring_elements.to_dict()
codebase.attributes.summary['primary_license_expression'] = primary_license
codebase.attributes.summary['declared_license_expressions'] = unique_declared_license_expressions
codebase.attributes.summary['license_clarity_score'] = scoring_elements.to_dict()


@attr.s()
Expand Down Expand Up @@ -378,11 +379,10 @@ def group_license_expressions(unique_license_expressions):
unique_joined_expressions = []
seen_joined_expression = []
len_joined_expressions = len(joined_expressions)
for i, j in enumerate(joined_expressions):
starting_index = i + 1
if starting_index > len_joined_expressions:
for i, j in enumerate(joined_expressions, start=1):
if i > len_joined_expressions:
break
for j1 in joined_expressions[starting_index:]:
for j1 in joined_expressions[i:]:
if licensing.is_equivalent(j, j1):
if (
j not in unique_joined_expressions
Expand All @@ -394,21 +394,21 @@ def group_license_expressions(unique_license_expressions):
return unique_joined_expressions, single_expressions


def check_for_license_ambiguity(declared_license_expressions):
def get_primary_license(declared_license_expressions):
"""
License ambiguity is the situation where there is a license declaration that makes
it difficult to construct a reliable license expression, such as in the case
of multiple licenses where the conjunctive versus disjunctive relationship
is not well defined.
Return a primary license expression string from
`declared_license_expressions` or an empty string if a primary license
expression cannot be determined.

We determine if a list of `declared_license_expressions` has license ambiguity if
we cannot resolve the `declared_license_expressions` into one expression.
We determine if a list of `declared_license_expressions` has a primary
license if we can resolve the `declared_license_expressions` into one
expression.
"""
unique_declared_license_expressions = set(declared_license_expressions)
unique_declared_license_expressions = list(set(declared_license_expressions))
# If we only have a single unique license expression, then we do not have
# any ambiguity about the licensing
if len(unique_declared_license_expressions) == 1:
return False
return unique_declared_license_expressions[0]

unique_joined_expressions, single_expressions = group_license_expressions(
unique_declared_license_expressions
Expand All @@ -417,10 +417,10 @@ def check_for_license_ambiguity(declared_license_expressions):
if not unique_joined_expressions:
# If we do not have any joined expressions, but multiple single
# expressions remaining, then we have license ambiguity
if len(single_expressions) > 1:
return True
if len(single_expressions) == 1:
return single_expressions[0]
else:
return False
return ''

# Group single expressions to joined expressions to see if single
# expressions are accounted for in a joined expression
Expand All @@ -442,6 +442,6 @@ def check_for_license_ambiguity(declared_license_expressions):
# that have not been associated with a joined license expression, then we do
# not have any ambiguity about the license
if len(single_expressions_by_joined_expressions) == 1 and not not_in_joined_expressions:
return False
return next(iter(single_expressions_by_joined_expressions))
else:
return True
return ''
153 changes: 153 additions & 0 deletions src/summarycode/summarizer2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# ScanCode is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/nexB/scancode-toolkit for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#

from collections import Counter

import attr

from plugincode.post_scan import PostScanPlugin
from plugincode.post_scan import post_scan_impl
from commoncode.cliutils import PluggableCommandLineOption
from commoncode.cliutils import POST_SCAN_GROUP
from summarycode.utils import sorted_counter
from summarycode.utils import get_resource_summary
from summarycode.utils import set_resource_summary

# Tracing flags: set either one to True to emit debug traces on stdout.
TRACE = False
TRACE_LIGHT = False


def logger_debug(*args):
    """
    Do nothing. This is the default tracing function, used when both TRACE
    and TRACE_LIGHT are False.
    """


if TRACE or TRACE_LIGHT:
    import logging
    import sys

    logger = logging.getLogger(__name__)
    logging.basicConfig(stream=sys.stdout)
    logger.setLevel(logging.DEBUG)

    def logger_debug(*args):
        """
        Log ``args`` at the DEBUG level as one space-separated message.
        Strings are logged as-is; any other object is logged as its repr().
        """
        # Use a conditional expression rather than `cond and a or b`: the
        # latter falls through to repr() for an empty string and would log it
        # as "''" instead of as-is.
        return logger.debug(
            ' '.join(a if isinstance(a, str) else repr(a) for a in args)
        )

"""
Create summarized scan data.
"""


@post_scan_impl
class ScanSummary(PostScanPlugin):
    """
    Post-scan plugin that summarizes a scan at the codebase level.
    It is driven by the --summary2 command line option.
    """
    sort_order = 10

    # Codebase-level ``summary`` mapping, empty by default, that
    # summarize_codebase() updates.
    codebase_attributes = dict(
        summary=attr.ib(default=attr.Factory(dict)),
    )

    options = [
        PluggableCommandLineOption(
            ('--summary2',),
            is_flag=True,
            default=False,
            help='Summarize license, copyright and other scans at the codebase level.',
            help_group=POST_SCAN_GROUP,
        ),
    ]

    def is_enabled(self, summary2, **kwargs):
        """
        Return the value of the ``summary2`` option unchanged.
        """
        return summary2

    def process_codebase(self, codebase, summary2, **kwargs):
        """
        Summarize ``codebase`` without keeping per-resource details.
        The extra ``kwargs`` are passed through to summarize_codebase().
        """
        if TRACE_LIGHT:
            logger_debug('ScanSummary:process_codebase')

        summarize_codebase(codebase, keep_details=False, **kwargs)



def summarize_codebase(codebase, keep_details, **kwargs):
    """
    Compute resource-level summaries for every resource of ``codebase`` and
    copy the root resource summary into the codebase-level ``summary``
    attribute.

    Only the summarizers whose attribute exists on the root resource are run.
    When ``keep_details`` is True the root summary is read from the root
    ``summary`` attribute; otherwise it is read from the "summary" key of the
    root ``extra_data``. ``keep_details`` is also passed to each summarizer.
    """
    from summarycode.copyright_summary import holder_summarizer

    # (resource attribute name, summarizer function) pairs
    summarizers_by_attribute = (
        ('license_expressions', license_summarizer),
        ('holders', holder_summarizer),
    )

    root = codebase.root

    # keep only the summarizers whose attribute is present on the root
    summarizers = []
    for attribute, summarizer in summarizers_by_attribute:
        if hasattr(root, attribute):
            summarizers.append(summarizer)

    if TRACE:
        logger_debug('summarize_codebase with summarizers:', summarizers)

    # walk bottom-up: each summarizer is handed the resource and its direct
    # children
    for resource in codebase.walk(topdown=False):
        children = resource.children(codebase)

        for summarize in summarizers:
            _summary_data = summarize(resource, children, keep_details=keep_details)
            if TRACE:
                logger_debug(
                    'summary for:', resource.path,
                    'after summarizer:', summarize,
                    'is:', _summary_data,
                )

        codebase.save_resource(resource)

    # promote the root resource summary to the codebase level
    summary = root.summary if keep_details else root.extra_data.get('summary', {})
    codebase.attributes.summary.update(summary)

    if TRACE:
        logger_debug('codebase summary:', summary)


def license_summarizer(resource, children, keep_details=False):
    """
    Compute, store and return the license expressions summary of ``resource``.

    The summary combines the ``license_expressions`` of the resource itself
    with the license expression summaries of its direct ``children``. A file
    without any license expression is counted under the None value.

    The returned value is what sorted_counter() returns for the expression
    counts. It is also stored on the resource with set_resource_summary()
    under the "license_expressions" key.
    """
    LIC_EXP = 'license_expressions'

    # start from the expressions of this resource; a file with no detection
    # is counted too
    own_expressions = getattr(resource, LIC_EXP, [])
    if own_expressions or not resource.is_file:
        license_expressions = list(own_expressions)
    else:
        license_expressions = [None]

    # add the expressions already summarized for each direct child
    for child in children:
        child_summaries = get_resource_summary(child, key=LIC_EXP, as_attribute=keep_details)
        for entry in child_summaries or []:
            # TODO: review this: entries with a falsy value (such as the None
            # used for files with no detection) are not carried over.
            expression = entry.get('value')
            if not expression:
                continue
            license_expressions.extend([expression] * entry['count'])

    # summarize proper
    summarized = sorted_counter(summarize_licenses(license_expressions))
    set_resource_summary(resource, key=LIC_EXP, value=summarized, as_attribute=keep_details)
    return summarized


def summarize_licenses(license_expressions):
    """
    Return a Counter mapping each license expression of the
    ``license_expressions`` list to its number of occurrences.
    """
    # TODO: we could normalize and/or sort each license_expression before
    # summarization and consider other equivalence or containment checks
    expression_counts = Counter()
    expression_counts.update(license_expressions)
    return expression_counts