Skip to content

Commit f6e357f

Browse files
committed
Merge remote-tracking branch 'upstream/2842-primary-license-in-summary' into release-31
2 parents 15cbff3 + 0bbad82 commit f6e357f

File tree

5 files changed

+244
-33
lines changed

5 files changed

+244
-33
lines changed

CHANGELOG.rst

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,66 @@ License Clarity Scoring Update
222222
- Scoring Weight = -20
223223

224224

225+
License Clarity Scoring Update
226+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
227+
228+
- We are moving away from the license clarity scoring defined by ClearlyDefined
229+
in the license clarity score plugin. The previous license clarity scoring
230+
logic produced a score that was misleading, where it would return a low score
231+
when scanning packages due to the stringent scoring criteria. We are now
232+
using more general criteria to get a sense of what provenance information has
233+
been provided and whether or not there is a conflict in licensing between
234+
what licenses were declared at the top-level key files and what licenses have
235+
been detected in the files under the top level.
236+
237+
- The license clarity score is a value from 0-100 calculated by combining the
238+
weighted values determined for each of the scoring elements:
239+
240+
- Declared license:
241+
242+
- When true, indicates that the software package licensing is documented at
243+
top-level or well-known locations in the software project, typically in a
244+
package manifest, NOTICE, LICENSE, COPYING or README file.
245+
- Scoring Weight = 40
246+
247+
- Identification precision:
248+
249+
- Indicates how well the license statement(s) of the software identify known
250+
licenses that can be designated by precise keys (identifiers) as provided in
251+
a publicly available license list, such as the ScanCode LicenseDB, the SPDX
252+
license list, the OSI license list, or a URL pointing to a specific license
253+
text in a project or organization website.
254+
- Scoring Weight = 40
255+
256+
- License texts:
257+
258+
- License texts are provided to support the declared license expression in
259+
files such as a package manifest, NOTICE, LICENSE, COPYING or README.
260+
- Scoring Weight = 10
261+
262+
- Declared copyright:
263+
264+
- When true, indicates that the software package copyright is documented at
265+
top-level or well-known locations in the software project, typically in a
266+
package manifest, NOTICE, LICENSE, COPYING or README file.
267+
- Scoring Weight = 10
268+
269+
- Ambiguous compound licensing:
270+
271+
- When true, indicates that the software has a license declaration that
272+
makes it difficult to construct a reliable license expression, such as in
273+
the case of multiple licenses where the conjunctive versus disjunctive
274+
relationship is not well defined.
275+
- Scoring Weight = -10
276+
277+
- Conflicting license categories:
278+
279+
- When true, indicates the declared license expression of the software is in
280+
the permissive category, but that other potentially conflicting categories,
281+
such as copyleft and proprietary, have been detected in lower-level code.
282+
- Scoring Weight = -20
283+
284+
225285
Outputs:
226286
~~~~~~~~
227287

setup.cfg

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,7 @@ scancode_scan =
176176
# module for details and doc.
177177
scancode_post_scan =
178178
summary = summarycode.summarizer:ScanSummary
179+
summary2 = summarycode.summarizer2:ScanSummary
179180
summary-keeping-details = summarycode.summarizer:ScanSummaryWithDetails
180181
summary-key-files = summarycode.summarizer:ScanKeyFilesSummary
181182
summary-by-facet = summarycode.summarizer:ScanByFacetSummary

src/summarycode/classify.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -110,12 +110,9 @@ class FileClassifier(PreScanPlugin):
110110
]
111111

112112
def is_enabled(self, classify, **kwargs):
113-
return classify
113+
return True
114114

115115
def process_codebase(self, codebase, classify, **kwargs):
116-
if not classify:
117-
return
118-
119116
# find the real root directory
120117
real_root = codebase.lowest_common_parent()
121118
if not real_root:

src/summarycode/score.py

Lines changed: 29 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -48,11 +48,9 @@ class LicenseClarityScore(PostScanPlugin):
4848
"""
4949
Compute a License clarity score at the codebase level.
5050
"""
51-
codebase_attributes = dict(license_clarity_score=Mapping(
52-
help='Computed license clarity score as mapping containing the score '
53-
'proper and each scoring elements.'))
51+
codebase_attributes = dict(summary=attr.ib(default=attr.Factory(dict)))
5452

55-
sort_order = 110
53+
sort_order = 5
5654

5755
options = [
5856
PluggableCommandLineOption(('--license-clarity-score',),
@@ -66,14 +64,13 @@ class LicenseClarityScore(PostScanPlugin):
6664
)
6765
]
6866

69-
def is_enabled(self, license_clarity_score, **kwargs):
70-
return license_clarity_score
67+
def is_enabled(self, license_clarity_score, summary2, **kwargs):
68+
return license_clarity_score or summary2
7169

7270
def process_codebase(self, codebase, license_clarity_score, **kwargs):
7371
if TRACE:
7472
logger_debug('LicenseClarityScore:process_codebase')
75-
scoring_elements = compute_license_score(codebase)
76-
codebase.attributes.license_clarity_score.update(scoring_elements)
73+
compute_license_score(codebase)
7774

7875

7976
def compute_license_score(codebase):
@@ -126,6 +123,7 @@ def compute_license_score(codebase):
126123
scoring_elements = ScoringElements()
127124
declared_licenses = get_field_values_from_codebase_resources(codebase, 'licenses', key_files_only=True)
128125
declared_license_expressions = get_field_values_from_codebase_resources(codebase, 'license_expressions', key_files_only=True)
126+
unique_declared_license_expressions = list(set(declared_license_expressions))
129127
declared_license_categories = get_license_categories(declared_licenses)
130128
copyrights = get_field_values_from_codebase_resources(codebase, 'copyrights', key_files_only=True)
131129
other_licenses = get_field_values_from_codebase_resources(codebase, 'licenses', key_files_only=False)
@@ -155,14 +153,17 @@ def compute_license_score(codebase):
155153
):
156154
scoring_elements.score -= 20
157155

158-
scoring_elements.ambigous_compound_licensing = check_for_license_ambiguity(declared_license_expressions)
156+
primary_license = get_primary_license(declared_license_expressions)
159157
if (
160-
scoring_elements.ambigous_compound_licensing
158+
not primary_license
161159
and scoring_elements.score > 0
162160
):
161+
scoring_elements.ambigous_compound_licensing = True
163162
scoring_elements.score -= 10
164163

165-
return scoring_elements.to_dict()
164+
codebase.attributes.summary['primary_license_expression'] = primary_license
165+
codebase.attributes.summary['declared_license_expressions'] = unique_declared_license_expressions
166+
codebase.attributes.summary['license_clarity_score'] = scoring_elements.to_dict()
166167

167168

168169
@attr.s()
@@ -378,11 +379,10 @@ def group_license_expressions(unique_license_expressions):
378379
unique_joined_expressions = []
379380
seen_joined_expression = []
380381
len_joined_expressions = len(joined_expressions)
381-
for i, j in enumerate(joined_expressions):
382-
starting_index = i + 1
383-
if starting_index > len_joined_expressions:
382+
for i, j in enumerate(joined_expressions, start=1):
383+
if i > len_joined_expressions:
384384
break
385-
for j1 in joined_expressions[starting_index:]:
385+
for j1 in joined_expressions[i:]:
386386
if licensing.is_equivalent(j, j1):
387387
if (
388388
j not in unique_joined_expressions
@@ -394,21 +394,21 @@ def group_license_expressions(unique_license_expressions):
394394
return unique_joined_expressions, single_expressions
395395

396396

397-
def check_for_license_ambiguity(declared_license_expressions):
397+
def get_primary_license(declared_license_expressions):
398398
"""
399-
License ambiguity is the situation where there is a license declaration that makes
400-
it difficult to construct a reliable license expression, such as in the case
401-
of multiple licenses where the conjunctive versus disjunctive relationship
402-
is not well defined.
399+
Return a primary license expression string from
400+
`declared_license_expressions` or an empty string if a primary license
401+
expression cannot be determined.
403402
404-
We determine if a list of `declared_license_expressions` has license ambiguity if
405-
we cannot resolve the `declared_license_expressions` into one expression.
403+
We determine if a list of `declared_license_expressions` has a primary
404+
license if we can resolve the `declared_license_expressions` into one
405+
expression.
406406
"""
407-
unique_declared_license_expressions = set(declared_license_expressions)
407+
unique_declared_license_expressions = list(set(declared_license_expressions))
408408
# If we only have a single unique license expression, then we do not have
409409
# any ambiguity about the licensing
410410
if len(unique_declared_license_expressions) == 1:
411-
return False
411+
return unique_declared_license_expressions[0]
412412

413413
unique_joined_expressions, single_expressions = group_license_expressions(
414414
unique_declared_license_expressions
@@ -417,10 +417,10 @@ def check_for_license_ambiguity(declared_license_expressions):
417417
if not unique_joined_expressions:
418418
# If we do not have any joined expressions, but multiple single
419419
# expressions remaining, then we have license ambiguity
420-
if len(single_expressions) > 1:
421-
return True
420+
if len(single_expressions) == 1:
421+
return single_expressions[0]
422422
else:
423-
return False
423+
return ''
424424

425425
# Group single expressions to joined expressions to see if single
426426
# expressions are accounted for in a joined expression
@@ -442,6 +442,6 @@ def check_for_license_ambiguity(declared_license_expressions):
442442
# that have not been associated with a joined license expression, then we do
443443
# not have any ambiguity about the license
444444
if len(single_expressions_by_joined_expressions) == 1 and not not_in_joined_expressions:
445-
return False
445+
return next(iter(single_expressions_by_joined_expressions))
446446
else:
447-
return True
447+
return ''

src/summarycode/summarizer2.py

Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
1+
#
2+
# Copyright (c) nexB Inc. and others. All rights reserved.
3+
# ScanCode is a trademark of nexB Inc.
4+
# SPDX-License-Identifier: Apache-2.0
5+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
6+
# See https://github.com/nexB/scancode-toolkit for support or download.
7+
# See https://aboutcode.org for more information about nexB OSS projects.
8+
#
9+
10+
from collections import Counter
11+
12+
import attr
13+
14+
from plugincode.post_scan import PostScanPlugin
15+
from plugincode.post_scan import post_scan_impl
16+
from commoncode.cliutils import PluggableCommandLineOption
17+
from commoncode.cliutils import POST_SCAN_GROUP
18+
from summarycode.utils import sorted_counter
19+
from summarycode.utils import get_resource_summary
20+
from summarycode.utils import set_resource_summary
21+
22+
# Tracing flags
TRACE = False
TRACE_LIGHT = False


def logger_debug(*args):
    """
    Do nothing. This no-op is replaced by a real logging function below when
    either the TRACE or TRACE_LIGHT flag is set.
    """
    pass


if TRACE or TRACE_LIGHT:
    import logging
    import sys

    logger = logging.getLogger(__name__)
    logging.basicConfig(stream=sys.stdout)
    logger.setLevel(logging.DEBUG)

    def logger_debug(*args):
        """
        Log `args` joined with spaces at the DEBUG level. Strings are used
        as-is and any other object is rendered with repr().
        """
        # A conditional expression is used rather than `x and a or b` so that
        # an empty string argument stays empty instead of becoming "''".
        return logger.debug(
            ' '.join(a if isinstance(a, str) else repr(a) for a in args)
        )
41+
42+
"""
43+
Create summarized scan data.
44+
"""
45+
46+
47+
@post_scan_impl
class ScanSummary(PostScanPlugin):
    """
    Post-scan plugin that summarizes a scan at the codebase level.
    """
    # Codebase-level `summary` attribute; each codebase gets its own new dict.
    codebase_attributes = dict(summary=attr.ib(default=attr.Factory(dict)))

    sort_order = 10

    options = [
        PluggableCommandLineOption(
            ('--summary2',),
            is_flag=True,
            default=False,
            help='Summarize license, copyright and other scans at the codebase level.',
            help_group=POST_SCAN_GROUP,
        )
    ]

    def is_enabled(self, summary2, **kwargs):
        """
        Return the value of the `summary2` command line option.
        """
        return summary2

    def process_codebase(self, codebase, summary2, **kwargs):
        """
        Summarize `codebase` with `keep_details` turned off.
        """
        if TRACE_LIGHT:
            logger_debug('ScanSummary:process_codebase')
        summarize_codebase(codebase, keep_details=False, **kwargs)
69+
70+
71+
72+
def summarize_codebase(codebase, keep_details, **kwargs):
    """
    Summarize the available scans of `codebase`, first for each resource and
    then at the codebase level.

    Resources are walked bottom-up so that each resource is summarized after
    its children. The summary of the root resource is then merged into the
    codebase-level `summary` attribute.

    If `keep_details` is True, the root summary is read from the `summary`
    attribute of the root resource; otherwise it is read from the `summary`
    entry of the root resource `extra_data`.
    """
    from summarycode.copyright_summary import holder_summarizer

    # pairs of (resource attribute name, summarizer function)
    candidates = (
        ('license_expressions', license_summarizer),
        ('holders', holder_summarizer),
    )

    # Only keep the summarizers whose attribute exists on the root resource:
    # this tells us which scans are available for summarization.
    root = codebase.root
    summarizers = []
    for attribute_name, summarizer_func in candidates:
        if hasattr(root, attribute_name):
            summarizers.append(summarizer_func)

    if TRACE:
        logger_debug('summarize_codebase with summarizers:', summarizers)

    # Compute and store the summaries of each resource, children first
    for resource in codebase.walk(topdown=False):
        children = resource.children(codebase)

        for summarizer in summarizers:
            _summary_data = summarizer(resource, children, keep_details=keep_details)
            if TRACE:
                logger_debug(
                    'summary for:', resource.path,
                    'after summarizer:', summarizer,
                    'is:', _summary_data,
                )

        codebase.save_resource(resource)

    # Promote the root resource summary to the codebase level
    summary = root.summary if keep_details else root.extra_data.get('summary', {})
    codebase.attributes.summary.update(summary)

    if TRACE:
        logger_debug('codebase summary:', summary)
110+
111+
112+
def license_summarizer(resource, children, keep_details=False):
    """
    Compute, store on `resource` and return the license expressions summary
    of `resource`, combining its own `license_expressions` with the
    summaries already stored on its direct `children`.

    The summary is the result of `sorted_counter()` applied to the counts of
    each expression. It is stored with `set_resource_summary()` under the
    `license_expressions` key, passing `keep_details` as `as_attribute`.
    """
    key = 'license_expressions'

    # Start with the expressions of this resource. A file without any
    # detection is counted once under the None key.
    own_expressions = getattr(resource, key, [])
    if own_expressions or not resource.is_file:
        collected = list(own_expressions)
    else:
        collected = [None]

    # Add the expressions of each direct child, repeating each value as many
    # times as it was counted in the child summary.
    for child in children:
        entries = get_resource_summary(child, key=key, as_attribute=keep_details) or []
        for entry in entries:
            # TODO: review this: this feels rather weird
            # Entries with an empty value (such as the None "no detection"
            # entry) are not carried up from children.
            value = entry.get('value')
            if not value:
                continue
            collected.extend([value] * entry['count'])

    # Count, sort and store the summary on the resource
    summarized = sorted_counter(summarize_licenses(collected))
    set_resource_summary(resource, key=key, value=summarized, as_attribute=keep_details)
    return summarized
144+
145+
146+
def summarize_licenses(license_expressions):
    """
    Return a Counter mapping each license expression found in the
    `license_expressions` list to its count of occurrences.
    """
    # TODO: we could normalize and/or sort each license_expression before
    # summarization and consider other equivalence or containment checks
    counts = Counter()
    counts.update(license_expressions)
    return counts

0 commit comments

Comments
 (0)