aboutcode-org · pombredanne · Mar 14, 2022 · Mar 3, 2022 · Mar 4, 2022 · Mar 4, 2022
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -231,6 +231,66 @@ License Clarity Scoring Update
      - Scoring Weight = -20
 
 
+License Clarity Scoring Update
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ - We are moving away from the license clarity scoring defined by ClearlyDefined
+   in the license clarity score plugin. The previous license clarity scoring
+   logic produced a misleading score: it returned a low score when scanning
+   packages because of its stringent scoring criteria. We are now using more
+   general criteria to get a sense of what provenance information has been
+   provided and whether there is a licensing conflict between the licenses
+   declared in the top-level key files and the licenses detected in the files
+   under the top-level.
+
+ - The license clarity score is a value from 0 to 100 calculated by combining
+   the weighted values determined for each of the scoring elements:
+
+   - Declared license:
+
+     - When true, indicates that the software package licensing is documented at
+       top-level or well-known locations in the software project, typically in a
+       package manifest, NOTICE, LICENSE, COPYING or README file.
+     - Scoring Weight = 40
+
+   - Identification precision:
+
+     - Indicates how well the license statement(s) of the software identify known
+       licenses that can be designated by precise keys (identifiers) as provided in
+       a publicly available license list, such as the ScanCode LicenseDB, the SPDX
+       license list, the OSI license list, or a URL pointing to a specific license
+       text in a project or organization website.
+     - Scoring Weight = 40
+
+   - License texts:
+
+     - License texts are provided to support the declared license expression in
+       files such as a package manifest, NOTICE, LICENSE, COPYING or README.
+     - Scoring Weight = 10
+
+   - Declared copyright:
+
+     - When true, indicates that the software package copyright is documented at
+       top-level or well-known locations in the software project, typically in a
+       package manifest, NOTICE, LICENSE, COPYING or README file.
+     - Scoring Weight = 10
+
+   - Ambiguous compound licensing:
+
+     - When true, indicates that the software has a license declaration that
+       makes it difficult to construct a reliable license expression, such as in
+       the case of multiple licenses where the conjunctive versus disjunctive
+       relationship is not well defined.
+     - Scoring Weight = -10
+
+   - Conflicting license categories:
+
+     - When true, indicates the declared license expression of the software is in
+       the permissive category, but that other potentially conflicting categories,
+       such as copyleft and proprietary, have been detected in lower level code.
+     - Scoring Weight = -20
+
+
 Outputs:
 ~~~~~~~~
 

diff --git a/setup.cfg b/setup.cfg
@@ -166,6 +166,7 @@ scancode_scan =
 # module for details and doc.
 scancode_post_scan =
     summary = summarycode.summarizer:ScanSummary
+    summary2 = summarycode.summarizer2:ScanSummary
     summary-keeping-details = summarycode.summarizer:ScanSummaryWithDetails
     summary-key-files = summarycode.summarizer:ScanKeyFilesSummary
     summary-by-facet = summarycode.summarizer:ScanByFacetSummary

diff --git a/src/summarycode/classify.py b/src/summarycode/classify.py
@@ -110,12 +110,9 @@ class FileClassifier(PreScanPlugin):
     ]
 
     def is_enabled(self, classify, **kwargs):
-        return classify
+        return True
 
     def process_codebase(self, codebase, classify, **kwargs):
-        if not classify:
-            return
-
         # find the real root directory
         real_root = codebase.lowest_common_parent()
         if not real_root:

diff --git a/src/summarycode/score.py b/src/summarycode/score.py
@@ -48,11 +48,9 @@ class LicenseClarityScore(PostScanPlugin):
     """
     Compute a License clarity score at the codebase level.
     """
-    codebase_attributes = dict(license_clarity_score=Mapping(
-        help='Computed license clarity score as mapping containing the score '
-             'proper and each scoring elements.'))
+    codebase_attributes = dict(summary=attr.ib(default=attr.Factory(dict)))
 
-    sort_order = 110
+    sort_order = 5
 
     options = [
         PluggableCommandLineOption(('--license-clarity-score',),
@@ -66,14 +64,13 @@ class LicenseClarityScore(PostScanPlugin):
         )
     ]
 
-    def is_enabled(self, license_clarity_score, **kwargs):
-        return license_clarity_score
+    def is_enabled(self, license_clarity_score, summary2, **kwargs):
+        return license_clarity_score or summary2
 
     def process_codebase(self, codebase, license_clarity_score, **kwargs):
         if TRACE:
             logger_debug('LicenseClarityScore:process_codebase')
-        scoring_elements = compute_license_score(codebase)
-        codebase.attributes.license_clarity_score.update(scoring_elements)
+        compute_license_score(codebase)
 
 
 def compute_license_score(codebase):
@@ -126,6 +123,7 @@ def compute_license_score(codebase):
     scoring_elements = ScoringElements()
     declared_licenses = get_field_values_from_codebase_resources(codebase, 'licenses', key_files_only=True)
     declared_license_expressions = get_field_values_from_codebase_resources(codebase, 'license_expressions', key_files_only=True)
+    unique_declared_license_expressions = list(set(declared_license_expressions))
     declared_license_categories = get_license_categories(declared_licenses)
     copyrights = get_field_values_from_codebase_resources(codebase, 'copyrights', key_files_only=True)
     other_licenses = get_field_values_from_codebase_resources(codebase, 'licenses', key_files_only=False)
@@ -155,14 +153,17 @@ def compute_license_score(codebase):
         ):
             scoring_elements.score -= 20
 
-    scoring_elements.ambigous_compound_licensing = check_for_license_ambiguity(declared_license_expressions)
+    primary_license = get_primary_license(declared_license_expressions)
     if (
-        scoring_elements.ambigous_compound_licensing
+        not primary_license
         and scoring_elements.score > 0
     ):
+        scoring_elements.ambigous_compound_licensing = True
         scoring_elements.score -= 10
 
-    return scoring_elements.to_dict()
+    codebase.attributes.summary['primary_license_expression'] = primary_license
+    codebase.attributes.summary['declared_license_expressions'] = unique_declared_license_expressions
+    codebase.attributes.summary['license_clarity_score'] = scoring_elements.to_dict()
 
 
 @attr.s()
@@ -378,11 +379,10 @@ def group_license_expressions(unique_license_expressions):
     unique_joined_expressions = []
     seen_joined_expression = []
     len_joined_expressions = len(joined_expressions)
-    for i, j in enumerate(joined_expressions):
-        starting_index = i + 1
-        if starting_index > len_joined_expressions:
+    for i, j in enumerate(joined_expressions, start=1):
+        if i > len_joined_expressions:
             break
-        for j1 in joined_expressions[starting_index:]:
+        for j1 in joined_expressions[i:]:
             if licensing.is_equivalent(j, j1):
                 if (
                     j not in unique_joined_expressions
@@ -394,21 +394,21 @@ def group_license_expressions(unique_license_expressions):
     return unique_joined_expressions, single_expressions
 
 
-def check_for_license_ambiguity(declared_license_expressions):
+def get_primary_license(declared_license_expressions):
     """
-    License ambiguity is the situation where there is a license declaration that makes
-    it difficult to construct a reliable license expression, such as in the case
-    of multiple licenses where the conjunctive versus disjunctive relationship
-    is not well defined.
+    Return a primary license expression string from
+    `declared_license_expressions` or an empty string if a primary license
+    expression cannot be determined.
 
-    We determine if a list of `declared_license_expressions` has license ambiguity if
-    we cannot resolve the `declared_license_expressions` into one expression.
+    We determine if a list of `declared_license_expressions` has a primary
+    license if we can resolve the `declared_license_expressions` into one
+    expression.
     """
-    unique_declared_license_expressions = set(declared_license_expressions)
+    unique_declared_license_expressions = list(set(declared_license_expressions))
     # If we only have a single unique license expression, then we do not have
     # any ambiguity about the licensing
     if len(unique_declared_license_expressions) == 1:
-        return False
+        return unique_declared_license_expressions[0]
 
     unique_joined_expressions, single_expressions = group_license_expressions(
         unique_declared_license_expressions
@@ -417,10 +417,10 @@ def check_for_license_ambiguity(declared_license_expressions):
     if not unique_joined_expressions:
         # If we do not have any joined expressions, but multiple single
         # expressions remaining, then we have license ambiguity
-        if len(single_expressions) > 1:
-            return True
+        if len(single_expressions) == 1:
+            return single_expressions[0]
         else:
-            return False
+            return ''
 
     # Group single expressions to joined expressions to see if single
     # expressions are accounted for in a joined expression
@@ -442,6 +442,6 @@ def check_for_license_ambiguity(declared_license_expressions):
     # that have not been associated with a joined license expression, then we do
     # not have any ambiguity about the license
     if len(single_expressions_by_joined_expressions) == 1 and not not_in_joined_expressions:
-        return False
+        return next(iter(single_expressions_by_joined_expressions))
     else:
-        return True
+        return ''
diff --git a/src/summarycode/summarizer2.py b/src/summarycode/summarizer2.py
@@ -0,0 +1,153 @@
+#
+# Copyright (c) nexB Inc. and others. All rights reserved.
+# ScanCode is a trademark of nexB Inc.
+# SPDX-License-Identifier: Apache-2.0
+# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
+# See https://github.com/nexB/scancode-toolkit for support or download.
+# See https://aboutcode.org for more information about nexB OSS projects.
+#
+
+from collections import Counter
+
+import attr
+
+from plugincode.post_scan import PostScanPlugin
+from plugincode.post_scan import post_scan_impl
+from commoncode.cliutils import PluggableCommandLineOption
+from commoncode.cliutils import POST_SCAN_GROUP
+from summarycode.utils import sorted_counter
+from summarycode.utils import get_resource_summary
+from summarycode.utils import set_resource_summary
+
+# Tracing flags
+TRACE = False
+TRACE_LIGHT = False
+
+
+def logger_debug(*args):
+    pass  # no-op default; redefined below when TRACE or TRACE_LIGHT is set
+
+
+if TRACE or TRACE_LIGHT:  # logging is only configured when a tracing flag is set
+    import logging
+    import sys
+
+    logger = logging.getLogger(__name__)
+    logging.basicConfig(stream=sys.stdout)
+    logger.setLevel(logging.DEBUG)
+
+    def logger_debug(*args):  # non-empty str args are joined as-is, all others via repr()
+        return logger.debug(' '.join(isinstance(a, str) and a or repr(a) for a in args))
+
+"""
+Create summarized scan data.
+"""
+
+
+@post_scan_impl
+class ScanSummary(PostScanPlugin):
+    """
+    Summarize a scan into the codebase-level `summary` attribute (--summary2).
+    """
+    sort_order = 10
+
+    codebase_attributes = dict(summary=attr.ib(default=attr.Factory(dict)))
+
+    options = [
+        PluggableCommandLineOption(('--summary2',),
+            is_flag=True, default=False,
+            help='Summarize license, copyright and other scans at the codebase level.',
+            help_group=POST_SCAN_GROUP)
+    ]
+
+    def is_enabled(self, summary2, **kwargs):  # enabled when `summary2` is truthy
+        return summary2
+
+    def process_codebase(self, codebase, summary2, **kwargs):  # summary2 is not forwarded
+        if TRACE_LIGHT: logger_debug('ScanSummary:process_codebase')
+        summarize_codebase(codebase, keep_details=False, **kwargs)
+
+
+
+def summarize_codebase(codebase, keep_details, **kwargs):
+    """
+    Summarize a scan at the codebase level for the available scans and update
+    `codebase.attributes.summary` from the root resource summary.
+    If `keep_details` is True, also keep file and directory details in the
+    `summary` attribute of every file and directory. `kwargs` is unused.
+    """
+    from summarycode.copyright_summary import holder_summarizer
+
+    attrib_summarizers = [  # (resource attribute name, summarizer callable) pairs
+        ('license_expressions', license_summarizer),
+        ('holders', holder_summarizer),
+    ]
+
+    # find which attributes are available for summarization by checking the root
+    # resource
+    root = codebase.root
+    summarizers = [s for a, s in attrib_summarizers if hasattr(root, a)]
+    if TRACE: logger_debug('summarize_codebase with summarizers:', summarizers)
+
+    # collect and set resource-level summaries
+    for resource in codebase.walk(topdown=False):
+        children = resource.children(codebase)
+
+        for summarizer in summarizers:  # the return value is only used for tracing
+            _summary_data = summarizer(resource, children, keep_details=keep_details)
+            if TRACE: logger_debug('summary for:', resource.path, 'after summarizer:', summarizer, 'is:', _summary_data)
+
+        codebase.save_resource(resource)
+
+    # set the summary from the root resource at the codebase level
+    if keep_details:
+        summary = root.summary
+    else:
+        summary = root.extra_data.get('summary', {})
+    codebase.attributes.summary.update(summary)
+
+    if TRACE: logger_debug('codebase summary:', summary)
+
+
+def license_summarizer(resource, children, keep_details=False):
+    """
+    Set and return the `resource` license_expressions summary: a list of
+    mappings such as {value: "expression", count: "count of occurrences"}
+    sorted by decreasing count, combining its own and its `children` values.
+    """
+    LIC_EXP = 'license_expressions'
+    license_expressions = []
+
+    # Collect current data
+    lic_expressions = getattr(resource, LIC_EXP  , [])
+    if not lic_expressions and resource.is_file:
+        # also count files with no detection
+        license_expressions.append(None)
+    else:
+        license_expressions.extend(lic_expressions)
+
+    # Collect direct children expression summary
+    for child in children:
+        child_summaries = get_resource_summary(child, key=LIC_EXP, as_attribute=keep_details) or []
+        for child_summary in child_summaries:
+            # TODO: review this: this feels rather weird
+            child_sum_val = child_summary.get('value')
+            if child_sum_val:  # falsy child values (e.g. None) are not propagated upward
+                values = [child_sum_val] * child_summary['count']
+                license_expressions.extend(values)
+
+    # summarize proper
+    licenses_counter = summarize_licenses(license_expressions)
+    summarized = sorted_counter(licenses_counter)
+    set_resource_summary(resource, key=LIC_EXP, value=summarized, as_attribute=keep_details)
+    return summarized
+
+
+def summarize_licenses(license_expressions):
+    """
+    Given a list of license expressions, return a Counter mapping of
+    {expression: count of occurrences}.
+    """
+    # TODO: we could normalize and/or sort each license_expression before
+    # summarization and consider other equivalence or containment checks
+    return Counter(license_expressions)