Skip to content

Commit dd2c3f7

Browse files
Merge pull request #3353 from nexB/todo-review
Ambiguous Detections ToDo items
2 parents 5d78022 + 259eb01 commit dd2c3f7

34 files changed

+2064
-117
lines changed

CHANGELOG.rst

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,17 @@ we changed some of the command line options.
2727
The output format version is now 3.0.0.
2828

2929

30+
Other updates
31+
~~~~~~~~~~~~~
32+
33+
- We have a new ``--todo`` CLI option to have todo items for reviewers
34+
with ambiguous license detections and package detections.
35+
This adds a new codebase-level attribute ``todo`` which is a list
36+
of ambiguous detections. Here the ambiguous detection attributes are:
37+
- detection_id: the unique license detection identifier or the top-level
38+
package identifier.
39+
- detection: this is either a license_detection or a package_data mapping.
40+
- review_comments: to explain the type of ambiguous detections.
3041

3142
Package detection:
3243
~~~~~~~~~~~~~~~~~~

setup.cfg

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,7 @@ scancode_post_scan =
196196
filter-clues = cluecode.plugin_filter_clues:RedundantCluesFilter
197197
consolidate = summarycode.plugin_consolidate:Consolidator
198198
license-references = licensedcode.licenses_reference:LicenseReference
199+
todo = summarycode.todo:AmbiguousDetectionsToDoPlugin
199200

200201

201202
# scancode_output_filter is the entry point for filter plugins executed after

src/licensedcode/detection.py

Lines changed: 189 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,9 @@ def logger_debug(*args):
7676
# Values of match_coverage less than this are reported as `license_clues` matches
7777
CLUES_MATCH_COVERAGE_THR = 60
7878

79+
# Values of rule relevance less than this are treated as low relevance matches
80+
LOW_RELEVANCE_THRESHOLD = 70
81+
7982
# False positives to spurious and gibberish texts are found usually later in the file
8083
# and matched to relatively short rules
8184
# Threshold Value of start line after which a match to likely be a false positive
@@ -104,6 +107,8 @@ class DetectionCategory(Enum):
104107
IMPERFECT_COVERAGE = 'imperfect-match-coverage'
105108
FALSE_POSITVE = 'possible-false-positive'
106109
UNDETECTED_LICENSE = 'undetected-license'
110+
MATCH_FRAGMENTS = 'match-fragments'
111+
LOW_RELEVANCE = 'low-relevance'
107112

108113

109114
class DetectionRule(Enum):
@@ -141,6 +146,9 @@ class FileRegion:
141146
start_line = attr.ib(type=int)
142147
end_line = attr.ib(type=int)
143148

149+
def to_dict(self):
    """
    Return a plain dict built from the attrs fields of this object.
    """
    as_mapping = attr.asdict(self, dict_factory=dict)
    return as_mapping
151+
144152

145153
@attr.s(slots=True, eq=False, order=False)
146154
class LicenseDetection:
@@ -275,7 +283,10 @@ def _identifier(self):
275283
"""
276284
data = []
277285
for match in self.matches:
278-
tokenized_matched_text = tuple(query_tokenizer(match.matched_text()))
286+
if isinstance(match.matched_text, str):
287+
tokenized_matched_text = tuple(query_tokenizer(match.matched_text))
288+
else:
289+
tokenized_matched_text = tuple(query_tokenizer(match.matched_text()))
279290
identifier = (
280291
match.rule.identifier,
281292
match.score(),
@@ -613,6 +624,106 @@ def from_dicts(cls, license_match_mappings):
613624
"""
614625
return [LicenseMatchFromResult.from_dict(lmm) for lmm in license_match_mappings]
615626

627+
def to_dict(
    self,
    include_text=False,
    license_text_diagnostics=False,
    whole_lines=True,
):
    """
    Return a mapping of "result" scan data for this match.

    The match-level fields come first, followed by the fields of the rule
    that was matched. A ``matched_text`` entry is appended last and only
    when ``include_text`` is True.

    ``license_text_diagnostics`` and ``whole_lines`` are accepted but are
    not used by this method.
    """
    # matched_text is read here as a plain attribute, it is not called
    text = self.matched_text if include_text else None

    # Detection Level Information
    result = {
        'score': self.score(),
        'start_line': self.start_line,
        'end_line': self.end_line,
        'matched_length': self.len(),
        'match_coverage': self.coverage(),
        'matcher': self.matcher,
    }

    # LicenseDB Level Information (Rule that was matched)
    matched_rule = self.rule
    result.update({
        'license_expression': matched_rule.license_expression,
        'rule_identifier': matched_rule.identifier,
        'rule_relevance': matched_rule.relevance,
        'rule_url': matched_rule.rule_url,
    })

    if include_text:
        result['matched_text'] = text

    return result
659+
660+
661+
def collect_license_detections(codebase, include_license_clues=True):
    """
    Return a list of license detection objects collected from the resources
    of a ``codebase``.

    For each resource walked, when the codebase root has a
    ``license_detections`` attribute:

    - detections are rebuilt from the resource ``license_detections`` mappings,
    - if ``include_license_clues`` is True, the resource ``license_clues``
      matches are grouped and each group is turned into a detection,
    - the resulting detections are passed through ``process_detections``.

    When the codebase root has a ``package_data`` attribute, detections are
    also rebuilt from the ``license_detections`` and
    ``other_license_detections`` of each package mapping of the resource.
    """
    has_packages = hasattr(codebase.root, 'package_data')
    has_licenses = hasattr(codebase.root, 'license_detections')

    all_license_detections = []

    for resource in codebase.walk():

        if has_licenses:
            resource_license_detections = []
            license_detections = getattr(resource, 'license_detections', []) or []
            license_clues = getattr(resource, 'license_clues', []) or []

            if license_detections:
                resource_license_detections.extend(
                    detections_from_license_detection_mappings(
                        license_detection_mappings=license_detections,
                        file_path=resource.path,
                    )
                )

            if include_license_clues and license_clues:
                license_matches = LicenseMatchFromResult.from_dicts(
                    license_match_mappings=license_clues,
                )

                for group_of_matches in group_matches(license_matches=license_matches):
                    detection = LicenseDetection.from_matches(matches=group_of_matches)
                    detection.file_region = detection.get_file_region(path=resource.path)
                    resource_license_detections.append(detection)

            # extend() consumes any iterable: no intermediate list is needed
            all_license_detections.extend(
                process_detections(detections=resource_license_detections)
            )

            if TRACE:
                logger_debug(
                    'before process_detections licenses:',
                    f'resource_license_detections: {resource_license_detections}\n',
                    f'all_license_detections: {all_license_detections}',
                )

        if has_packages:
            package_data = getattr(resource, 'package_data', []) or []

            package_license_detection_mappings = []
            for package in package_data:
                # .get() so that a package mapping without one of these keys
                # contributes nothing rather than raising a KeyError
                package_license_detection_mappings.extend(
                    package.get('license_detections') or []
                )
                package_license_detection_mappings.extend(
                    package.get('other_license_detections') or []
                )

            # Convert once per resource, after all its packages were visited,
            # so that no mapping can be converted and added more than once
            if package_license_detection_mappings:
                all_license_detections.extend(
                    detections_from_license_detection_mappings(
                        license_detection_mappings=package_license_detection_mappings,
                        file_path=resource.path,
                    )
                )

    return all_license_detections
726+
616727

617728
@attr.s
618729
class UniqueDetection:
@@ -624,7 +735,7 @@ class UniqueDetection:
624735
detection_count = attr.ib(default=None)
625736
matches = attr.ib(default=attr.Factory(list))
626737
detection_log = attr.ib(default=attr.Factory(list))
627-
files = attr.ib(factory=list)
738+
file_regions = attr.ib(factory=list)
628739

629740
@classmethod
630741
def get_unique_detections(cls, license_detections):
@@ -640,17 +751,30 @@ def get_unique_detections(cls, license_detections):
640751
detection.file_region
641752
for detection in all_detections
642753
]
643-
644754
detection = next(iter(all_detections))
645-
detection_mapping = detection.to_dict()
755+
detection_log = []
756+
if hasattr(detection, "detection_log"):
757+
if detection.detection_log:
758+
detection_log.extend(detection.detection_log)
759+
760+
if not detection.license_expression:
761+
detection.license_expression = str(combine_expressions(
762+
expressions=[
763+
match.rule.license_expression
764+
for match in detection.matches
765+
]
766+
))
767+
detection.identifier = detection.identifier_with_expression
768+
769+
646770
unique_license_detections.append(
647771
cls(
648-
identifier=detection_mapping["identifier"],
649-
license_expression=detection_mapping["license_expression"],
650-
detection_log=detection_mapping.get("detection_log", []) or [],
651-
matches=detection_mapping["matches"],
772+
identifier=detection.identifier,
773+
license_expression=detection.license_expression,
774+
detection_log=detection_log or [],
775+
matches=detection.matches,
652776
detection_count=len(file_regions),
653-
files=file_regions,
777+
file_regions=file_regions,
654778
)
655779
)
656780

@@ -660,7 +784,7 @@ def to_dict(self, license_diagnostics):
660784

661785
def dict_fields(attr, value):
662786

663-
if attr.name == 'files':
787+
if attr.name == 'file_regions':
664788
return False
665789

666790
if attr.name == 'matches':
@@ -673,6 +797,15 @@ def dict_fields(attr, value):
673797

674798
return attr.asdict(self, filter=dict_fields)
675799

800+
def get_license_detection_object(self):
    """
    Return a new LicenseDetection built from the license expression,
    detection log, matches and identifier of this unique detection.
    The returned detection has no file region.
    """
    detection_fields = dict(
        license_expression=self.license_expression,
        detection_log=self.detection_log,
        matches=self.matches,
        identifier=self.identifier,
        file_region=None,
    )
    return LicenseDetection(**detection_fields)
808+
676809

677810
def get_detections_by_id(license_detections):
678811
"""
@@ -795,6 +928,17 @@ def has_extra_words(license_matches):
795928
)
796929

797930

931+
def has_low_rule_relevance(license_matches):
    """
    Return True if any of the matches in the ``license_matches`` list of
    LicenseMatch objects was matched to a rule with a relevance lower than
    LOW_RELEVANCE_THRESHOLD. Return False otherwise, including when
    ``license_matches`` is empty.
    """
    for match in license_matches:
        if match.rule.relevance < LOW_RELEVANCE_THRESHOLD:
            return True

    return False
940+
941+
798942
def is_false_positive(license_matches, package_license=False):
799943
"""
800944
Return True if all of the matches in ``license_matches`` List of LicenseMatch
@@ -1215,6 +1359,41 @@ def get_license_keys_from_detections(license_detections, licensing=Licensing()):
12151359
return list(license_keys)
12161360

12171361

1362+
def _get_ambiguous_detection_category(detection):
    """
    Return the DetectionCategory explaining why ``detection`` is ambiguous,
    or None if it does not fall in any ambiguous category.
    The checks are tried in order and the first one that applies wins.
    """
    expression = detection.license_expression
    if not expression:
        return DetectionCategory.MATCH_FRAGMENTS

    if is_undetected_license_matches(license_matches=detection.matches):
        return DetectionCategory.UNDETECTED_LICENSE

    if "unknown" in expression:
        # An expression mentioning "unknown" is only checked for unknown
        # matches: the checks below are not tried for it.
        if has_unknown_matches(license_matches=detection.matches):
            return DetectionCategory.UNKNOWN_MATCH
        return None

    if is_match_coverage_less_than_threshold(
        license_matches=detection.matches,
        threshold=IMPERFECT_MATCH_COVERAGE_THR,
    ):
        return DetectionCategory.IMPERFECT_COVERAGE

    if has_extra_words(license_matches=detection.matches):
        return DetectionCategory.EXTRA_WORDS

    if has_low_rule_relevance(license_matches=detection.matches):
        return DetectionCategory.LOW_RELEVANCE

    return None


def get_ambiguous_license_detections_by_type(unique_license_detections):
    """
    Return a mapping of {detection category value: unique license detection}
    for the ambiguous detections found in a list of
    ``unique_license_detections``. These need review and would be todo
    items for the reviewer.

    Only one detection is kept for each category: when several detections
    fall in the same category, the last one seen replaces the previous one.
    """
    detections_by_category = {}

    for unique_detection in unique_license_detections:
        category = _get_ambiguous_detection_category(unique_detection)
        if category is None:
            continue
        detections_by_category[category.value] = unique_detection

    return detections_by_category
1395+
1396+
12181397
def analyze_detection(license_matches, package_license=False):
12191398
"""
12201399
Analyse a list of LicenseMatch objects, and determine if the license detection

src/licensedcode/plugin_license.py

Lines changed: 1 addition & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -19,18 +19,14 @@
1919
from plugincode.scan import scan_impl
2020

2121
from licensedcode.cache import build_spdx_license_expression, get_cache
22+
from licensedcode.detection import collect_license_detections
2223
from licensedcode.detection import find_referenced_resource
2324
from licensedcode.detection import get_detected_license_expression
2425
from licensedcode.detection import get_matches_from_detection_mappings
2526
from licensedcode.detection import get_new_identifier_from_detections
2627
from licensedcode.detection import get_referenced_filenames
27-
from licensedcode.detection import group_matches
28-
from licensedcode.detection import process_detections
2928
from licensedcode.detection import DetectionCategory
30-
from licensedcode.detection import detections_from_license_detection_mappings
31-
from licensedcode.detection import LicenseDetection
3229
from licensedcode.detection import LicenseDetectionFromResult
33-
from licensedcode.detection import LicenseMatchFromResult
3430
from licensedcode.detection import UniqueDetection
3531
from packagedcode.utils import combine_expressions
3632
from scancode.api import SCANCODE_LICENSEDB_URL
@@ -227,72 +223,6 @@ def process_codebase(self, codebase, license_diagnostics, **kwargs):
227223
])
228224

229225

230-
def collect_license_detections(codebase, include_license_clues=True):
231-
"""
232-
Return a list of LicenseDetectionFromResult from a ``codebase``
233-
"""
234-
has_packages = hasattr(codebase.root, 'package_data')
235-
has_licenses = hasattr(codebase.root, 'license_detections')
236-
237-
all_license_detections = []
238-
239-
for resource in codebase.walk():
240-
241-
resource_license_detections = []
242-
if has_licenses:
243-
license_detections = getattr(resource, 'license_detections', []) or []
244-
license_clues = getattr(resource, 'license_clues', []) or []
245-
246-
if license_detections:
247-
license_detection_objects = detections_from_license_detection_mappings(
248-
license_detection_mappings=license_detections,
249-
file_path=resource.path,
250-
)
251-
resource_license_detections.extend(license_detection_objects)
252-
253-
if include_license_clues and license_clues:
254-
license_matches = LicenseMatchFromResult.from_dicts(
255-
license_match_mappings=license_clues,
256-
)
257-
258-
for group_of_matches in group_matches(license_matches=license_matches):
259-
detection = LicenseDetection.from_matches(matches=group_of_matches)
260-
detection.file_region = detection.get_file_region(path=resource.path)
261-
resource_license_detections.append(detection)
262-
263-
all_license_detections.extend(
264-
list(process_detections(detections=resource_license_detections))
265-
)
266-
267-
if TRACE:
268-
logger_debug(
269-
f'before process_detections licenses:',
270-
f'resource_license_detections: {resource_license_detections}\n',
271-
f'all_license_detections: {all_license_detections}',
272-
)
273-
274-
if has_packages:
275-
package_data = getattr(resource, 'package_data', []) or []
276-
277-
package_license_detection_mappings = []
278-
for package in package_data:
279-
280-
if package["license_detections"]:
281-
package_license_detection_mappings.extend(package["license_detections"])
282-
283-
if package["other_license_detections"]:
284-
package_license_detection_mappings.extend(package["other_license_detections"])
285-
286-
if package_license_detection_mappings:
287-
package_license_detection_objects = detections_from_license_detection_mappings(
288-
license_detection_mappings=package_license_detection_mappings,
289-
file_path=resource.path,
290-
)
291-
292-
all_license_detections.extend(package_license_detection_objects)
293-
294-
return all_license_detections
295-
296226

297227
def add_referenced_filenames_license_matches_for_detections(resource, codebase):
298228
"""

0 commit comments

Comments
 (0)