Skip to content

Commit 69d662c

Browse files
Ambiguous Detections ToDo items #3122
Signed-off-by: Ayan Sinha Mahapatra <ayansmahapatra@gmail.com>
1 parent 5213880 commit 69d662c

File tree

17 files changed

+1781
-81
lines changed

17 files changed

+1781
-81
lines changed

setup.cfg

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,7 @@ scancode_post_scan =
196196
filter-clues = cluecode.plugin_filter_clues:RedundantCluesFilter
197197
consolidate = summarycode.plugin_consolidate:Consolidator
198198
license-references = licensedcode.licenses_reference:LicenseReference
199+
todo = summarycode.todo:AmbiguousDetectionsToDoPlugin
199200

200201

201202
# scancode_output_filter is the entry point for filter plugins executed after

src/licensedcode/detection.py

Lines changed: 152 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@ class DetectionCategory(Enum):
104104
IMPERFECT_COVERAGE = 'imperfect-match-coverage'
105105
FALSE_POSITVE = 'possible-false-positive'
106106
UNDETECTED_LICENSE = 'undetected-license'
107+
MATCH_FRAGMENTS = 'match_fragments'
107108

108109

109110
class DetectionRule(Enum):
@@ -141,6 +142,9 @@ class FileRegion:
141142
start_line = attr.ib(type=int)
142143
end_line = attr.ib(type=int)
143144

145+
def to_dict(self):
    """
    Return a plain ``dict`` of this object's attrs fields, as serialized
    by ``attr.asdict``.
    """
    region_mapping = attr.asdict(self, dict_factory=dict)
    return region_mapping
147+
144148

145149
@attr.s(slots=True, eq=False, order=False)
146150
class LicenseDetection:
@@ -613,6 +617,106 @@ def from_dicts(cls, license_match_mappings):
613617
"""
614618
return [LicenseMatchFromResult.from_dict(lmm) for lmm in license_match_mappings]
615619

620+
def to_dict(
    self,
    include_text=False,
    license_text_diagnostics=False,
    whole_lines=True,
):
    """
    Return a "result" scan data mapping built from this license match.

    The mapping holds match-level values (score, lines, matched length,
    coverage and matcher) followed by values of the matched rule (license
    expression, identifier, relevance and URL). A ``matched_text`` entry is
    appended only when ``include_text`` is True.

    ``license_text_diagnostics`` and ``whole_lines`` are accepted but are
    not read by this method.
    """
    # Read the text first, and only when asked for.
    text = self.matched_text if include_text else None

    # Detection Level Information
    mapping = dict(
        score=self.score(),
        start_line=self.start_line,
        end_line=self.end_line,
        matched_length=self.len(),
        match_coverage=self.coverage(),
        matcher=self.matcher,
    )

    # LicenseDB Level Information (Rule that was matched)
    matched_rule = self.rule
    mapping.update(
        license_expression=matched_rule.license_expression,
        rule_identifier=matched_rule.identifier,
        rule_relevance=matched_rule.relevance,
        rule_url=matched_rule.rule_url,
    )

    if include_text:
        mapping['matched_text'] = text

    return mapping
652+
653+
654+
def collect_license_detections(codebase, include_license_clues=True):
    """
    Return a list of license detection objects collected from all the
    resources of a ``codebase``.

    For each resource, detections are collected from:

    - its ``license_detections`` mappings,
    - its ``license_clues`` matches, grouped into detections, when
      ``include_license_clues`` is True,
    - the ``license_detections`` and ``other_license_detections`` of each
      of its ``package_data`` mappings.

    Resource-level license data is only considered when the codebase root
    has a ``license_detections`` attribute, and package-level data only
    when the root has a ``package_data`` attribute.
    """
    has_packages = hasattr(codebase.root, 'package_data')
    has_licenses = hasattr(codebase.root, 'license_detections')

    all_license_detections = []

    for resource in codebase.walk():

        resource_license_detections = []
        if has_licenses:
            license_detections = getattr(resource, 'license_detections', []) or []
            license_clues = getattr(resource, 'license_clues', []) or []

            if license_detections:
                license_detection_objects = detections_from_license_detection_mappings(
                    license_detection_mappings=license_detections,
                    file_path=resource.path,
                )
                resource_license_detections.extend(license_detection_objects)

            if include_license_clues and license_clues:
                license_matches = LicenseMatchFromResult.from_dicts(
                    license_match_mappings=license_clues,
                )

                # Clues are bare matches: regroup them into detections and
                # attach the file region of this resource to each.
                for group_of_matches in group_matches(license_matches=license_matches):
                    detection = LicenseDetection.from_matches(matches=group_of_matches)
                    detection.file_region = detection.get_file_region(path=resource.path)
                    resource_license_detections.append(detection)

            all_license_detections.extend(
                process_detections(detections=resource_license_detections)
            )

            if TRACE:
                # NOTE: this is logged after process_detections has run,
                # despite what the label says.
                logger_debug(
                    'before process_detections licenses:',
                    f'resource_license_detections: {resource_license_detections}\n',
                    f'all_license_detections: {all_license_detections}',
                )

        if has_packages:
            package_data = getattr(resource, 'package_data', []) or []

            package_license_detection_mappings = []
            for package in package_data:
                # Use .get() so that a package mapping lacking one of these
                # keys is skipped for that key instead of raising KeyError.
                for detections_key in ('license_detections', 'other_license_detections'):
                    package_license_detection_mappings.extend(
                        package.get(detections_key) or []
                    )

            if package_license_detection_mappings:
                package_license_detection_objects = detections_from_license_detection_mappings(
                    license_detection_mappings=package_license_detection_mappings,
                    file_path=resource.path,
                )

                all_license_detections.extend(package_license_detection_objects)

    return all_license_detections
719+
616720

617721
@attr.s
618722
class UniqueDetection:
@@ -624,7 +728,7 @@ class UniqueDetection:
624728
detection_count = attr.ib(default=None)
625729
matches = attr.ib(default=attr.Factory(list))
626730
detection_log = attr.ib(default=attr.Factory(list))
627-
files = attr.ib(factory=list)
731+
file_regions = attr.ib(factory=list)
628732

629733
@classmethod
630734
def get_unique_detections(cls, license_detections):
@@ -640,17 +744,18 @@ def get_unique_detections(cls, license_detections):
640744
detection.file_region
641745
for detection in all_detections
642746
]
643-
644747
detection = next(iter(all_detections))
645-
detection_mapping = detection.to_dict()
748+
if not hasattr(detection, "detection_log"):
749+
detection.detection_log = []
750+
646751
unique_license_detections.append(
647752
cls(
648-
identifier=detection_mapping["identifier"],
649-
license_expression=detection_mapping["license_expression"],
650-
detection_log=detection_mapping.get("detection_log", []) or [],
651-
matches=detection_mapping["matches"],
753+
identifier=detection.identifier,
754+
license_expression=detection.license_expression,
755+
detection_log=detection.detection_log,
756+
matches=detection.matches,
652757
detection_count=len(file_regions),
653-
files=file_regions,
758+
file_regions=file_regions,
654759
)
655760
)
656761

@@ -660,7 +765,7 @@ def to_dict(self, license_diagnostics):
660765

661766
def dict_fields(attr, value):
662767

663-
if attr.name == 'files':
768+
if attr.name == 'file_regions':
664769
return False
665770

666771
if attr.name == 'matches':
@@ -673,6 +778,15 @@ def dict_fields(attr, value):
673778

674779
return attr.asdict(self, filter=dict_fields)
675780

781+
def get_license_detection_object(self):
    """
    Return a new LicenseDetection built from the license expression,
    detection log, matches and identifier of this unique detection. The
    ``file_region`` of the returned detection is set to None.
    """
    detection_fields = dict(
        license_expression=self.license_expression,
        detection_log=self.detection_log,
        matches=self.matches,
        identifier=self.identifier,
        file_region=None,
    )
    return LicenseDetection(**detection_fields)
789+
676790

677791
def get_detections_by_id(license_detections):
678792
"""
@@ -1215,6 +1329,35 @@ def get_license_keys_from_detections(license_detections, licensing=Licensing()):
12151329
return list(license_keys)
12161330

12171331

1332+
def get_ambiguous_license_detections_by_type(unique_license_detections):
    """
    Return a mapping of ambiguity category value to a unique license
    detection needing review, built from a list of
    ``unique_license_detections``. These are todo items for the reviewer.

    Each detection is assigned to the first category that applies, checked
    in this order: undetected license, unknown match, imperfect match
    coverage, extra words. A detection matching no category is left out.
    Only one detection is kept per category: a later detection replaces an
    earlier one of the same category.
    """
    detections_by_category = {}

    for unique_detection in unique_license_detections:
        matches = unique_detection.matches
        category = None

        if is_undetected_license_matches(license_matches=matches):
            category = DetectionCategory.UNDETECTED_LICENSE

        elif "unknown" in unique_detection.license_expression:
            # An "unknown" expression without unknown matches is not
            # checked against the remaining categories.
            if has_unknown_matches(license_matches=matches):
                category = DetectionCategory.UNKNOWN_MATCH

        elif is_match_coverage_less_than_threshold(
            license_matches=matches,
            threshold=IMPERFECT_MATCH_COVERAGE_THR,
        ):
            category = DetectionCategory.IMPERFECT_COVERAGE

        elif has_extra_words(license_matches=matches):
            category = DetectionCategory.EXTRA_WORDS

        if category is not None:
            detections_by_category[category.value] = unique_detection

    return detections_by_category
1359+
1360+
12181361
def analyze_detection(license_matches, package_license=False):
12191362
"""
12201363
Analyse a list of LicenseMatch objects, and determine if the license detection

src/licensedcode/plugin_license.py

Lines changed: 1 addition & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -19,18 +19,14 @@
1919
from plugincode.scan import scan_impl
2020

2121
from licensedcode.cache import build_spdx_license_expression, get_cache
22+
from licensedcode.detection import collect_license_detections
2223
from licensedcode.detection import find_referenced_resource
2324
from licensedcode.detection import get_detected_license_expression
2425
from licensedcode.detection import get_matches_from_detection_mappings
2526
from licensedcode.detection import get_new_identifier_from_detections
2627
from licensedcode.detection import get_referenced_filenames
27-
from licensedcode.detection import group_matches
28-
from licensedcode.detection import process_detections
2928
from licensedcode.detection import DetectionCategory
30-
from licensedcode.detection import detections_from_license_detection_mappings
31-
from licensedcode.detection import LicenseDetection
3229
from licensedcode.detection import LicenseDetectionFromResult
33-
from licensedcode.detection import LicenseMatchFromResult
3430
from licensedcode.detection import UniqueDetection
3531
from packagedcode.utils import combine_expressions
3632
from scancode.api import SCANCODE_LICENSEDB_URL
@@ -227,72 +223,6 @@ def process_codebase(self, codebase, license_diagnostics, **kwargs):
227223
])
228224

229225

230-
def collect_license_detections(codebase, include_license_clues=True):
231-
"""
232-
Return a list of LicenseDetectionFromResult from a ``codebase``
233-
"""
234-
has_packages = hasattr(codebase.root, 'package_data')
235-
has_licenses = hasattr(codebase.root, 'license_detections')
236-
237-
all_license_detections = []
238-
239-
for resource in codebase.walk():
240-
241-
resource_license_detections = []
242-
if has_licenses:
243-
license_detections = getattr(resource, 'license_detections', []) or []
244-
license_clues = getattr(resource, 'license_clues', []) or []
245-
246-
if license_detections:
247-
license_detection_objects = detections_from_license_detection_mappings(
248-
license_detection_mappings=license_detections,
249-
file_path=resource.path,
250-
)
251-
resource_license_detections.extend(license_detection_objects)
252-
253-
if include_license_clues and license_clues:
254-
license_matches = LicenseMatchFromResult.from_dicts(
255-
license_match_mappings=license_clues,
256-
)
257-
258-
for group_of_matches in group_matches(license_matches=license_matches):
259-
detection = LicenseDetection.from_matches(matches=group_of_matches)
260-
detection.file_region = detection.get_file_region(path=resource.path)
261-
resource_license_detections.append(detection)
262-
263-
all_license_detections.extend(
264-
list(process_detections(detections=resource_license_detections))
265-
)
266-
267-
if TRACE:
268-
logger_debug(
269-
f'before process_detections licenses:',
270-
f'resource_license_detections: {resource_license_detections}\n',
271-
f'all_license_detections: {all_license_detections}',
272-
)
273-
274-
if has_packages:
275-
package_data = getattr(resource, 'package_data', []) or []
276-
277-
package_license_detection_mappings = []
278-
for package in package_data:
279-
280-
if package["license_detections"]:
281-
package_license_detection_mappings.extend(package["license_detections"])
282-
283-
if package["other_license_detections"]:
284-
package_license_detection_mappings.extend(package["other_license_detections"])
285-
286-
if package_license_detection_mappings:
287-
package_license_detection_objects = detections_from_license_detection_mappings(
288-
license_detection_mappings=package_license_detection_mappings,
289-
file_path=resource.path,
290-
)
291-
292-
all_license_detections.extend(package_license_detection_objects)
293-
294-
return all_license_detections
295-
296226

297227
def add_referenced_filenames_license_matches_for_detections(resource, codebase):
298228
"""

src/summarycode/plugin_consolidate.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -170,7 +170,7 @@ def process_codebase(self, codebase, **kwargs):
170170
# TODO: Have a "catch-all" Component for the things that we haven't grouped
171171
consolidations = []
172172
root = codebase.root
173-
if hasattr(root, 'packages') and hasattr(root, 'copyrights') and hasattr(root, 'license_detections'):
173+
if hasattr(root, 'package_data') and hasattr(root, 'copyrights') and hasattr(root, 'license_detections'):
174174
consolidations.extend(get_consolidated_packages(codebase))
175175
if hasattr(root, 'copyrights') and hasattr(root, 'license_detections'):
176176
consolidations.extend(get_holders_consolidated_components(codebase))

0 commit comments

Comments
 (0)