@@ -76,6 +76,9 @@ def logger_debug(*args):
7676# Values of match_coverage less than this are reported as `license_clues` matches
7777CLUES_MATCH_COVERAGE_THR = 60
7878
79+ # Matches to rules with a relevance less than this are flagged as `low-relevance`
80+ LOW_RELEVANCE_THRESHOLD = 70
81+
7982# False positives to spurious and gibberish texts are found usually later in the file
8083# and matched to relatively short rules
8184# Threshold Value of start line after which a match to likely be a false positive
@@ -104,6 +107,8 @@ class DetectionCategory(Enum):
104107 IMPERFECT_COVERAGE = 'imperfect-match-coverage'
105108 FALSE_POSITVE = 'possible-false-positive'
106109 UNDETECTED_LICENSE = 'undetected-license'
110+ MATCH_FRAGMENTS = 'match-fragments'
111+ LOW_RELEVANCE = 'low-relevance'
107112
108113
109114class DetectionRule (Enum ):
@@ -141,6 +146,9 @@ class FileRegion:
141146 start_line = attr .ib (type = int )
142147 end_line = attr .ib (type = int )
143148
def to_dict(self):
    """
    Return a plain dict mapping built from the attrs fields of this
    file region.
    """
    region_mapping = attr.asdict(self, dict_factory=dict)
    return region_mapping
151+
144152
145153@attr .s (slots = True , eq = False , order = False )
146154class LicenseDetection :
@@ -275,7 +283,10 @@ def _identifier(self):
275283 """
276284 data = []
277285 for match in self .matches :
278- tokenized_matched_text = tuple (query_tokenizer (match .matched_text ()))
286+ if isinstance (match .matched_text , str ):
287+ tokenized_matched_text = tuple (query_tokenizer (match .matched_text ))
288+ else :
289+ tokenized_matched_text = tuple (query_tokenizer (match .matched_text ()))
279290 identifier = (
280291 match .rule .identifier ,
281292 match .score (),
@@ -613,6 +624,106 @@ def from_dicts(cls, license_match_mappings):
613624 """
614625 return [LicenseMatchFromResult .from_dict (lmm ) for lmm in license_match_mappings ]
615626
def to_dict(
    self,
    include_text=False,
    license_text_diagnostics=False,
    whole_lines=True,
):
    """
    Return a "result" scan data mapping built from this license match.

    The mapping lists match-level data first (score, start and end lines,
    matched length, match coverage and matcher) and then data about the
    rule that was matched (license expression, identifier, relevance and
    URL). When ``include_text`` is True, the ``matched_text`` attribute of
    this match is added as the last entry.

    ``license_text_diagnostics`` and ``whole_lines`` are accepted but are
    not used by this method.
    NOTE(review): presumably kept for signature parity with
    LicenseMatch.to_dict; confirm against callers.
    """
    rule = self.rule

    # Key order is part of the emitted scan data: keep it stable.
    result = {
        # Detection Level Information
        'score': self.score(),
        'start_line': self.start_line,
        'end_line': self.end_line,
        'matched_length': self.len(),
        'match_coverage': self.coverage(),
        'matcher': self.matcher,
        # LicenseDB Level Information (Rule that was matched)
        'license_expression': rule.license_expression,
        'rule_identifier': rule.identifier,
        'rule_relevance': rule.relevance,
        'rule_url': rule.rule_url,
    }

    if include_text:
        # Here the matched text is a stored attribute, not a method call.
        result['matched_text'] = self.matched_text

    return result
659+
660+
def collect_license_detections(codebase, include_license_clues=True):
    """
    Return a list of license detection objects collected from all the
    resources of a ``codebase``.

    Two sources are considered for each resource returned by
    ``codebase.walk()``:

    - resource-level ``license_detections`` and, when
      ``include_license_clues`` is True, ``license_clues`` regrouped as
      detections. These are only read when the codebase root has a
      ``license_detections`` attribute.
    - the ``license_detections`` and ``other_license_detections`` of each
      package data mapping. These are only read when the codebase root has
      a ``package_data`` attribute.

    Package data mappings that lack one of these keys, or hold an empty
    value for it, are skipped rather than raising a KeyError.
    """
    has_packages = hasattr(codebase.root, 'package_data')
    has_licenses = hasattr(codebase.root, 'license_detections')

    all_license_detections = []

    for resource in codebase.walk():

        if has_licenses:
            resource_license_detections = []
            license_detections = getattr(resource, 'license_detections', []) or []
            license_clues = getattr(resource, 'license_clues', []) or []

            if license_detections:
                resource_license_detections.extend(
                    detections_from_license_detection_mappings(
                        license_detection_mappings=license_detections,
                        file_path=resource.path,
                    )
                )

            if include_license_clues and license_clues:
                license_matches = LicenseMatchFromResult.from_dicts(
                    license_match_mappings=license_clues,
                )

                # Clues are plain matches: regroup them as detections and
                # attach the file region of this resource to each.
                for group_of_matches in group_matches(license_matches=license_matches):
                    detection = LicenseDetection.from_matches(matches=group_of_matches)
                    detection.file_region = detection.get_file_region(path=resource.path)
                    resource_license_detections.append(detection)

            all_license_detections.extend(
                process_detections(detections=resource_license_detections)
            )

            if TRACE:
                logger_debug(
                    'before process_detections licenses:',
                    f'resource_license_detections: {resource_license_detections}\n',
                    f'all_license_detections: {all_license_detections}',
                )

        if has_packages:
            package_data = getattr(resource, 'package_data', []) or []

            package_license_detection_mappings = []
            for package in package_data:
                # Use .get(): a package mapping may not carry both keys.
                package_license_detection_mappings.extend(
                    package.get("license_detections") or []
                )
                package_license_detection_mappings.extend(
                    package.get("other_license_detections") or []
                )

            # Convert once per resource, after all its packages have been
            # gathered, so that no mapping is converted more than once.
            if package_license_detection_mappings:
                all_license_detections.extend(
                    detections_from_license_detection_mappings(
                        license_detection_mappings=package_license_detection_mappings,
                        file_path=resource.path,
                    )
                )

    return all_license_detections
726+
616727
617728@attr .s
618729class UniqueDetection :
@@ -624,7 +735,7 @@ class UniqueDetection:
624735 detection_count = attr .ib (default = None )
625736 matches = attr .ib (default = attr .Factory (list ))
626737 detection_log = attr .ib (default = attr .Factory (list ))
627- files = attr .ib (factory = list )
738+ file_regions = attr .ib (factory = list )
628739
629740 @classmethod
630741 def get_unique_detections (cls , license_detections ):
@@ -640,17 +751,30 @@ def get_unique_detections(cls, license_detections):
640751 detection .file_region
641752 for detection in all_detections
642753 ]
643-
644754 detection = next (iter (all_detections ))
645- detection_mapping = detection .to_dict ()
755+ detection_log = []
756+ if hasattr (detection , "detection_log" ):
757+ if detection .detection_log :
758+ detection_log .extend (detection .detection_log )
759+
760+ if not detection .license_expression :
761+ detection .license_expression = str (combine_expressions (
762+ expressions = [
763+ match .rule .license_expression
764+ for match in detection .matches
765+ ]
766+ ))
767+ detection .identifier = detection .identifier_with_expression
768+
769+
646770 unique_license_detections .append (
647771 cls (
648- identifier = detection_mapping [ " identifier" ] ,
649- license_expression = detection_mapping [ " license_expression" ] ,
650- detection_log = detection_mapping . get ( " detection_log" , []) or [],
651- matches = detection_mapping [ " matches" ] ,
772+ identifier = detection . identifier ,
773+ license_expression = detection . license_expression ,
774+ detection_log = detection_log or [],
775+ matches = detection . matches ,
652776 detection_count = len (file_regions ),
653- files = file_regions ,
777+ file_regions = file_regions ,
654778 )
655779 )
656780
@@ -660,7 +784,7 @@ def to_dict(self, license_diagnostics):
660784
661785 def dict_fields (attr , value ):
662786
663- if attr .name == 'files ' :
787+ if attr .name == 'file_regions ' :
664788 return False
665789
666790 if attr .name == 'matches' :
@@ -673,6 +797,15 @@ def dict_fields(attr, value):
673797
674798 return attr .asdict (self , filter = dict_fields )
675799
def get_license_detection_object(self):
    """
    Return a new LicenseDetection built from the license expression,
    detection log, matches and identifier of this unique detection.
    The returned detection has no file region (``file_region`` is None).
    """
    detection_attributes = dict(
        license_expression=self.license_expression,
        detection_log=self.detection_log,
        matches=self.matches,
        identifier=self.identifier,
        file_region=None,
    )
    return LicenseDetection(**detection_attributes)
808+
676809
677810def get_detections_by_id (license_detections ):
678811 """
@@ -795,6 +928,17 @@ def has_extra_words(license_matches):
795928 )
796929
797930
def has_low_rule_relevance(license_matches, threshold=None):
    """
    Return True if any of the matches in ``license_matches`` List of
    LicenseMatch objects is a match to a rule whose relevance is strictly
    less than ``threshold``.

    ``threshold`` defaults to LOW_RELEVANCE_THRESHOLD when not provided.
    Return False for an empty ``license_matches``.
    """
    if threshold is None:
        # Resolve the default at call time rather than at definition time.
        threshold = LOW_RELEVANCE_THRESHOLD

    return any(
        license_match.rule.relevance < threshold
        for license_match in license_matches
    )
940+
941+
798942def is_false_positive (license_matches , package_license = False ):
799943 """
800944 Return True if all of the matches in ``license_matches`` List of LicenseMatch
@@ -1215,6 +1359,41 @@ def get_license_keys_from_detections(license_detections, licensing=Licensing()):
12151359 return list (license_keys )
12161360
12171361
def get_ambiguous_license_detections_by_type(unique_license_detections):
    """
    Return a mapping of {detection category value: unique detection} for
    the ambiguous detections found in a list of
    ``unique_license_detections``. These need review and would be todo
    items for the reviewer.

    Each detection is assigned to at most one category, using the first of
    these checks that applies: no license expression, undetected license
    matches, "unknown" in the license expression, match coverage below
    IMPERFECT_MATCH_COVERAGE_THR, extra words, low rule relevance.

    The mapping holds a single detection per category: a later detection
    of the same category replaces an earlier one.
    """
    detections_by_category = {}

    for unique_detection in unique_license_detections:
        expression = unique_detection.license_expression
        category = None

        if not expression:
            category = DetectionCategory.MATCH_FRAGMENTS

        elif is_undetected_license_matches(license_matches=unique_detection.matches):
            category = DetectionCategory.UNDETECTED_LICENSE

        elif "unknown" in expression:
            # An "unknown" expression without unknown matches is not
            # categorized at all: the checks below are not tried for it.
            if has_unknown_matches(license_matches=unique_detection.matches):
                category = DetectionCategory.UNKNOWN_MATCH

        elif is_match_coverage_less_than_threshold(
            license_matches=unique_detection.matches,
            threshold=IMPERFECT_MATCH_COVERAGE_THR,
        ):
            category = DetectionCategory.IMPERFECT_COVERAGE

        elif has_extra_words(license_matches=unique_detection.matches):
            category = DetectionCategory.EXTRA_WORDS

        elif has_low_rule_relevance(license_matches=unique_detection.matches):
            category = DetectionCategory.LOW_RELEVANCE

        if category is not None:
            detections_by_category[category.value] = unique_detection

    return detections_by_category
1395+
1396+
12181397def analyze_detection (license_matches , package_license = False ):
12191398 """
12201399 Analyse a list of LicenseMatch objects, and determine if the license detection
0 commit comments