@@ -94,6 +94,9 @@ class DetectionCategory(Enum):
9494
9595class DetectionRule (Enum ):
9696 NOT_COMBINED = 'not-combined'
97+ UNKNOWN_MATCH = 'unknown-match'
98+ LICENSE_CLUES = 'license-clues'
99+ FALSE_POSITIVE = 'false-positive'
97100 UNKNOWN_REFERENCE_TO_LOCAL_FILE = 'unknown-reference-to-local-file'
98101 UNKNOWN_INTRO_FOLLOWED_BY_MATCH = 'unknown-intro-followed-by-match'
99102 CONTAINED_SAME_LICENSE = 'contained-with-same-license'
@@ -165,15 +168,24 @@ class LicenseDetection:
165168 )
166169
167170 @classmethod
168- def from_matches (cls , matches , analysis = None , post_scan = False ):
171+ def from_matches (
172+ cls ,
173+ matches ,
174+ analysis = None ,
175+ post_scan = False ,
176+ package_license = False ,
177+ ):
169178 """
170179 Return a LicenseDetection created out of `matches` list of LicenseMatch.
171180 """
172181 if not matches :
173182 return
174-
183+
175184 if analysis is None :
176- analysis = analyze_detection (matches )
185+ analysis = analyze_detection (
186+ license_matches = matches ,
187+ package_license = package_license
188+ )
177189
178190 reasons , license_expression = get_detected_license_expression (
179191 matches = matches ,
@@ -438,11 +450,17 @@ def is_correct_detection(license_matches):
438450 )
439451
440452
441- def is_match_coverage_less_than_threshold (license_matches , threshold ):
453+ def is_match_coverage_less_than_threshold (license_matches , threshold , any_matches = True ):
442454 """
443455 Return True if any of the matches in `license_matches` List of LicenseMatch
444456 has a `match_coverage` value below the threshold (a value between 0-100).
445457 """
458+ if not any_matches :
459+ return not any (
460+ license_match .coverage () > threshold
461+ for license_match in license_matches
462+ )
463+
446464 return any (
447465 license_match .coverage () < threshold
448466 for license_match in license_matches
@@ -480,35 +498,60 @@ def has_extra_words(license_matches):
480498 )
481499
482500
def is_false_positive(license_matches, package_license=False):
    """
    Return True if the `license_matches` list of LicenseMatch is considered
    to be made only of false positives, i.e. other text or code that was
    wrongly matched to a license rule.

    Always return False when `package_license` is True.

    Otherwise return True as soon as one of these conditions holds:

    - there is a single match and its rule identifier contains "gpl_bare";
    - every rule identifier contains "gpl" and every rule has a length of 1;
    - the smallest start line of the matches is greater than
      FALSE_POSITIVE_START_LINE_THRESHOLD and at least one rule has a length
      lower than or equal to FALSE_POSITIVE_RULE_LENGTH_THRESHOLD;
    - every rule is a license tag and every rule has a length of 1.
    """
    if package_license:
        return False

    first_start_line = min(match.start_line for match in license_matches)
    rule_lengths = [match.rule.length for match in license_matches]

    # Flags that must hold for each and every match of the list.
    only_unit_length_rules = all(length == 1 for length in rule_lengths)
    only_gpl_bare_rules = all(
        'gpl_bare' in match.rule.identifier for match in license_matches
    )
    only_gpl_rules = all(
        'gpl' in match.rule.identifier for match in license_matches
    )
    only_license_tags = all(
        match.rule.is_license_tag for match in license_matches
    )

    # A lone match to a bare GPL rule.
    if len(license_matches) == 1 and only_gpl_bare_rules:
        return True

    # Only GPL rules, each of them of length 1.
    if only_gpl_rules and only_unit_length_rules:
        return True

    # Matches starting past the line threshold with at least one short rule.
    starts_past_threshold = first_start_line > FALSE_POSITIVE_START_LINE_THRESHOLD
    if starts_past_threshold and any(
        length <= FALSE_POSITIVE_RULE_LENGTH_THRESHOLD
        for length in rule_lengths
    ):
        return True

    # Only license tags, each of them of length 1.
    return only_license_tags and only_unit_length_rules
512555
513556
514557def has_unknown_matches (license_matches ):
@@ -531,18 +574,29 @@ def is_unknown_intro(license_match):
531574
def is_license_clues(license_matches):
    """
    Return True if the `license_matches` list of LicenseMatch does not make
    a correct license detection and should be reported as mere license clues.

    This is the case when the matches are not a correct detection and either
    there are unknown matches or no match has a coverage above
    CLUES_MATCH_COVERAGE_THR.
    """
    # A correct detection is never reported as clues.
    if is_correct_detection(license_matches):
        return False

    return has_unknown_matches(license_matches) or is_match_coverage_less_than_threshold(
        license_matches=license_matches,
        threshold=CLUES_MATCH_COVERAGE_THR,
        any_matches=False,
    )
542588
543589
544590def has_unknown_intro_before_detection (license_matches ):
545591
592+ if len (license_matches ) == 1 :
593+ return False
594+
595+ if all ([
596+ is_unknown_intro (match ) for match in license_matches
597+ ]):
598+ return False
599+
546600 has_unknown_intro = False
547601 has_unknown_intro_before_detection = False
548602
@@ -552,7 +606,21 @@ def has_unknown_intro_before_detection(license_matches):
552606 continue
553607
554608 if has_unknown_intro :
555- has_unknown_intro_before_detection = True
609+ if not is_match_coverage_less_than_threshold (
610+ [match ], IMPERFECT_MATCH_COVERAGE_THR
611+ ) and not has_unknown_matches ([match ]):
612+ has_unknown_intro_before_detection = True
613+ return has_unknown_intro_before_detection
614+
615+ if has_unknown_intro :
616+ filtered_matches = filter_license_intros (license_matches )
617+ if license_matches != filtered_matches :
618+ if is_match_coverage_less_than_threshold (
619+ license_matches = filtered_matches ,
620+ threshold = IMPERFECT_MATCH_COVERAGE_THR ,
621+ any_matches = False ,
622+ ):
623+ has_unknown_intro_before_detection = True
556624
557625 return has_unknown_intro_before_detection
558626
@@ -568,7 +636,11 @@ def filter_license_intros(license_matches):
568636 license notice. In these cases, the license introduction can be discarded as
569637 this is for the license match that follows it.
570638 """
571- return [match for match in license_matches if not is_license_intro (match )]
639+ filtered_matches = [match for match in license_matches if not is_license_intro (match )]
640+ if not filtered_matches :
641+ return license_matches
642+ else :
643+ return filtered_matches
572644
573645
574646def is_license_intro (license_match ):
@@ -592,15 +664,26 @@ def is_license_reference_local_file(license_match):
592664 Return True if `license_match` LicenseMatch dict has a non-empty `referenced_filename`,
593665 i.e. contains a license reference to a local file.
594666 """
595- return bool (license_match ['referenced_filenames' ])
667+ if type (license_match ) == dict :
668+ return bool (license_match ['referenced_filenames' ])
669+ else :
670+ return bool (license_match .rule .referenced_filenames )
596671
597672
def filter_license_references(license_matches):
    """
    Return a list of LicenseMatch objects built from ``license_matches``
    without the matches that are references to local files with licenses.

    If every match is such a reference, return the ``license_matches`` list
    itself, unfiltered, rather than an empty list.
    """
    filtered_matches = []
    for license_match in license_matches:
        if is_license_reference_local_file(license_match):
            continue
        filtered_matches.append(license_match)

    if TRACE:
        logger_debug(f"detection: filter_license_references: license_matches: { license_matches } : filtered_matches: { filtered_matches } ")

    # Fall back to the original matches when filtering removed everything.
    return filtered_matches if filtered_matches else license_matches
604687
605688
606689def has_unknown_references_to_local_files (license_matches ):
@@ -615,44 +698,57 @@ def get_detected_license_expression(matches, analysis, post_scan=False):
615698 Return a tuple of (reasons, combined_expression) by combining a `matches` list of
616699 LicenseMatch objects using an `analysis` code string.
617700 """
701+ if TRACE :
702+ logger_debug (f'license_matches { matches } ' , f'package_license { analysis } ' , f'post_scan: { post_scan } ' )
703+
618704 matches_for_expression = None
619705 combined_expression = None
620706 reasons = []
621707
622- if analysis == DetectionCategory .UNDETECTED_LICENSE .value :
708+ if analysis == DetectionCategory .FALSE_POSITVE .value :
709+ reasons .append (DetectionRule .FALSE_POSITIVE .value )
710+ return reasons , combined_expression
711+
712+ elif analysis == DetectionCategory .UNDETECTED_LICENSE .value :
623713 matches_for_expression = matches
624714 reasons .append (DetectionRule .UNDETECTED_LICENSE .value )
625715
626716 elif analysis == DetectionCategory .UNKNOWN_INTRO_BEFORE_DETECTION .value :
627717 matches_for_expression = filter_license_intros (matches )
628718 reasons .append (DetectionRule .UNKNOWN_INTRO_FOLLOWED_BY_MATCH .value )
629719
630- elif analysis == DetectionCategory .UNKNOWN_FILE_REFERENCE_LOCAL .value and post_scan :
631- matches_for_expression = filter_license_references (matches )
632- reasons .append (DetectionRule .UNKNOWN_REFERENCE_TO_LOCAL_FILE .value )
720+ elif post_scan :
721+ if analysis == DetectionCategory .UNKNOWN_FILE_REFERENCE_LOCAL .value :
722+ matches_for_expression = filter_license_references (matches )
723+ reasons .append (DetectionRule .UNKNOWN_REFERENCE_TO_LOCAL_FILE .value )
633724
634- elif analysis == DetectionCategory .PACKAGE_UNKNOWN_FILE_REFERENCE_LOCAL .value and post_scan :
635- matches_for_expression = filter_license_references (matches )
636- reasons .append (DetectionRule .PACKAGE_UNKNOWN_REFERENCE_TO_LOCAL_FILE .value )
725+ elif analysis == DetectionCategory .PACKAGE_UNKNOWN_FILE_REFERENCE_LOCAL .value :
726+ matches_for_expression = filter_license_references (matches )
727+ reasons .append (DetectionRule .PACKAGE_UNKNOWN_REFERENCE_TO_LOCAL_FILE .value )
637728
638- elif analysis == DetectionCategory .PACKAGE_ADD_FROM_SIBLING_FILE and post_scan :
639- matches_for_expression = filter_license_references (matches )
640- reasons .append (DetectionRule .PACKAGE_ADD_FROM_SIBLING_FILE .value )
729+ elif analysis == DetectionCategory .PACKAGE_ADD_FROM_SIBLING_FILE . value :
730+ matches_for_expression = filter_license_references (matches )
731+ reasons .append (DetectionRule .PACKAGE_ADD_FROM_SIBLING_FILE .value )
641732
642- elif analysis == DetectionCategory .PACKAGE_ADD_FROM_FILE .value and post_scan :
643- matches_for_expression = filter_license_references (matches )
644- reasons .append (DetectionRule .PACKAGE_ADD_FROM_FILE .value )
733+ elif analysis == DetectionCategory .PACKAGE_ADD_FROM_FILE .value :
734+ matches_for_expression = filter_license_references (matches )
735+ reasons .append (DetectionRule .PACKAGE_ADD_FROM_FILE .value )
645736
646- elif (
647- analysis == DetectionCategory .UNKNOWN_MATCH .value or
648- analysis == DetectionCategory .LICENSE_CLUES .value
649- ):
737+ elif analysis == DetectionCategory .UNKNOWN_MATCH .value :
738+ reasons .append (DetectionRule .UNKNOWN_MATCH .value )
739+ return reasons , combined_expression
740+
741+ elif analysis == DetectionCategory .LICENSE_CLUES .value :
742+ reasons .append (DetectionRule .LICENSE_CLUES .value )
650743 return reasons , combined_expression
651744
652745 else :
653746 matches_for_expression = matches
654747 reasons .append (DetectionRule .NOT_COMBINED .value )
655748
749+ if TRACE :
750+ logger_debug (f'matches_for_expression: { matches_for_expression } ' , f'reasons: { reasons } ' )
751+
656752 if isinstance (matches [0 ], dict ):
657753 combined_expression = combine_expressions (
658754 expressions = [match ['license_expression' ] for match in matches_for_expression ]
@@ -662,6 +758,9 @@ def get_detected_license_expression(matches, analysis, post_scan=False):
662758 expressions = [match .rule .license_expression for match in matches_for_expression ]
663759 )
664760
761+ if TRACE :
762+ logger_debug (f'combined_expression { combined_expression } ' )
763+
665764 return reasons , combined_expression
666765
667766
@@ -792,12 +891,15 @@ def get_license_keys_from_detections(license_detections):
792891 return list (license_keys )
793892
794893
795- def analyze_detection (license_matches ):
894+ def analyze_detection (license_matches , package_license = False ):
796895 """
797896 Analyse a list of LicenseMatch objects, and determine if the license detection
798897 is correct or it is wrong/partially-correct/false-positive/has extra words or
799898 some other detection case.
800899 """
900+ if TRACE :
901+ logger_debug (f'license_matches { license_matches } ' , f'package_license { package_license } ' )
902+
801903 if is_undetected_license_matches (license_matches ):
802904 return DetectionCategory .UNDETECTED_LICENSE .value
803905
@@ -811,9 +913,12 @@ def analyze_detection(license_matches):
811913 elif is_correct_detection (license_matches ):
812914 return DetectionCategory .PERFECT_DETECTION .value
813915
814- elif is_match_coverage_less_than_threshold (
815- license_matches , CLUES_MATCH_COVERAGE_THR
816- ):
916+ # Case where the match is a false positive
917+ # In package license detection this is turned off
918+ elif not package_license and is_false_positive (license_matches , package_license ):
919+ return DetectionCategory .FALSE_POSITVE .value
920+
921+ elif is_license_clues (license_matches ):
817922 return DetectionCategory .LICENSE_CLUES .value
818923
819924 # Case where at least one of the matches have `match_coverage`
@@ -832,10 +937,6 @@ def analyze_detection(license_matches):
832937 elif has_unknown_matches (license_matches ):
833938 return DetectionCategory .UNKNOWN_MATCH .value
834939
835- # Case where the match is a false positive
836- elif is_false_positive (license_matches ):
837- return DetectionCategory .FALSE_POSITVE .value
838-
839940 # Cases where Match Coverage is a perfect 100 for all matches
840941 else :
841942 return DetectionCategory .PERFECT_DETECTION .value
@@ -1021,6 +1122,7 @@ def detect_licenses(
10211122 min_score = 0 ,
10221123 deadline = sys .maxsize ,
10231124 as_expression = False ,
1125+ package_license = False ,
10241126 ** kwargs
10251127):
10261128 """
@@ -1063,5 +1165,6 @@ def detect_licenses(
10631165 yield LicenseDetection .from_matches (
10641166 matches = group_of_matches ,
10651167 analysis = analysis ,
1066- post_scan = post_scan
1168+ post_scan = post_scan ,
1169+ package_license = package_license ,
10671170 )
0 commit comments