Skip to content

Commit 5afa8f2

Browse files
committed
Get origin info from multiple package data
* Combine all detected origin info from multiple package data and use the resulting values in the summary
* Create new test for multiple package data summarization

Reference: #2842
Signed-off-by: Jono Yang <jyang@nexb.com>
1 parent 20cd6c0 commit 5afa8f2

File tree

9 files changed

+1113
-51
lines changed

9 files changed

+1113
-51
lines changed

src/summarycode/summarizer.py

Lines changed: 81 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,9 @@
1313
import fingerprints
1414
from commoncode.cliutils import POST_SCAN_GROUP, PluggableCommandLineOption
1515
from plugincode.post_scan import PostScanPlugin, post_scan_impl
16-
16+
from license_expression import Licensing
1717
from cluecode.copyrights import CopyrightDetector
18+
from packagedcode.utils import combine_expressions
1819
from summarycode.copyright_summary import canonical_holder
1920
from summarycode.score import (compute_license_score,
2021
get_field_values_from_codebase_resources,
@@ -91,7 +92,7 @@ def process_codebase(self, codebase, summary, **kwargs):
9192
]
9293

9394
# Determine declared license expression, declared holder, and primary language from Package data
94-
declared_license_expression, declared_holder, primary_language = get_origin_info_from_package_data(key_file_package_data, programming_language_summary)
95+
declared_license_expression, declared_holder, primary_language = get_origin_info_from_package_data(key_file_package_data)
9596

9697
if declared_license_expression:
9798
scoring_elements, _ = compute_license_score(codebase)
@@ -340,18 +341,18 @@ def get_declared_holder(codebase, holders_summary):
340341
for entry in holders_summary if entry['value']
341342
}
342343
key_file_holders = get_field_values_from_codebase_resources(codebase, 'holders', key_files_only=True)
343-
key_file_holders = [
344-
fingerprints.generate(entry['holder'])
345-
for entry in key_file_holders
346-
]
347-
unique_key_file_holders = unique(key_file_holders)
344+
entry_by_key_file_holders = {
345+
fingerprints.generate(entry['holder']): entry
346+
for entry in key_file_holders if entry['holder']
347+
}
348+
unique_key_file_holders = unique(entry_by_key_file_holders.keys())
349+
unique_key_file_holders_entries = [entry_by_holders[holder] for holder in unique_key_file_holders]
348350

349351
holder_by_counts = defaultdict(list)
350-
for holder in unique_key_file_holders:
351-
entry = entry_by_holders.get(holder) or {}
352-
count = entry.get('count')
352+
for holder_entry in unique_key_file_holders_entries:
353+
count = holder_entry.get('count')
353354
if count:
354-
holder = entry.get('value')
355+
holder = holder_entry.get('value')
355356
holder_by_counts[count].append(holder)
356357

357358
declared_holder = ''
@@ -381,56 +382,64 @@ def get_primary_language(programming_language_summary):
381382
return primary_language
382383

383384

384-
def get_origin_info_from_package_data(key_file_package_data, programming_language_summary):
385+
def get_origin_info_from_package_data(key_file_package_data):
385386
"""
386387
Return a 3-tuple containing the strings of declared license expression,
387388
copyright holder, and primary programming language from a list of detected
388389
package data.
389390
"""
390-
counts_by_programming_languages = {
391-
entry['value']: entry['count']
392-
for entry in programming_language_summary
393-
}
394-
packages_by_primary_languages = {
395-
package['primary_language']: package
396-
for package in key_file_package_data if package['primary_language']
397-
}
398-
399-
# We pick the package data to report as the origin information based on the
400-
# primary language of the packages
401-
# We will use the package whose primary language occurs most often in our codebase
402-
highest_count = 0
403-
top_package = None
404-
for package_primary_language, package in packages_by_primary_languages.items():
405-
count = counts_by_programming_languages.get(package_primary_language) or 0
406-
if count > highest_count:
407-
highest_count = count
408-
top_package = package
409-
410-
if not top_package:
391+
if not key_file_package_data:
411392
return '', '', ''
412393

413-
package = top_package
394+
if len(key_file_package_data) > 1:
395+
license_expressions = []
396+
programming_languages = []
397+
copyrights = []
398+
parties = []
399+
for package_data in key_file_package_data:
400+
license_expression = package_data.get('license_expression') or ''
401+
programming_language = package_data.get('primary_language') or ''
402+
copyright_statement = package_data.get('copyright') or ''
403+
package_parties = package_data.get('parties', [])
404+
license_expressions.append(license_expression)
405+
programming_languages.append(programming_language)
406+
copyrights.append(copyright_statement)
407+
parties.extend(package_parties)
408+
409+
# Combine license expressions
410+
unique_license_expressions = unique(license_expressions)
411+
combined_declared_license_expression = combine_expressions(unique_license_expressions)
412+
declared_license_expression = ''
413+
if combined_declared_license_expression:
414+
declared_license_expression = str(Licensing().parse(combined_declared_license_expression).simplify())
415+
416+
# Combine holders
417+
holders = list(get_holders_from_copyright(copyrights))
418+
declared_holder = ''
419+
if holders:
420+
declared_holder = ', '.join(holders)
421+
elif parties:
422+
party_members = [party['name'] for party in parties]
423+
declared_holder = ', '.join(party_members)
424+
425+
# Programming language
426+
unique_programming_languages = unique(programming_languages)
427+
primary_language = ''
428+
if len(unique_programming_languages) == 1:
429+
primary_language = unique_programming_languages[0]
430+
431+
return declared_license_expression, declared_holder, primary_language
432+
433+
package = key_file_package_data[0]
414434
declared_license_expression = package.get('license_expression') or ''
415435
package_primary_language = package.get('primary_language') or ''
416436

417437
# Determine declared holder from Package copyright statement
418438
package_copyright = package.get('copyright', '')
419439
package_holders = []
420440
if package_copyright:
421-
numbered_lines = [(0, package_copyright)]
422-
423-
holder_detections = CopyrightDetector().detect(
424-
numbered_lines,
425-
include_copyrights=False,
426-
include_holders=True,
427-
include_authors=False,
428-
)
429-
430-
for holder_detection in holder_detections:
431-
package_holders.append(holder_detection.holder)
441+
package_holders = list(get_holders_from_copyright(package_copyright))
432442

433-
declared_holder = ''
434443
if package_holders:
435444
declared_holder = ', '.join(package_holders)
436445
else:
@@ -442,3 +451,29 @@ def get_origin_info_from_package_data(key_file_package_data, programming_languag
442451
declared_holder = ', '.join(party_members)
443452

444453
return declared_license_expression, declared_holder, package_primary_language
454+
455+
456+
def get_holders_from_copyright(copyright):
    """
    Yield each holder detected in `copyright`.

    `copyright` is either a single copyright statement or a list of
    statements. A list is handed to the detector one entry per line, numbered
    by its position in the list; any other value is handed over as a single
    line numbered 0.
    """
    if isinstance(copyright, list):
        lines = list(enumerate(copyright))
    else:
        lines = [(0, copyright)]

    # Only holder detections are requested; copyright statements and authors
    # are switched off.
    detections = CopyrightDetector().detect(
        lines,
        include_copyrights=False,
        include_holders=True,
        include_authors=False,
    )
    yield from (detection.holder for detection in detections)

0 commit comments

Comments
 (0)