|
1 | 1 | #!/usr/bin/python3 |
2 | 2 | # generate debian/copyright from debian/copyright.template and node_modules |
3 | | -# Author: Martin Pitt <mpitt@debian.org> |
4 | | -# Allison Karlitskaya <allison.karlitskaya@redhat.com> |
| 3 | +# |
| 4 | +# Copyright (C) 2025 Red Hat, Inc. |
| 5 | +# SPDX-License-Identifier: LGPL-2.1-or-later |
5 | 6 |
|
6 | | -import argparse |
7 | | -import gzip |
8 | | -import os |
| 7 | +import json |
9 | 8 | import re |
10 | 9 | import sys |
11 | | -import time |
12 | | -from typing import Dict, Set |
13 | | - |
14 | | -BASE_DIR = os.path.realpath(f'{__file__}/../..') |
15 | | -TEMPLATE_FILE = f'{BASE_DIR}/tools/debian/copyright.template' |
16 | | - |
17 | | - |
18 | | -own_copyright = f"Copyright (C) 2013 - {time.strftime('%Y')} Red Hat, Inc." |
19 | | - |
20 | | -license_patterns = { |
21 | | - # Common patterns |
22 | | - r'\bMIT\b': ['MIT'], |
23 | | - |
24 | | - # https://github.com/focus-trap/focus-trap/blob/master/LICENSE |
25 | | - r'\bfocus-trap\b': ['MIT'], |
26 | | -} |
| 10 | +import tarfile |
| 11 | +from pathlib import Path |
| 12 | +from typing import Any |
27 | 13 |
|
28 | | -copyright_patterns = { |
29 | | - # Common patterns |
30 | | - r'Copyright (.*)$': [r'\1'], |
31 | | - r'\(c\) (.*)$': [r'\1'], |
32 | 14 |
|
33 | | - # https://github.com/focus-trap/focus-trap/blob/master/LICENSE |
34 | | - r'\bfocus-trap\b': ['2015-2016 David Clark'], |
35 | | -} |
36 | | - |
37 | | -used_patterns = set() |
38 | | - |
39 | | - |
40 | | -def parse_args(): |
41 | | - p = argparse.ArgumentParser(description='Generate debian/copyright file from template and node_modules') |
42 | | - return p.parse_args() |
43 | | - |
44 | | - |
def template_licenses(template: str) -> set[str]:
    """Return the set of License: short names declared in the template.

    Every line that starts with ``License:`` contributes the rest of that
    line, stripped of surrounding whitespace and lower-cased.  Lines that
    carry no name after the colon are ignored instead of raising.

    Args:
        template: full text of the copyright template.

    Returns:
        Lower-cased license short names found in the template.
    """
    prefix = 'License:'
    names = (
        # slice instead of split() so "License:MIT" and trailing blanks are handled
        line[len(prefix):].strip().lower()
        for line in template.splitlines()
        if line.startswith(prefix)
    )
    return {name for name in names if name}
| 22 | + |
| 23 | + |
# Patterns for skipping invalid copyright statements.
# Each one is searched case-insensitively against the text that follows the
# word "Copyright" on a line.
skip_patterns = [
    # Generic license template text ("copyright owner", "copyright notice", ...)
    r'^owner', r'^holder', r'^license', r'^notice', r'^statement',
    # Word-bounded so that names such as "Andrew" or "Lawrence" are not skipped
    r'^laws?\b', r'^and\b', r'^or\b', r'^the\b',
    # Template year placeholders
    r'\[yyyy\]', r'\{yyyy\}',
    # Just a year or just numbers/punctuation
    r'^\d{4}\s*$', r'^[\d\s,;.\-]+$',
    # Incomplete copyright statements: years followed only by rights boilerplate
    r'^[\d\s,;.\-]+(?:All Rights(?: Reserved)?|Reserved)\.?$',
]


def find_copyright_in_license_text(content: str) -> set[str]:
    """Heuristically extract copyrights from LICENSE file content.

    Every line containing "Copyright" (any case) is considered; an optional
    "(c)" or "©" marker directly after it is dropped, and the remainder of
    the line is the candidate statement.

    Args:
        content: decoded text of a license file.

    Returns:
        The distinct candidate statements that pass the validity check.
    """
    def is_valid_copyright(text: str) -> bool:
        """Check if copyright text is valid"""
        if len(text) >= 200:
            return False
        if any(re.search(pattern, text, re.IGNORECASE) for pattern in skip_patterns):
            return False
        # Only accept if it looks like an actual copyright (has year or name)
        return re.search(r'\d{4}', text) is not None or len(text.split()) >= 2

    candidates = (
        match.group(1).strip()
        for match in re.finditer(
            r'Copyright\s+(?:(?:\(c\)|©)\s*)?(.+)$', content, re.MULTILINE | re.IGNORECASE)
    )
    return {text for text in candidates if is_valid_copyright(text)}
| 51 | + |
| 52 | + |
def normalize_spdx_license(license_text: str, license_ids: set[str]) -> str:
    """Normalize license to SPDX identifier using pattern-based substitutions.

    The text is split on the conjunction "and" (any case); each part is
    normalized on its own and must be present in ``license_ids``.

    Args:
        license_text: license string, e.g. a package.json "license" value.
        license_ids: lower-cased license short names that are defined.

    Returns:
        The normalized parts joined with lowercase " and ", per the Debian spec:
        https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/#license-short-name

    Raises:
        SystemExit: if a normalized part is not in ``license_ids``.
    """
    def normalize_part(part: str) -> str:
        """Normalize a single license identifier"""
        part = part.strip().strip('()')
        # Strip " License" suffix (e.g., "ISC License" -> "ISC")
        part = re.sub(r'\s+License$', '', part)
        # Convert e.g. "GPL-2.0+" → "GPL-2.0-or-later"
        part = re.sub(r'\+$', '-or-later', part)
        # Normalize specific licenses
        part = part.replace('MIT/X11', 'MIT')
        part = re.sub(r'^Apache[- ]2(?:\.0)?$', 'Apache-2.0', part)
        part = re.sub(r'^BSD$', 'BSD-3-Clause', part)
        part = re.sub(r'^Python-2\.0\.1$', 'Python-2.0', part)
        # Handle -only suffix (e.g., "LGPL-2.1" -> "LGPL-2.1-only")
        if re.match(r'^(L?GPL)-\d+\.\d+$', part):
            part += '-only'
        return part

    normalized_parts: list[str] = []
    # license_text itself is left untouched so the error below can quote it verbatim
    for part in re.split(r'\s+and\s+', license_text, flags=re.IGNORECASE):
        normalized = normalize_part(part)
        if normalized.lower() not in license_ids:
            sys.exit(f"ERROR: License '{normalized}' is not defined in the template\n"
                     f"Original license text: '{license_text}'\n"
                     "Please add a License: paragraph for this license to the template.")
        normalized_parts.append(normalized)

    return ' and '.join(normalized_parts)
| 89 | + |
| 90 | + |
| 91 | +def extract_author_name(author: str | dict[str, Any]) -> str: |
| 92 | + """Extract author name from package.json author field""" |
| 93 | + if isinstance(author, dict): |
| 94 | + return str(author.get('name', '')) |
| 95 | + # Parse "Name <email>" format |
| 96 | + return re.sub(r'\s*<[^>]+>\s*', '', str(author)).strip() |
| 97 | + |
| 98 | + |
def get_legalese(tarball_path: Path, license_ids: set[str]) -> dict[str, tuple[str, set[str]]]:
    """Extract licenses and copyrights from node_modules tarball.

    Every regular file below ``node_modules/`` is inspected.  ``package.json``
    files contribute their "license" and "author" fields; files named like a
    license file contribute the copyright statements found in their text.
    Everything is grouped by the first path component after ``node_modules/``.

    Args:
        tarball_path: tar archive containing a ``node_modules/`` tree.
        license_ids: lower-cased license short names that are defined.

    Returns: {toplevel -> (license, copyrights)}; only toplevels for which at
        least one license was found are included.

    Raises:
        SystemExit: from normalize_spdx_license() for an undefined license.
    """
    licenses: dict[str, set[str]] = {}  # {toplevel -> set of license_texts}
    copyrights: dict[str, set[str]] = {}  # {toplevel -> copyrights}

    license_filenames = {'LICENSE', 'LICENSE.md', 'LICENSE.txt', 'COPYING', 'COPYING.txt'}

    with tarfile.open(tarball_path) as tar:
        for member in tar.getmembers():
            if not member.isfile() or not member.name.startswith('node_modules/'):
                continue

            # toplevel package name (second component after node_modules/ prefix)
            toplevel = member.name.split('/')[1]
            basename = Path(member.name).name
            if basename != 'package.json' and basename not in license_filenames:
                continue

            stream = tar.extractfile(member)
            if stream is None:
                continue
            with stream:
                raw = stream.read()

            if basename == 'package.json':
                # Parse package.json and extract license and author
                try:
                    pkg_data = json.loads(raw)
                except ValueError as exc:
                    # covers invalid JSON as well as undecodable bytes
                    print(f"WARNING: skipping unparseable {member.name}: {exc}", file=sys.stderr)
                    continue
                if not isinstance(pkg_data, dict):
                    continue

                pkg_license = pkg_data.get('license')
                if isinstance(pkg_license, dict):
                    # object form {"type": "MIT", "url": "..."}; a dict could not be stored in a set
                    pkg_license = pkg_license.get('type')
                if isinstance(pkg_license, str) and pkg_license:
                    licenses.setdefault(toplevel, set()).add(pkg_license)

                author = pkg_data.get('author')
                if isinstance(author, (str, dict)):
                    author_name = extract_author_name(author)
                    if author_name:
                        copyrights.setdefault(toplevel, set()).add(author_name)
            else:
                # Process license file and extract copyrights directly; undecodable
                # bytes are replaced rather than aborting the whole run
                content = raw.decode('utf-8', errors='replace')
                found = find_copyright_in_license_text(content)
                if found:
                    copyrights.setdefault(toplevel, set()).update(found)

    # Build package legal info, merging licenses with " and "
    packages_legal: dict[str, tuple[str, set[str]]] = {}
    for toplevel, license_texts in licenses.items():
        # Normalize, then merge on the level of single identifiers so that a
        # license named by several package.json files appears only once
        identifiers = {
            identifier
            for text in license_texts
            for identifier in normalize_spdx_license(text, license_ids).split(' and ')
        }
        license_text = ' and '.join(sorted(identifiers))
        # Last-resort fallback if no copyright found (missing or empty set)
        holders = copyrights.get(toplevel) or {f"Authors of {toplevel}"}
        packages_legal[toplevel] = (license_text, holders)

    return packages_legal
62 | 146 |
|
63 | | - return results |
64 | 147 |
|
65 | 148 | # |
66 | 149 | # main |
67 | 150 | # |
68 | 151 |
|
if len(sys.argv) != 3:
    sys.exit(f"Usage: {sys.argv[0]} <copyright-template> <node-cache-tarball>")

template_file = Path(sys.argv[1])
node_cache_path = Path(sys.argv[2])

# read as UTF-8 regardless of the locale the build runs in
template = template_file.read_text(encoding='utf-8')
license_ids = template_licenses(template)
packages_legal = get_legalese(node_cache_path, license_ids)

# Generate one Files paragraph per toplevel package, in sorted order
paragraphs: list[str] = []
for toplevel in sorted(packages_legal):
    license_text, copyrights = packages_legal[toplevel]
    # continuation lines of the Copyright field are indented
    copyright_text = '\n '.join(sorted(copyrights))
    paragraphs.append(f"Files: node/{toplevel}/*\nCopyright: {copyright_text}\nLicense: {license_text}")

# Assemble copyright file: any template line containing #NPM is replaced by
# the generated paragraphs, all other lines are passed through
npm_content = '\n\n'.join(paragraphs)
output = '\n'.join(npm_content if '#NPM' in line else line for line in template.splitlines())

# force UTF-8 output, even when running in C locale
sys.stdout.buffer.write(output.encode('utf-8') + b'\n')
sys.stdout.buffer.flush()
0 commit comments