Skip to content

Commit 37f62b0

Browse files
martinpitt authored and jelly committed
debian: Auto-generate debian/copyright
Now that we ship the node modules tarball as part of the source, we need to document its copyrights and licenses. Update tools/build-debian-copyright to the version in cockpit-{files,podman} (we can drop their copies and import from cockpit afterwards), and adjust the copyright template accordingly: add missing licenses and update the comment about `dist/`.
1 parent 009e792 commit 37f62b0

File tree

3 files changed

+224
-135
lines changed

3 files changed

+224
-135
lines changed

Makefile.am

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -69,9 +69,15 @@ $(NODE_CACHE): $(srcdir)/package-lock.json
6969
$(AM_V_GEN) $(srcdir)/tools/node-modules runtime-tar $(CURDIR)/$(NODE_CACHE)
7070

7171
dist-hook: $(distdir)/tools/debian/copyright
72-
# wildcard gymnastics for distcheck: no node_modules/ in distdir
73-
$(distdir)/tools/debian/copyright: $(DIST_STAMP) $(if $(wildcard $(srcdir)/node_modules/.package-lock.json),$(NODE_CACHE))
74-
$(AM_V_GEN) NODE_ENV=$(NODE_ENV) $(srcdir)/tools/build-debian-copyright > $@
72+
# When building from a git checkout (node_modules/ is present), generate the
# copyright file from the template and the node_modules tarball.
# When building from a tarball (also separate build tree, dist-git), we can't
# rely on node_modules, so just copy the file that the tarball already ships.
#
# $(AM_V_GEN) must be the first thing on the recipe line: with V=0 it expands
# to `@echo "  GEN  " $@;`, which only works when make (not the shell) sees the
# leading `@`. Using it inside the shell `if` would try to run a command
# literally named "@echo".
$(distdir)/tools/debian/copyright: $(srcdir)/tools/debian/copyright.template $(DIST_STAMP) $(if $(wildcard $(srcdir)/node_modules/.package-lock.json),$(NODE_CACHE))
	$(AM_V_GEN) if [ -e "$(srcdir)/node_modules/.package-lock.json" ]; then \
	    $(srcdir)/tools/build-debian-copyright $< $(NODE_CACHE) > $@; \
	else \
	    cp "$(srcdir)/tools/debian/copyright" $@; \
	fi
7581

7682
DISTCHECK_CONFIGURE_FLAGS = --enable-prefix-only $(EXTRA_DISTCHECK_CONFIGURE_FLAGS)
7783

tools/build-debian-copyright

Lines changed: 153 additions & 126 deletions
Original file line numberDiff line numberDiff line change
@@ -1,144 +1,171 @@
11
#!/usr/bin/python3
22
# generate debian/copyright from debian/copyright.template and node_modules
3-
# Author: Martin Pitt <mpitt@debian.org>
4-
# Allison Karlitskaya <allison.karlitskaya@redhat.com>
3+
#
4+
# Copyright (C) 2025 Red Hat, Inc.
5+
# SPDX-License-Identifier: LGPL-2.1-or-later
56

6-
import argparse
7-
import gzip
8-
import os
7+
import json
98
import re
109
import sys
11-
import time
12-
from typing import Dict, Set
13-
14-
BASE_DIR = os.path.realpath(f'{__file__}/../..')
15-
TEMPLATE_FILE = f'{BASE_DIR}/tools/debian/copyright.template'
16-
17-
18-
own_copyright = f"Copyright (C) 2013 - {time.strftime('%Y')} Red Hat, Inc."
19-
20-
license_patterns = {
21-
# Common patterns
22-
r'\bMIT\b': ['MIT'],
23-
24-
# https://github.com/focus-trap/focus-trap/blob/master/LICENSE
25-
r'\bfocus-trap\b': ['MIT'],
26-
}
10+
import tarfile
11+
from pathlib import Path
12+
from typing import Any
2713

28-
copyright_patterns = {
29-
# Common patterns
30-
r'Copyright (.*)$': [r'\1'],
31-
r'\(c\) (.*)$': [r'\1'],
3214

33-
# https://github.com/focus-trap/focus-trap/blob/master/LICENSE
34-
r'\bfocus-trap\b': ['2015-2016 David Clark'],
35-
}
36-
37-
used_patterns = set()
38-
39-
40-
def parse_args():
41-
p = argparse.ArgumentParser(description='Generate debian/copyright file from template and node_modules')
42-
return p.parse_args()
43-
44-
45-
def template_licenses(template):
15+
def template_licenses(template: str) -> set[str]:
    """Return set of existing License: short names

    Every line of *template* that starts with ``License:`` contributes the
    lower-cased text following the colon. Surrounding whitespace is ignored,
    and lines without any short name (a bare ``License:``) are skipped instead
    of raising IndexError.
    """
    prefix = 'License:'
    short_names = (
        line[len(prefix):].strip().lower()
        for line in template.splitlines()
        if line.startswith(prefix)
    )
    return {name for name in short_names if name}
22+
23+
24+
# Patterns for skipping invalid copyright statements.
# They are applied case-insensitively to the text that follows "Copyright".
skip_patterns = [
    # Generic license template text. The noun patterns deliberately have no
    # trailing word boundary so that plurals/inflections ("holders",
    # "licensed", "notices") are caught as well. "and" and "law" get a boundary
    # so that real names such as "Andrew ..." or "Lawrence ..." are not
    # discarded.
    r'^owner', r'^holder', r'^license', r'^notice', r'^statement', r'^laws?\b', r'^and\b', r'^or\b', r'^the\b',
    # Template year placeholders
    r'\[yyyy\]', r'\{yyyy\}',
    # Just a year or just numbers/punctuation
    r'^\d{4}\s*$', r'^[\d\s,;.\-]+$',
    # Incomplete copyright statements
    r'^[\d\s,;.\-]+(All Rights|Reserved)\.?$',
]
35+
36+
37+
def find_copyright_in_license_text(content: str) -> set[str]:
    """Heuristically extract copyrights from LICENSE file content

    Looks for "Copyright <text>" (case-insensitive, optionally followed by
    "(c)" or "©") and returns the set of stripped <text> values that pass the
    validity heuristics. The statement must be on a single line: the
    whitespace after "Copyright" may not span a line break, so a sentence that
    merely ends in "copyright" does not pull in the following line.
    """
    def is_valid_copyright(text: str) -> bool:
        """Check if copyright text is valid"""
        if any(re.search(pattern, text, re.IGNORECASE) for pattern in skip_patterns):
            return False
        # Only accept if it looks like an actual copyright (has year or name)
        has_year = re.search(r'\d{4}', text) is not None
        return (has_year or len(text.split()) >= 2) and len(text) < 200

    # [^\S\r\n] is "whitespace except line breaks"
    statement = re.compile(
        r'Copyright[^\S\r\n]+(?:(?:\(c\)|©)[^\S\r\n]*)*(.+)$',
        re.MULTILINE | re.IGNORECASE,
    )

    candidates = (match.group(1).strip() for match in statement.finditer(content))
    return {text for text in candidates if is_valid_copyright(text)}
51+
52+
53+
def normalize_spdx_license(license_text: str, license_ids: set[str]) -> str:
    """Normalize license to SPDX identifier using pattern-based substitutions

    *license_text* is a single license name or several names joined by "AND"
    (any letter case). Each name is normalized individually and the result is
    joined with lowercase " and ", as required by the Debian copyright format:
    https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/#license-short-name

    *license_ids* is the set of lower-cased license short names known to the
    template. The script exits with an error message (SystemExit) if any
    normalized name is not in that set; the message quotes the unmodified
    input text.
    """
    def normalize_part(part: str) -> str:
        """Normalize a single license identifier"""
        # Drop grouping parentheses and any padding inside them ("( MIT )")
        part = part.strip().strip('()').strip()
        # Strip " License" suffix (e.g., "ISC License" -> "ISC")
        part = re.sub(r'\s+License$', '', part)
        # Convert e.g. "GPL-2.0+" → "GPL-2.0-or-later"
        part = re.sub(r'\+$', '-or-later', part)
        # Normalize specific licenses
        part = part.replace('MIT/X11', 'MIT')
        part = re.sub(r'^Apache[- ]2(?:\.0)?$', 'Apache-2.0', part)
        part = re.sub(r'^BSD$', 'BSD-3-Clause', part)
        part = re.sub(r'^Python-2\.0\.1$', 'Python-2.0', part)
        # Handle -only suffix (e.g., "LGPL-2.1" -> "LGPL-2.1-only") for the
        # whole GPL family, including AGPL
        if re.match(r'^[AL]?GPL-\d+\.\d+$', part):
            part += '-only'
        return part

    normalized_parts: list[str] = []
    # Split "and"ed licenses, regardless of case and amount of whitespace
    for part in re.split(r'\s+and\s+', license_text, flags=re.IGNORECASE):
        normalized = normalize_part(part)
        if normalized.lower() not in license_ids:
            sys.exit(f"ERROR: License '{normalized}' is not defined in the template\n"
                     f"Original license text: '{license_text}'\n"
                     "Please add a License: paragraph for this license to the template.")
        normalized_parts.append(normalized)

    return ' and '.join(normalized_parts)
89+
90+
91+
def extract_author_name(author: str | dict[str, Any]) -> str:
92+
"""Extract author name from package.json author field"""
93+
if isinstance(author, dict):
94+
return str(author.get('name', ''))
95+
# Parse "Name <email>" format
96+
return re.sub(r'\s*<[^>]+>\s*', '', str(author)).strip()
97+
98+
99+
def get_legalese(tarball_path: Path, license_ids: set[str]) -> dict[str, tuple[str, set[str]]]:
    """Extract licenses and copyrights from node_modules tarball.

    Every regular file below ``node_modules/`` in the tarball is considered:

    - ``package.json`` files contribute their "license" field (a string, or the
      legacy ``{"type": ...}`` object) and their "author" name;
    - well-known license files (LICENSE, COPYING, ...) contribute the copyright
      statements found in their text.

    Everything is grouped by the first path component after ``node_modules/``.
    Packages for which no license was found are omitted. The script exits
    (SystemExit) on a license that is not listed in *license_ids* or on a
    "license" field of an unsupported type.

    Returns: {toplevel -> (license, copyrights)}
    """
    licenses: dict[str, set[str]] = {}  # {toplevel -> set of license_texts}
    copyrights: dict[str, set[str]] = {}  # {toplevel -> copyrights}

    license_filenames = {'LICENSE', 'LICENSE.md', 'LICENSE.txt', 'COPYING', 'COPYING.txt'}

    with tarfile.open(tarball_path) as tar:
        for member in tar.getmembers():
            if not member.isfile() or not member.name.startswith('node_modules/'):
                continue

            # toplevel package name (second component after node_modules/ prefix)
            toplevel = member.name.split('/')[1]
            basename = Path(member.name).name

            is_package_json = basename == 'package.json'
            if not is_package_json and basename not in license_filenames:
                continue

            extracted = tar.extractfile(member)
            if extracted is None:
                # not expected for regular files; nothing to read in that case
                continue

            with extracted as f:
                if is_package_json:
                    # Parse package.json and extract license and author
                    pkg_data = json.load(f)
                    if not isinstance(pkg_data, dict):
                        # not a package description, carries no legal information
                        continue

                    pkg_license = pkg_data.get('license')
                    if isinstance(pkg_license, dict):
                        # legacy form: {"type": "MIT", "url": "..."}
                        pkg_license = pkg_license.get('type')
                    if pkg_license:
                        if not isinstance(pkg_license, str):
                            sys.exit(f"ERROR: unsupported license field in {member.name}: {pkg_license!r}")
                        licenses.setdefault(toplevel, set()).add(pkg_license)

                    if author := pkg_data.get('author'):
                        if author_name := extract_author_name(author):
                            copyrights.setdefault(toplevel, set()).add(author_name)
                else:
                    # Process license file and extract copyrights directly;
                    # tolerate files which are not valid UTF-8
                    content = f.read().decode('utf-8', errors='replace')
                    found = find_copyright_in_license_text(content)
                    if found:
                        copyrights.setdefault(toplevel, set()).update(found)

    # Build package legal info, merging licenses with " and "
    packages_legal: dict[str, tuple[str, set[str]]] = {}
    for toplevel, license_texts in licenses.items():
        # Normalize and merge licenses
        normalized = sorted({normalize_spdx_license(lic, license_ids) for lic in license_texts})
        # Last-resort fallback if no copyright found; `or` also covers an
        # empty set so that the Copyright field is never left blank
        holders = copyrights.get(toplevel) or {f"Authors of {toplevel}"}
        packages_legal[toplevel] = (' and '.join(normalized), holders)

    return packages_legal
62146

63-
return results
64147

65148
#
66149
# main
67150
#
68151

152+
# Exactly two arguments are expected: the copyright template and the
# node_modules cache tarball
if len(sys.argv) != 3:
    sys.exit(f"Usage: {sys.argv[0]} <copyright-template> <node-cache-tarball>")

template_file = Path(sys.argv[1])
node_cache_path = Path(sys.argv[2])

# Read the template as UTF-8 explicitly, so that the result does not depend on
# the locale of the build environment
template = template_file.read_text(encoding='utf-8')
license_ids = template_licenses(template)
76-
77-
# scan dist/ bundles for third-party copyrights and licenses
78-
79-
dist_copyrights: Dict[str, Set[str]] = {} # Files: dirglob → set(copyrights)
80-
dist_licenses: Dict[str, Set[str]] = {} # Files: dirglob → set(licenses)
81-
82-
for directory, _subdirs, files in os.walk(f'{BASE_DIR}/dist'):
83-
for file in files:
84-
if '.LEGAL.txt' not in file:
85-
continue
86-
87-
full_filename = os.path.join(directory, file)
88-
directory_glob = os.path.relpath(directory, start=BASE_DIR) + '/*'
89-
90-
if file.endswith('.gz'):
91-
with gzip.open(full_filename, 'rt') as license_file_gz:
92-
contents = license_file_gz.read()
93-
else:
94-
with open(full_filename, 'rt') as license_file:
95-
contents = license_file.read()
96-
97-
for comment in contents.split('\n\n'):
98-
if (comment.strip() == "" or "Bundled license information:" in comment):
99-
continue
100-
101-
licenses = find_patterns(license_patterns, comment)
102-
if not licenses:
103-
raise SystemError('Can not determine licenses of:\n%s' % comment)
104-
for license_id in licenses:
105-
if license_id.lower() not in license_ids:
106-
raise KeyError(f'License {license_id} not found in {TEMPLATE_FILE}')
107-
108-
# All bundles also contain our own code
109-
licenses.add("LGPL-2.1-or-later")
110-
111-
dist_licenses.setdefault(directory_glob, set()).update(licenses)
112-
113-
copyrights = find_patterns(copyright_patterns, comment)
114-
if not copyrights:
115-
raise SystemError('Did not find any copyrights in:\n%s' % comment)
116-
117-
# All bundles also contain our own code
118-
copyrights.add(own_copyright)
119-
120-
dist_copyrights.setdefault(directory_glob, set()).update(copyrights)
121-
122-
for pattern in set.union(set(license_patterns), set(copyright_patterns)):
123-
if pattern not in used_patterns:
124-
# We'll have no LEGAL.txt files in that dev builds
125-
# so of course we won't use any of the patterns
126-
if os.getenv('NODE_ENV') == 'development' or os.getenv('IGNORE_UNUSED_PATTERNS'):
127-
continue
128-
129-
sys.exit(f'build-debian-copyright: Unused pattern: {pattern}')
130-
131-
paragraphs = []
132-
for dirglob in sorted(dist_copyrights):
133-
paragraphs.append("Files: {0}\nCopyright: {1}\nLicense: {2}".format(
134-
dirglob,
135-
'\n '.join(sorted(dist_copyrights[dirglob])),
136-
' and '.join(sorted(dist_licenses[dirglob]))))
137-
138-
# force UTF-8 output, even when running in C locale
139-
for line in template.splitlines():
140-
if '#NPM' in line:
141-
sys.stdout.buffer.write('\n\n'.join(paragraphs).encode())
142-
else:
143-
sys.stdout.buffer.write(line.encode())
144-
sys.stdout.buffer.write(b'\n')
160+
packages_legal = get_legalese(node_cache_path, license_ids)

# Generate one "Files:" paragraph per top-level package, in sorted order so
# that the output is reproducible
paragraphs: list[str] = []
for toplevel, (license_text, copyrights) in sorted(packages_legal.items()):
    # continuation lines of a multi-line Copyright field start with a space
    copyright_text = '\n '.join(sorted(copyrights))
    paragraphs.append(f"Files: node/{toplevel}/*\nCopyright: {copyright_text}\nLicense: {license_text}")

# Assemble copyright file: the template line containing the #NPM marker is
# replaced by the generated paragraphs, all other lines are copied verbatim
npm_content = '\n\n'.join(paragraphs)
output = '\n'.join(npm_content if '#NPM' in line else line for line in template.splitlines())

# force UTF-8 output, even when running in C locale
sys.stdout.buffer.write(output.encode('utf-8') + b'\n')

tools/debian/copyright.template

Lines changed: 62 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,10 @@ Upstream-Name: cockpit
33
Source: https://github.com/cockpit-project/cockpit
44
Comment:
55
This does not directly cover the files in dist/*. These are "minified" and
6-
compressed JavaScript/HTML files built from pkg/* and node_modules/*. Their
7-
copyrights and licenses are described below. Rebuilding these requires
8-
internet access as that process needs to download additional npm modules from
9-
the Internet, thus upstream ships the pre-minified bundles as part of the
10-
upstream release tarball so that the package can be built without internet
11-
access and lots of extra unpackaged build dependencies.
6+
compressed JavaScript/HTML files built from the main source and node_modules/
7+
(from the -node orig tarball). Upstream releases ship a pre-built dist/
8+
bundle, but the Debian package removes and rebuilds it. All input sources are
9+
documented below (node_modules/* is auto-generated).
1210

1311
Files: *
1412
Copyright: 2013-2016 Red Hat, Inc.
@@ -115,3 +113,61 @@ License: MIT-IBM-immunity
115113
DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER ARISING
116114
OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE, EVEN
117115
IF IBM IS APPRISED OF THE POSSIBILITY OF SUCH DAMAGES.
116+
117+
License: Apache-2.0
118+
On Debian systems, the complete text of the Apache License version 2.0
119+
can be found in "/usr/share/common-licenses/Apache-2.0".
120+
121+
License: BSD-3-Clause
122+
Redistribution and use in source and binary forms, with or without
123+
modification, are permitted provided that the following conditions are met:
124+
.
125+
1. Redistributions of source code must retain the above copyright notice,
126+
this list of conditions and the following disclaimer.
127+
2. Redistributions in binary form must reproduce the above copyright
128+
notice, this list of conditions and the following disclaimer in the
129+
documentation and/or other materials provided with the distribution.
130+
3. Neither the name of the copyright holder nor the names of its
131+
contributors may be used to endorse or promote products derived from
132+
this software without specific prior written permission.
133+
.
134+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
135+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
136+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
137+
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
138+
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
139+
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
140+
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
141+
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
142+
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
143+
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
144+
POSSIBILITY OF SUCH DAMAGE.
145+
146+
License: ISC
147+
Permission to use, copy, modify, and/or distribute this software for any
148+
purpose with or without fee is hereby granted, provided that the above
149+
copyright notice and this permission notice appear in all copies.
150+
.
151+
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
152+
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
153+
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
154+
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
155+
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
156+
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
157+
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
158+
159+
License: 0BSD
160+
Permission to use, copy, modify, and/or distribute this software for any
161+
purpose with or without fee is hereby granted.
162+
.
163+
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
164+
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
165+
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
166+
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
167+
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
168+
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
169+
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
170+
171+
License: Python-2.0
172+
On Debian systems, the complete text of the Python License version 2.0
173+
can be found in "/usr/share/common-licenses/Python-2.0".

0 commit comments

Comments
 (0)