Skip to content

Commit ee4999c

Browse files
committed
Speed up tar packing by lowering compresslevel and creating symbolic links for same files
1 parent 447fb8e commit ee4999c

File tree

4 files changed

+149
-18
lines changed

4 files changed

+149
-18
lines changed

pkg/private/tar/build_tar.py

Lines changed: 56 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,8 @@ class TarFile(object):
4242
class DebError(Exception):
4343
pass
4444

45-
def __init__(self, output, directory, compression, compressor, create_parents, allow_dups_from_deps, default_mtime):
45+
def __init__(self, output, directory, compression, compressor, create_parents,
46+
allow_dups_from_deps, auto_deduplicate, default_mtime, compresslevel=None):
4647
# Directory prefix on all output paths
4748
d = directory.strip('/')
4849
self.directory = (d + '/') if d else None
@@ -52,6 +53,8 @@ def __init__(self, output, directory, compression, compressor, create_parents, a
5253
self.default_mtime = default_mtime
5354
self.create_parents = create_parents
5455
self.allow_dups_from_deps = allow_dups_from_deps
56+
self.compresslevel = compresslevel
57+
self.src_to_first_dest_map = {} if auto_deduplicate else None
5558

5659
def __enter__(self):
5760
self.tarfile = tar_writer.TarFileWriter(
@@ -60,7 +63,8 @@ def __enter__(self):
6063
self.compressor,
6164
self.create_parents,
6265
self.allow_dups_from_deps,
63-
default_mtime=self.default_mtime)
66+
default_mtime=self.default_mtime,
67+
compresslevel=self.compresslevel)
6468
return self
6569

6670
def __exit__(self, t, v, traceback):
@@ -98,6 +102,12 @@ def add_file(self, f, destfile, mode=None, ids=None, names=None):
98102
copied to `self.directory/destfile` in the layer.
99103
"""
100104
dest = self.normalize_path(destfile)
105+
if self.src_to_first_dest_map is not None:
106+
normalized_src = normpath(f)
107+
relative_path_to_link_to = self.auto_deduplicate(normalized_src, dest)
108+
if relative_path_to_link_to:
109+
self.add_link(dest, relative_path_to_link_to, mode=mode, ids=ids, names=names)
110+
return
101111
# If mode is unspecified, derive the mode from the file's mode.
102112
if mode is None:
103113
mode = 0o755 if os.access(f, os.X_OK) else 0o644
@@ -114,6 +124,23 @@ def add_file(self, f, destfile, mode=None, ids=None, names=None):
114124
uname=names[0],
115125
gname=names[1])
116126

127+
def auto_deduplicate(self, src_file, dest_file):
    """Decide whether `dest_file` should be a symlink to an earlier copy.

    The first destination seen for a source file is remembered in
    `self.src_to_first_dest_map`, keyed both by the path as given and by
    its `os.path.realpath`, so different spellings of (or symlinks to) the
    same file share one entry.

    Args:
      src_file: path of the source file on disk.
      dest_file: destination path of that file inside the archive.

    Returns:
      The '/'-separated path, relative to the directory of `dest_file`, of
      the first destination recorded for the same source; or None when
      de-duplication is disabled (the map is None) or `dest_file` is itself
      that first destination.
    """
    first_dest_by_src = self.src_to_first_dest_map
    if first_dest_by_src is None:
        # De-duplication was not requested.
        return None

    first_dest = first_dest_by_src.get(src_file)
    if first_dest is None:
        # Resolve symlinks only on a cache miss; setdefault keeps whichever
        # destination was registered first for the real file.
        real_src_file = os.path.realpath(src_file)
        first_dest = first_dest_by_src.setdefault(real_src_file, dest_file)
        first_dest_by_src[src_file] = first_dest

    if first_dest == dest_file:
        return None

    link_target = os.path.relpath(first_dest, os.path.dirname(dest_file))
    # os.path.relpath joins with the host separator (backslash on Windows);
    # the link target is stored in the archive, so keep it POSIX-style.
    return link_target.replace(os.path.sep, '/')
142+
143+
117144
def add_empty_file(self,
118145
destfile,
119146
mode=None,
@@ -269,13 +296,13 @@ def add_tree(self, tree_top, destpath, mode=None, ids=None, names=None):
269296
for dir in dirs:
270297
to_write[dest_dir + dir] = None
271298
for file in sorted(files):
272-
content_path = os.path.abspath(os.path.join(root, file))
299+
content_path = os.path.join(root, file)
273300
if os.name == "nt":
274301
# "To specify an extended-length path, use the `\\?\` prefix. For
275302
# example, `\\?\D:\very long path`."[1]
276303
#
277304
# [1]: https://learn.microsoft.com/en-us/windows/win32/fileio/maximum-file-path-limitation
278-
to_write[dest_dir + file] = "\\\\?\\" + content_path
305+
to_write[dest_dir + file] = "\\\\?\\" + os.path.abspath(content_path)
279306
else:
280307
to_write[dest_dir + file] = content_path
281308

@@ -297,6 +324,10 @@ def add_tree(self, tree_top, destpath, mode=None, ids=None, names=None):
297324
f_mode = 0o755 if os.access(content_path, os.X_OK) else 0o644
298325
else:
299326
f_mode = mode
327+
relative_path_to_link_to = self.auto_deduplicate(content_path, dest)
328+
if relative_path_to_link_to:
329+
self.add_link(dest, relative_path_to_link_to, mode=f_mode, ids=ids, names=names)
330+
continue
300331
self.tarfile.add_file(
301332
path,
302333
file_content=content_path,
@@ -345,7 +376,7 @@ def main():
345376
fromfile_prefix_chars='@')
346377
parser.add_argument('--output', required=True,
347378
help='The output file, mandatory.')
348-
parser.add_argument('--manifest',
379+
parser.add_argument('--manifest', action='append',
349380
help='manifest of contents to add to the layer.')
350381
parser.add_argument('--mode',
351382
help='Force the mode on the added files (in octal).')
@@ -359,7 +390,7 @@ def main():
359390
parser.add_argument('--deb', action='append',
360391
help='A debian package to add to the layer')
361392
parser.add_argument(
362-
'--directory',
393+
'--directory', action='append',
363394
help='Directory in which to store the file inside the layer')
364395

365396
compression = parser.add_mutually_exclusive_group()
@@ -397,6 +428,12 @@ def main():
397428
parser.add_argument('--allow_dups_from_deps',
398429
action='store_true',
399430
help='')
431+
parser.add_argument('--auto_deduplicate',
432+
action='store_true',
433+
help='Auto create symlinks for files mapped from a same source in manifests.')
434+
parser.add_argument(
435+
'--compresslevel', default='',
436+
help='Specify the numeric compress level in gzip mode; may be 0-9 or empty(6).')
400437
options = parser.parse_args()
401438

402439
# Parse modes arguments
@@ -443,12 +480,14 @@ def main():
443480
# Add objects to the tar file
444481
with TarFile(
445482
options.output,
446-
directory = helpers.GetFlagValue(options.directory),
483+
directory = helpers.GetFlagValue(options.directory[0]),
447484
compression = options.compression,
448485
compressor = options.compressor,
449486
default_mtime=default_mtime,
450487
create_parents=options.create_parents,
451-
allow_dups_from_deps=options.allow_dups_from_deps) as output:
488+
allow_dups_from_deps=options.allow_dups_from_deps,
489+
auto_deduplicate=options.auto_deduplicate,
490+
compresslevel = options.compresslevel) as output:
452491

453492
def file_attributes(filename):
454493
if filename.startswith('/'):
@@ -459,12 +498,19 @@ def file_attributes(filename):
459498
'names': names_map.get(filename, default_ownername),
460499
}
461500

462-
if options.manifest:
463-
with open(options.manifest, 'r') as manifest_fp:
501+
normalized_first_directory = output.directory
502+
manifest_list = zip(options.directory, options.manifest)
503+
if options.auto_deduplicate:
504+
manifest_list = list(manifest_list)[::-1]
505+
for directory, manifest_path in manifest_list:
506+
directory = helpers.GetFlagValue(directory)
507+
output.directory = (directory.strip('/') + '/') if directory.strip('/') else None
508+
with open(manifest_path, 'r') as manifest_fp:
464509
manifest_entries = manifest.read_entries_from(manifest_fp)
465510
for entry in manifest_entries:
466511
output.add_manifest_entry(entry, file_attributes)
467512

513+
output.directory = normalized_first_directory
468514
for tar in options.tar or []:
469515
output.add_tar(tar)
470516
for deb in options.deb or []:

pkg/private/tar/tar.bzl

Lines changed: 87 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,13 @@ SUPPORTED_TAR_COMPRESSIONS = (
4040
_DEFAULT_MTIME = -1
4141
_stamp_condition = Label("//pkg/private:private_stamp_detect")
4242

43+
# Returned by pkg_tar (only when merge_mappings is set) and by pkg_tar_group so
# that a downstream pkg_tar can re-apply the manifests of its deps.
MappingManifestInfo = provider(
    "Path mapping to pack files",
    fields = {
        "manifest": "a list of (expanded package_dir, manifest file, file deps) tuples.",
    },
)
49+
4350
def _remap(remap_paths, path):
4451
"""If path starts with a key in remap_paths, rewrite it."""
4552
for prefix, replacement in remap_paths.items():
@@ -69,8 +76,11 @@ def _pkg_tar_impl(ctx):
6976
if ctx.attr.package_dir_file:
7077
if ctx.attr.package_dir:
7178
fail("Both package_dir and package_dir_file attributes were specified")
79+
if ctx.attr.merge_mappings:
80+
fail("Can not merge tarball mappings when package_dir_file is specified")
7281
args.add("--directory", "@" + ctx.file.package_dir_file.path)
7382
files.append(ctx.file.package_dir_file)
83+
package_dir_expanded = None
7484
else:
7585
package_dir_expanded = substitute_package_variables(ctx, ctx.attr.package_dir)
7686
args.add("--directory", package_dir_expanded or "/")
@@ -114,6 +124,10 @@ def _pkg_tar_impl(ctx):
114124
"--owner_names",
115125
"%s=%s" % (_quote(key), ctx.attr.ownernames[key]),
116126
)
127+
if ctx.attr.compresslevel:
128+
args.add("--compresslevel", ctx.attr.compresslevel)
129+
if ctx.attr.auto_deduplicate:
130+
args.add("--auto_deduplicate")
117131

118132
# Now we begin processing the files.
119133
path_mapper = None
@@ -151,8 +165,6 @@ def _pkg_tar_impl(ctx):
151165
add_empty_file(mapping_context, empty_file, ctx.label)
152166
for empty_dir in ctx.attr.empty_dirs or []:
153167
add_directory(mapping_context, empty_dir, ctx.label)
154-
for f in ctx.files.deps:
155-
args.add("--tar", f.path)
156168
for link in ctx.attr.symlinks:
157169
add_symlink(
158170
mapping_context,
@@ -170,6 +182,29 @@ def _pkg_tar_impl(ctx):
170182
write_manifest(ctx, manifest_file, mapping_context.content_map)
171183
args.add("--manifest", manifest_file.path)
172184

185+
does_merge_mappings = ctx.attr.merge_mappings
186+
new_dir_prefix = package_dir_expanded + "/" if package_dir_expanded else ""
187+
manifest_list = [(package_dir_expanded, manifest_file, mapping_context.file_deps)]
188+
file_dep_set = {}
189+
for dep_i in ctx.attr.deps:
190+
if does_merge_mappings and (MappingManifestInfo in dep_i):
191+
for i_dir, i_manifest, i_file_deps in dep_i[MappingManifestInfo].manifest:
192+
i_dir = new_dir_prefix + (i_dir or "")
193+
args.add("--directory", i_dir)
194+
args.add("--manifest", i_manifest.path)
195+
files.append(i_manifest)
196+
for i in i_file_deps:
197+
file_dep_set[i] = 1
198+
manifest_list.append((i_dir, i_manifest, i_file_deps))
199+
else:
200+
for dep_file in dep_i.files.to_list():
201+
if does_merge_mappings and dep_file.path.startswith("bazel-out/"):
202+
fail("Please avoid depending on generated .tar directly: " + dep_file.path)
203+
args.add("--tar", dep_file.path)
204+
files += dep_i.files.to_list()
205+
for i in mapping_context.file_deps:
206+
file_dep_set[i] = 1
207+
173208
args.set_param_file_format("flag_per_line")
174209
args.use_param_file("@%s", use_always = False)
175210

@@ -180,8 +215,8 @@ def _pkg_tar_impl(ctx):
180215
args.add("--allow_dups_from_deps")
181216

182217
inputs = depset(
183-
direct = ctx.files.deps + files,
184-
transitive = mapping_context.file_deps,
218+
direct = files,
219+
transitive = list(file_dep_set.keys()),
185220
)
186221

187222
ctx.actions.run(
@@ -212,7 +247,11 @@ def _pkg_tar_impl(ctx):
212247
OutputGroupInfo(
213248
manifest = [manifest_file],
214249
),
215-
]
250+
] + ([
251+
MappingManifestInfo(
252+
manifest = manifest_list,
253+
),
254+
] if does_merge_mappings else [])
216255

217256
# A rule for creating a tar file, see README.md
218257
pkg_tar_impl = rule(
@@ -256,6 +295,14 @@ pkg_tar_impl = rule(
256295
"extension": attr.string(default = "tar"),
257296
"symlinks": attr.string_dict(),
258297
"empty_files": attr.string_list(),
298+
"merge_mappings": attr.bool(
299+
doc = """Repack tar files in `deps` by re-applying their manifest files.""",
300+
default = False,
301+
),
302+
"auto_deduplicate": attr.bool(
303+
doc = """Auto create symlinks for files mapped from a same source in manifests.""",
304+
default = False,
305+
),
259306
"include_runfiles": attr.bool(
260307
doc = ("""Include runfiles for executables. These appear as they would in bazel-bin."""
261308
+ """For example: 'path/to/myprog.runfiles/path/to/my_data.txt'."""),
@@ -272,6 +319,10 @@ pkg_tar_impl = rule(
272319
),
273320
"create_parents": attr.bool(default = True),
274321
"allow_duplicates_from_deps": attr.bool(default = False),
322+
"compresslevel": attr.string(
323+
doc = """Specify the numeric compress level in gzip mode; may be 0-9 or empty (6).""",
324+
default = "",
325+
),
275326

276327
# Common attributes
277328
"out": attr.output(mandatory = True),
@@ -342,3 +393,34 @@ def pkg_tar(name, **kwargs):
342393
}),
343394
**kwargs
344395
)
396+
397+
def _pkg_tar_group_impl(ctx):
    """Collects the output files and mapping manifests of every target in `srcs`.

    Fails when the group mixes targets that carry mapping manifests with
    targets that do not (plain tar files, or targets whose manifest list is
    empty).

    Args:
      ctx: the rule context.

    Returns:
      A DefaultInfo with all files of `srcs` and a MappingManifestInfo with
      the concatenated manifest tuples.
    """
    manifest_list = []
    output_files = []

    # Files coming from srcs that contribute no manifest at all.
    unmapped_files = []
    for src in ctx.attr.srcs:
        src_files = src.files.to_list()
        output_files += src_files
        src_manifests = src[MappingManifestInfo].manifest if MappingManifestInfo in src else []
        if src_manifests:
            manifest_list += src_manifests
        else:
            unmapped_files += src_files

    # One src may contribute several manifests for a single output file, so
    # comparing len(manifest_list) with len(output_files) can miss a mixed
    # group; check for manifest-less files directly instead.
    if manifest_list and unmapped_files:
        fail("Can not merge generated tar files and source ones; please split into different groups.")
    return [
        DefaultInfo(
            files = depset(output_files),
        ),
        MappingManifestInfo(
            manifest = manifest_list,
        ),
    ]
414+
415+
# Rule that bundles several tar targets into one target; it forwards their
# files and always provides MappingManifestInfo (see _pkg_tar_group_impl).
pkg_tar_group = rule(
    implementation = _pkg_tar_group_impl,
    provides = [MappingManifestInfo],
    attrs = {
        # The tar targets to group; restricted to tar_filetype files.
        "srcs": attr.label_list(
            allow_files = tar_filetype,
            mandatory = True,
            doc = "Tar files generated by pkg_tar().",
        ),
    },
    doc = "Expose a group of source tar files.",
)

pkg/private/tar/tar_writer.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,8 @@ def __init__(self,
4949
create_parents=False,
5050
allow_dups_from_deps=True,
5151
default_mtime=None,
52-
preserve_tar_mtimes=True):
52+
preserve_tar_mtimes=True,
53+
compresslevel=None):
5354
"""TarFileWriter wraps tarfile.open().
5455
5556
Args:
@@ -86,10 +87,11 @@ def __init__(self,
8687
else:
8788
mode = 'w:'
8889
if compression in ['tgz', 'gz']:
90+
compresslevel = int(compresslevel) if compresslevel or compresslevel == 0 else 6
8991
# The Tarfile class doesn't allow us to specify gzip's mtime attribute.
9092
# Instead, we manually reimplement gzopen from tarfile.py and set mtime.
9193
self.fileobj = gzip.GzipFile(
92-
filename=name, mode='w', compresslevel=6, mtime=self.default_mtime)
94+
filename=name, mode='w', compresslevel=compresslevel, mtime=self.default_mtime)
9395
self.compressor_proc = None
9496
if self.compressor_cmd:
9597
mode = 'w|'

pkg/tar.bzl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
# limitations under the License.
1414
"""Forwarder for pkg_tar."""
1515

16-
load("//pkg/private/tar:tar.bzl", _pkg_tar = "pkg_tar")
16+
load("//pkg/private/tar:tar.bzl", _pkg_tar = "pkg_tar", _pkg_tar_group = "pkg_tar_group")
1717

1818
pkg_tar = _pkg_tar
19+
pkg_tar_group = _pkg_tar_group

0 commit comments

Comments
 (0)