Skip to content

Commit e3b30bc

Browse files
ellishg and thevinster authored
[lld][InstrProf] Profile guided function order (#96268)
Add the lld flags `--irpgo-profile-sort=<profile>` and `--compression-sort={function,data,both}` to order functions to improve startup time, and functions or data to improve compressed size, respectively. We use Balanced Partitioning to determine the best section order using traces from IRPGO profiles (see https://discourse.llvm.org/t/rfc-temporal-profiling-extension-for-irpgo/68068 for details) to improve startup time and using hashes of section contents to improve compressed size. In our recent LLVM talk (https://www.youtube.com/watch?v=yd4pbSTjwuA), we showed that this can reduce page faults during startup by 40% on a large iOS app and we can reduce compressed size by 0.8-3%. More details can be found in https://dl.acm.org/doi/10.1145/3660635 --------- Co-authored-by: Vincent Lee <thevinster@users.noreply.github.com>
1 parent b42fe67 commit e3b30bc

10 files changed

+740
-1
lines changed

lld/MachO/BPSectionOrderer.cpp

Lines changed: 413 additions & 0 deletions
Large diffs are not rendered by default.

lld/MachO/BPSectionOrderer.h

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
//===- BPSectionOrderer.h ---------------------------------------*- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
///
9+
/// This file uses Balanced Partitioning to order sections to improve startup
10+
/// time and compressed size.
11+
///
12+
//===----------------------------------------------------------------------===//
13+
14+
#ifndef LLD_MACHO_BPSECTION_ORDERER_H
15+
#define LLD_MACHO_BPSECTION_ORDERER_H
16+
17+
#include "llvm/ADT/DenseMap.h"
18+
#include "llvm/ADT/StringRef.h"
19+
20+
namespace lld::macho {
21+
22+
class InputSection;
23+
24+
/// Run Balanced Partitioning to find the optimal function and data order to
25+
/// improve startup time and compressed size.
26+
///
27+
/// It is important that .subsections_via_symbols is used to ensure functions
28+
/// and data are in their own sections and thus can be reordered.
///
/// \param highestAvailablePriority priority counter supplied by the caller. It
///        is taken by non-const reference, so it may be updated.
///        NOTE(review): the exact update rule lives in BPSectionOrderer.cpp,
///        which is not shown here -- confirm.
/// \param profilePath value of --irpgo-profile-sort=. The caller also invokes
///        this function with an empty path when only --compression-sort= is
///        in effect.
/// \param forFunctionCompression true for --compression-sort=function or
///        --compression-sort=both.
/// \param forDataCompression true for --compression-sort=data or
///        --compression-sort=both.
/// \param verbose true when --verbose-bp-section-orderer is passed.
/// \returns a map from input section to priority; the caller uses it as the
///          result of PriorityBuilder::buildInputSectionPriorities().
29+
llvm::DenseMap<const lld::macho::InputSection *, size_t>
30+
runBalancedPartitioning(size_t &highestAvailablePriority,
31+
llvm::StringRef profilePath,
32+
bool forFunctionCompression, bool forDataCompression,
33+
bool verbose);
34+
35+
} // namespace lld::macho
36+
37+
#endif

lld/MachO/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ add_lld_library(lldMachO
2525
OutputSection.cpp
2626
OutputSegment.cpp
2727
Relocations.cpp
28+
BPSectionOrderer.cpp
2829
SectionPriorities.cpp
2930
Sections.cpp
3031
SymbolTable.cpp
@@ -48,6 +49,7 @@ add_lld_library(lldMachO
4849
Object
4950
Option
5051
Passes
52+
ProfileData
5153
Support
5254
TargetParser
5355
TextAPI

lld/MachO/Config.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,11 @@ struct Configuration {
217217
bool callGraphProfileSort = false;
218218
llvm::StringRef printSymbolOrder;
219219

220+
llvm::StringRef irpgoProfileSortProfilePath;
221+
bool functionOrderForCompression = false;
222+
bool dataOrderForCompression = false;
223+
bool verboseBpSectionOrderer = false;
224+
220225
SectionRenameMap sectionRenameMap;
221226
SegmentRenameMap segmentRenameMap;
222227

lld/MachO/Driver.cpp

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1750,6 +1750,34 @@ bool link(ArrayRef<const char *> argsArr, llvm::raw_ostream &stdoutOS,
17501750
OPT_no_warn_thin_archive_missing_members, true);
17511751
config->generateUuid = !args.hasArg(OPT_no_uuid);
17521752

1753+
// Helper shared by the --irpgo-profile-sort= and --compression-sort= handling
// below: reports "<firstArgStr> is incompatible with <spelling>" when
// call-graph profile sorting is enabled in the config AND the
// --call-graph-profile-sort option is actually present on the command line.
auto IncompatWithCGSort = [&](StringRef firstArgStr) {
1754+
// Throw an error only if --call-graph-profile-sort is explicitly specified
1755+
if (config->callGraphProfileSort)
1756+
// getLastArgNoClaim: look the option up without marking it as claimed.
if (const Arg *arg = args.getLastArgNoClaim(OPT_call_graph_profile_sort))
1757+
error(firstArgStr + " is incompatible with " + arg->getSpelling());
1758+
};
1759+
if (const Arg *arg = args.getLastArg(OPT_irpgo_profile_sort)) {
1760+
config->irpgoProfileSortProfilePath = arg->getValue();
1761+
IncompatWithCGSort(arg->getSpelling());
1762+
}
1763+
if (const Arg *arg = args.getLastArg(OPT_compression_sort)) {
1764+
StringRef compressionSortStr = arg->getValue();
1765+
if (compressionSortStr == "function") {
1766+
config->functionOrderForCompression = true;
1767+
} else if (compressionSortStr == "data") {
1768+
config->dataOrderForCompression = true;
1769+
} else if (compressionSortStr == "both") {
1770+
config->functionOrderForCompression = true;
1771+
config->dataOrderForCompression = true;
1772+
} else if (compressionSortStr != "none") {
1773+
error("unknown value `" + compressionSortStr + "` for " +
1774+
arg->getSpelling());
1775+
}
1776+
if (compressionSortStr != "none")
1777+
IncompatWithCGSort(arg->getSpelling());
1778+
}
1779+
config->verboseBpSectionOrderer = args.hasArg(OPT_verbose_bp_section_orderer);
1780+
17531781
for (const Arg *arg : args.filtered(OPT_alias)) {
17541782
config->aliasedSymbols.push_back(
17551783
std::make_pair(arg->getValue(0), arg->getValue(1)));

lld/MachO/Options.td

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,16 @@ def no_call_graph_profile_sort : Flag<["--"], "no-call-graph-profile-sort">,
126126
def print_symbol_order_eq: Joined<["--"], "print-symbol-order=">,
127127
HelpText<"Print a symbol order specified by --call-graph-profile-sort into the specified file">,
128128
Group<grp_lld>;
129+
// Balanced-partitioning section ordering options.
//
// --irpgo-profile-sort=<profile>: the driver stores the value in
// config->irpgoProfileSortProfilePath and rejects the combination with an
// explicitly specified --call-graph-profile-sort.
def irpgo_profile_sort: Joined<["--"], "irpgo-profile-sort=">,
130+
MetaVarName<"<profile>">,
131+
HelpText<"Read the IRPGO profile at <profile> to order sections to improve startup time">,
132+
Group<grp_lld>;
133+
// --compression-sort=<mode>: the driver accepts none, function, data or both
// and reports "unknown value" for anything else. Every value except none is
// also rejected together with an explicit --call-graph-profile-sort.
def compression_sort: Joined<["--"], "compression-sort=">,
134+
MetaVarName<"[none,function,data,both]">,
135+
HelpText<"Order sections to improve compressed size">, Group<grp_lld>;
136+
// --verbose-bp-section-orderer: sets config->verboseBpSectionOrderer.
def verbose_bp_section_orderer: Flag<["--"], "verbose-bp-section-orderer">,
137+
HelpText<"Print information on how many sections were ordered by balanced partitioning and a measure of the expected number of page faults">,
138+
Group<grp_lld>;
129139
def ignore_auto_link_option : Separate<["--"], "ignore-auto-link-option">,
130140
Group<grp_lld>;
131141
def ignore_auto_link_option_eq : Joined<["--"], "ignore-auto-link-option=">,

lld/MachO/SectionPriorities.cpp

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
//===----------------------------------------------------------------------===//
1313

1414
#include "SectionPriorities.h"
15+
#include "BPSectionOrderer.h"
1516
#include "Config.h"
1617
#include "InputFiles.h"
1718
#include "Symbols.h"
@@ -352,7 +353,14 @@ void macho::PriorityBuilder::parseOrderFile(StringRef path) {
352353
DenseMap<const InputSection *, size_t>
353354
macho::PriorityBuilder::buildInputSectionPriorities() {
354355
DenseMap<const InputSection *, size_t> sectionPriorities;
355-
if (config->callGraphProfileSort) {
356+
if (!config->irpgoProfileSortProfilePath.empty() ||
357+
config->functionOrderForCompression || config->dataOrderForCompression) {
358+
TimeTraceScope timeScope("Balanced Partitioning Section Orderer");
359+
sectionPriorities = runBalancedPartitioning(
360+
highestAvailablePriority, config->irpgoProfileSortProfilePath,
361+
config->functionOrderForCompression, config->dataOrderForCompression,
362+
config->verboseBpSectionOrderer);
363+
} else if (config->callGraphProfileSort) {
356364
// Sort sections by the profile data provided by __LLVM,__cg_profile
357365
// sections.
358366
//
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# RUN: not %lld -o /dev/null --irpgo-profile-sort=%s --call-graph-profile-sort 2>&1 | FileCheck %s --check-prefix=IRPGO-ERR
2+
# IRPGO-ERR: --irpgo-profile-sort= is incompatible with --call-graph-profile-sort
3+
4+
# RUN: not %lld -o /dev/null --compression-sort=function --call-graph-profile-sort %s 2>&1 | FileCheck %s --check-prefix=COMPRESSION-ERR
5+
# COMPRESSION-ERR: --compression-sort= is incompatible with --call-graph-profile-sort
6+
7+
# RUN: not %lld -o /dev/null --compression-sort=malformed 2>&1 | FileCheck %s --check-prefix=COMPRESSION-MALFORM
8+
# COMPRESSION-MALFORM: unknown value `malformed` for --compression-sort=
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
# REQUIRES: aarch64
2+
3+
# Generate a large test case and check that the output is deterministic.
4+
5+
# RUN: %python %s %t.s %t.proftext
6+
7+
# RUN: llvm-mc -filetype=obj -triple=arm64-apple-darwin %t.s -o %t.o
8+
# RUN: llvm-profdata merge %t.proftext -o %t.profdata
9+
10+
# RUN: %lld -arch arm64 -lSystem -e _main --icf=all -o - %t.o --irpgo-profile-sort=%t.profdata --compression-sort=both | llvm-nm --numeric-sort --format=just-symbols - > %t.order1.txt
11+
# RUN: %lld -arch arm64 -lSystem -e _main --icf=all -o - %t.o --irpgo-profile-sort=%t.profdata --compression-sort=both | llvm-nm --numeric-sort --format=just-symbols - > %t.order2.txt
12+
# RUN: diff %t.order1.txt %t.order2.txt
13+
14+
import random
15+
import sys
16+
17+
# Output paths come from the RUN line: argv[1] is the assembly file to
# generate, argv[2] is the text-format profile to generate.
assembly_filepath = sys.argv[1]
18+
proftext_filepath = sys.argv[2]
19+
20+
# Fixed seed so the shuffled traces below are the same on every run.
random.seed(1234)
21+
num_functions = 1000
22+
num_data = 100
23+
num_traces = 10
24+
25+
function_names = [f"f{n}" for n in range(num_functions)]
26+
data_names = [f"d{n}" for n in range(num_data)]
27+
# Only the first half of the functions appear in the profile.
profiled_functions = function_names[: int(num_functions / 2)]
28+
29+
# One small function per name. The immediates are derived from the index, and
# each body references its own symbol via adrp.
function_contents = [
30+
f"""
31+
{name}:
32+
add w0, w0, #{i % 4096}
33+
add w1, w1, #{i % 10}
34+
add w2, w0, #{i % 20}
35+
adrp x3, {name}@PAGE
36+
ret
37+
"""
38+
for i, name in enumerate(function_names)
39+
]
40+
41+
# One data symbol per name: a short string built from i % 2, i % 3 and i % 5,
# followed by an .xword holding the symbol's own address.
data_contents = [
42+
f"""
43+
{name}:
44+
.ascii "s{i % 2}-{i % 3}-{i % 5}"
45+
.xword {name}
46+
"""
47+
for i, name in enumerate(data_names)
48+
]
49+
50+
# Each trace has weight 1 and lists every profiled function in shuffled order
# (random.sample over the full list length returns a permutation).
trace_contents = [
51+
f"""
52+
# Weight
53+
1
54+
{", ".join(random.sample(profiled_functions, len(profiled_functions)))}
55+
"""
56+
for i in range(num_traces)
57+
]
58+
59+
# One profile record per profiled function: the index is used as the function
# hash, with a single counter whose value is 1.
profile_contents = [
60+
f"""
61+
{name}
62+
# Func Hash:
63+
{i}
64+
# Num Counters:
65+
1
66+
# Counter Values:
67+
1
68+
"""
69+
for i, name in enumerate(profiled_functions)
70+
]
71+
72+
# Emit the assembly input: _main, then all generated functions in .text, then
# all generated data in .data, ending with .subsections_via_symbols.
with open(assembly_filepath, "w") as f:
73+
f.write(
74+
f"""
75+
.text
76+
.globl _main
77+
78+
_main:
79+
ret
80+
81+
{"".join(function_contents)}
82+
83+
.data
84+
{"".join(data_contents)}
85+
86+
.subsections_via_symbols
87+
"""
88+
)
89+
90+
# Emit the text profile: the header flags, the temporal traces, then the
# per-function records.
with open(proftext_filepath, "w") as f:
91+
f.write(
92+
f"""
93+
:ir
94+
:temporal_prof_traces
95+
96+
# Num Traces
97+
{num_traces}
98+
# Trace Stream Size:
99+
{num_traces}
100+
101+
{"".join(trace_contents)}
102+
103+
{"".join(profile_contents)}
104+
"""
105+
)

lld/test/MachO/bp-section-orderer.s

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
# REQUIRES: aarch64
2+
3+
# RUN: rm -rf %t && split-file %s %t
4+
# RUN: llvm-mc -filetype=obj -triple=arm64-apple-darwin %t/a.s -o %t/a.o
5+
# RUN: llvm-profdata merge %t/a.proftext -o %t/a.profdata
6+
7+
# RUN: %lld -arch arm64 -lSystem -e _main -o %t/a.out %t/a.o --irpgo-profile-sort=%t/a.profdata --verbose-bp-section-orderer 2>&1 | FileCheck %s --check-prefix=STARTUP
8+
# RUN: %lld -arch arm64 -lSystem -e _main -o %t/a.out %t/a.o --irpgo-profile-sort=%t/a.profdata --verbose-bp-section-orderer --icf=all --compression-sort=none 2>&1 | FileCheck %s --check-prefix=STARTUP
9+
10+
# STARTUP: Ordered 3 sections using balanced partitioning
11+
12+
# RUN: %lld -arch arm64 -lSystem -e _main -o - %t/a.o --irpgo-profile-sort=%t/a.profdata -order_file %t/a.orderfile | llvm-nm --numeric-sort --format=just-symbols - | FileCheck %s --check-prefix=ORDERFILE
13+
14+
# ORDERFILE: A
15+
# ORDERFILE: F
16+
# ORDERFILE: E
17+
# ORDERFILE: D
18+
# ORDERFILE-DAG: _main
19+
# ORDERFILE-DAG: _B
20+
# ORDERFILE-DAG: l_C
21+
# ORDERFILE-DAG: s1
22+
# ORDERFILE-DAG: s2
23+
# ORDERFILE-DAG: r1
24+
# ORDERFILE-DAG: r2
25+
26+
# RUN: %lld -arch arm64 -lSystem -e _main -o %t/a.out %t/a.o --verbose-bp-section-orderer --compression-sort=function 2>&1 | FileCheck %s --check-prefix=COMPRESSION-FUNC
27+
# RUN: %lld -arch arm64 -lSystem -e _main -o %t/a.out %t/a.o --verbose-bp-section-orderer --compression-sort=data 2>&1 | FileCheck %s --check-prefix=COMPRESSION-DATA
28+
# RUN: %lld -arch arm64 -lSystem -e _main -o %t/a.out %t/a.o --verbose-bp-section-orderer --compression-sort=both 2>&1 | FileCheck %s --check-prefix=COMPRESSION-BOTH
29+
# RUN: %lld -arch arm64 -lSystem -e _main -o %t/a.out %t/a.o --verbose-bp-section-orderer --compression-sort=both --irpgo-profile-sort=%t/a.profdata 2>&1 | FileCheck %s --check-prefix=COMPRESSION-BOTH
30+
31+
# COMPRESSION-FUNC: Ordered 7 sections using balanced partitioning
32+
# COMPRESSION-DATA: Ordered 4 sections using balanced partitioning
33+
# COMPRESSION-BOTH: Ordered 11 sections using balanced partitioning
34+
35+
#--- a.s
36+
.text
37+
.globl _main, A, _B, l_C.__uniq.111111111111111111111111111111111111111.llvm.2222222222222222222
38+
39+
_main:
40+
ret
41+
A:
42+
ret
43+
_B:
44+
add w0, w0, #1
45+
bl A
46+
ret
47+
l_C.__uniq.111111111111111111111111111111111111111.llvm.2222222222222222222:
48+
add w0, w0, #2
49+
bl A
50+
ret
51+
D:
52+
add w0, w0, #2
53+
bl _B
54+
ret
55+
E:
56+
add w0, w0, #2
57+
bl l_C.__uniq.111111111111111111111111111111111111111.llvm.2222222222222222222
58+
ret
59+
F:
60+
add w0, w0, #3
61+
bl l_C.__uniq.111111111111111111111111111111111111111.llvm.2222222222222222222
62+
ret
63+
64+
.data
65+
s1:
66+
.ascii "hello world"
67+
s2:
68+
.ascii "i am a string"
69+
r1:
70+
.quad s1
71+
r2:
72+
.quad r1
73+
74+
.subsections_via_symbols
75+
76+
#--- a.proftext
77+
:ir
78+
:temporal_prof_traces
79+
# Num Traces
80+
1
81+
# Trace Stream Size:
82+
1
83+
# Weight
84+
1
85+
A, B, C.__uniq.555555555555555555555555555555555555555.llvm.6666666666666666666
86+
87+
A
88+
# Func Hash:
89+
1111
90+
# Num Counters:
91+
1
92+
# Counter Values:
93+
1
94+
95+
B
96+
# Func Hash:
97+
2222
98+
# Num Counters:
99+
1
100+
# Counter Values:
101+
1
102+
103+
C.__uniq.555555555555555555555555555555555555555.llvm.6666666666666666666
104+
# Func Hash:
105+
3333
106+
# Num Counters:
107+
1
108+
# Counter Values:
109+
1
110+
111+
D
112+
# Func Hash:
113+
4444
114+
# Num Counters:
115+
1
116+
# Counter Values:
117+
1
118+
119+
#--- a.orderfile
120+
A
121+
F
122+
E
123+
D

0 commit comments

Comments
 (0)