Skip to content

[lld][InstrProf] Profile guided function order #96268

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Jul 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
413 changes: 413 additions & 0 deletions lld/MachO/BPSectionOrderer.cpp

Large diffs are not rendered by default.

37 changes: 37 additions & 0 deletions lld/MachO/BPSectionOrderer.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
//===- BPSectionOrderer.h ---------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// This file uses Balanced Partitioning to order sections to improve startup
/// time and compressed size.
///
//===----------------------------------------------------------------------===//

#ifndef LLD_MACHO_BPSECTION_ORDERER_H
#define LLD_MACHO_BPSECTION_ORDERER_H

#include "llvm/ADT/DenseMap.h"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For IWYU also include headers for StringRef and InputSection. Forward declarations can be used as well.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I already have a forward declaration for InputSection, but I've added StringRef.h.

#include "llvm/ADT/StringRef.h"

namespace lld::macho {

class InputSection;

/// Run Balanced Partitioning to find the optimal function and data order to
/// improve startup time and compressed size.
///
/// It is important that .subsections_via_symbols is used to ensure functions
/// and data are in their own sections and thus can be reordered.
///
/// \param highestAvailablePriority priority counter owned by the caller and
///        passed by mutable reference. NOTE(review): presumably updated as
///        priorities are handed out to sections; confirm against the
///        implementation in BPSectionOrderer.cpp.
/// \param profilePath path of the IRPGO profile used for startup ordering.
///        May be empty when only compression ordering was requested.
/// \param forFunctionCompression order function sections for compressed size.
/// \param forDataCompression order data sections for compressed size.
/// \param verbose print information about the ordering that was performed.
/// \returns a map from each ordered input section to its priority.
llvm::DenseMap<const lld::macho::InputSection *, size_t>
runBalancedPartitioning(size_t &highestAvailablePriority,
llvm::StringRef profilePath,
bool forFunctionCompression, bool forDataCompression,
bool verbose);

} // namespace lld::macho

#endif
2 changes: 2 additions & 0 deletions lld/MachO/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ add_lld_library(lldMachO
OutputSection.cpp
OutputSegment.cpp
Relocations.cpp
BPSectionOrderer.cpp
SectionPriorities.cpp
SymbolTable.cpp
Symbols.cpp
Expand All @@ -47,6 +48,7 @@ add_lld_library(lldMachO
Object
Option
Passes
ProfileData
Support
TargetParser
TextAPI
Expand Down
5 changes: 5 additions & 0 deletions lld/MachO/Config.h
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,11 @@ struct Configuration {
bool callGraphProfileSort = false;
llvm::StringRef printSymbolOrder;

// Balanced-partitioning section ordering options.
// Value of --irpgo-profile-sort=; left empty when the option is not given.
llvm::StringRef irpgoProfileSortProfilePath;
// Set by --compression-sort=function and --compression-sort=both.
bool functionOrderForCompression = false;
// Set by --compression-sort=data and --compression-sort=both.
bool dataOrderForCompression = false;
// Set by --verbose-bp-section-orderer.
bool verboseBpSectionOrderer = false;

SectionRenameMap sectionRenameMap;
SegmentRenameMap segmentRenameMap;

Expand Down
28 changes: 28 additions & 0 deletions lld/MachO/Driver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1736,6 +1736,34 @@ bool link(ArrayRef<const char *> argsArr, llvm::raw_ostream &stdoutOS,
OPT_no_warn_thin_archive_missing_members, true);
config->generateUuid = !args.hasArg(OPT_no_uuid);

auto IncompatWithCGSort = [&](StringRef firstArgStr) {
// Throw an error only if --call-graph-profile-sort is explicitly specified
if (config->callGraphProfileSort)
if (const Arg *arg = args.getLastArgNoClaim(OPT_call_graph_profile_sort))
error(firstArgStr + " is incompatible with " + arg->getSpelling());
};
if (const Arg *arg = args.getLastArg(OPT_irpgo_profile_sort)) {
config->irpgoProfileSortProfilePath = arg->getValue();
IncompatWithCGSort(arg->getSpelling());
}
if (const Arg *arg = args.getLastArg(OPT_compression_sort)) {
StringRef compressionSortStr = arg->getValue();
if (compressionSortStr == "function") {
config->functionOrderForCompression = true;
} else if (compressionSortStr == "data") {
config->dataOrderForCompression = true;
} else if (compressionSortStr == "both") {
config->functionOrderForCompression = true;
config->dataOrderForCompression = true;
} else if (compressionSortStr != "none") {
error("unknown value `" + compressionSortStr + "` for " +
arg->getSpelling());
}
if (compressionSortStr != "none")
IncompatWithCGSort(arg->getSpelling());
}
config->verboseBpSectionOrderer = args.hasArg(OPT_verbose_bp_section_orderer);

for (const Arg *arg : args.filtered(OPT_alias)) {
config->aliasedSymbols.push_back(
std::make_pair(arg->getValue(0), arg->getValue(1)));
Expand Down
10 changes: 10 additions & 0 deletions lld/MachO/Options.td
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,16 @@ def no_call_graph_profile_sort : Flag<["--"], "no-call-graph-profile-sort">,
def print_symbol_order_eq: Joined<["--"], "print-symbol-order=">,
HelpText<"Print a symbol order specified by --call-graph-profile-sort into the specified file">,
Group<grp_lld>;
// Balanced-partitioning section ordering. The driver reports an error when
// --irpgo-profile-sort= is combined with an explicit
// --call-graph-profile-sort.
def irpgo_profile_sort: Joined<["--"], "irpgo-profile-sort=">,
MetaVarName<"<profile>">,
HelpText<"Read the IRPGO profile at <profile> to order sections to improve startup time">,
Group<grp_lld>;
// Accepted values are none, function, data and both; the driver rejects any
// other value. Every value except "none" is also reported as incompatible
// with an explicit --call-graph-profile-sort.
def compression_sort: Joined<["--"], "compression-sort=">,
MetaVarName<"[none,function,data,both]">,
HelpText<"Order sections to improve compressed size">, Group<grp_lld>;
// Diagnostic output switch for the balanced-partitioning orderer.
def verbose_bp_section_orderer: Flag<["--"], "verbose-bp-section-orderer">,
HelpText<"Print information on how many sections were ordered by balanced partitioning and a measure of the expected number of page faults">,
Group<grp_lld>;
def ignore_auto_link_option : Separate<["--"], "ignore-auto-link-option">,
Group<grp_lld>;
def ignore_auto_link_option_eq : Joined<["--"], "ignore-auto-link-option=">,
Expand Down
10 changes: 9 additions & 1 deletion lld/MachO/SectionPriorities.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//

#include "SectionPriorities.h"
#include "BPSectionOrderer.h"
#include "Config.h"
#include "InputFiles.h"
#include "Symbols.h"
Expand Down Expand Up @@ -352,7 +353,14 @@ void macho::PriorityBuilder::parseOrderFile(StringRef path) {
DenseMap<const InputSection *, size_t>
macho::PriorityBuilder::buildInputSectionPriorities() {
DenseMap<const InputSection *, size_t> sectionPriorities;
if (config->callGraphProfileSort) {
if (!config->irpgoProfileSortProfilePath.empty() ||
config->functionOrderForCompression || config->dataOrderForCompression) {
TimeTraceScope timeScope("Balanced Partitioning Section Orderer");
sectionPriorities = runBalancedPartitioning(
highestAvailablePriority, config->irpgoProfileSortProfilePath,
config->functionOrderForCompression, config->dataOrderForCompression,
config->verboseBpSectionOrderer);
} else if (config->callGraphProfileSort) {
// Sort sections by the profile data provided by __LLVM,__cg_profile
// sections.
//
Expand Down
8 changes: 8 additions & 0 deletions lld/test/MachO/bp-section-orderer-errs.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# RUN: not %lld -o /dev/null --irpgo-profile-sort=%s --call-graph-profile-sort 2>&1 | FileCheck %s --check-prefix=IRPGO-ERR
# IRPGO-ERR: --irpgo-profile-sort= is incompatible with --call-graph-profile-sort

# RUN: not %lld -o /dev/null --compression-sort=function --call-graph-profile-sort %s 2>&1 | FileCheck %s --check-prefix=COMPRESSION-ERR
# COMPRESSION-ERR: --compression-sort= is incompatible with --call-graph-profile-sort

# RUN: not %lld -o /dev/null --compression-sort=malformed 2>&1 | FileCheck %s --check-prefix=COMPRESSION-MALFORM
# COMPRESSION-MALFORM: unknown value `malformed` for --compression-sort=
105 changes: 105 additions & 0 deletions lld/test/MachO/bp-section-orderer-stress.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
# REQUIRES: aarch64

# Generate a large test case and check that the output is deterministic.

# RUN: %python %s %t.s %t.proftext

# RUN: llvm-mc -filetype=obj -triple=arm64-apple-darwin %t.s -o %t.o
# RUN: llvm-profdata merge %t.proftext -o %t.profdata

# RUN: %lld -arch arm64 -lSystem -e _main --icf=all -o - %t.o --irpgo-profile-sort=%t.profdata --compression-sort=both | llvm-nm --numeric-sort --format=just-symbols - > %t.order1.txt
# RUN: %lld -arch arm64 -lSystem -e _main --icf=all -o - %t.o --irpgo-profile-sort=%t.profdata --compression-sort=both | llvm-nm --numeric-sort --format=just-symbols - > %t.order2.txt
# RUN: diff %t.order1.txt %t.order2.txt

import random
import sys

# Output locations come from the command line: first the assembly file to
# generate, then the text-format profile to generate.
assembly_filepath = sys.argv[1]
proftext_filepath = sys.argv[2]

# A fixed seed keeps the generated inputs identical on every invocation.
random.seed(1234)
num_functions = 1000
num_data = 100
num_traces = 10

function_names = [f"f{n}" for n in range(num_functions)]
data_names = [f"d{n}" for n in range(num_data)]
# Only the first half of the functions get profile data.
profiled_functions = function_names[: num_functions // 2]

# One small function body per name; the immediates vary with the index.
function_contents = [
    f"""
{name}:
add w0, w0, #{i % 4096}
add w1, w1, #{i % 10}
add w2, w0, #{i % 20}
adrp x3, {name}@PAGE
ret
"""
    for i, name in enumerate(function_names)
]

# One data symbol per name: a short string followed by a pointer to itself.
data_contents = [
    f"""
{name}:
.ascii "s{i % 2}-{i % 3}-{i % 5}"
.xword {name}
"""
    for i, name in enumerate(data_names)
]

# Each trace has weight 1 and lists every profiled function in a shuffled
# order.
trace_contents = [
    f"""
# Weight
1
{", ".join(random.sample(profiled_functions, len(profiled_functions)))}
"""
    for _ in range(num_traces)
]

# One profile record per profiled function, with a single counter.
profile_contents = [
    f"""
{name}
# Func Hash:
{i}
# Num Counters:
1
# Counter Values:
1
"""
    for i, name in enumerate(profiled_functions)
]

with open(assembly_filepath, "w") as f:
    f.write(
        f"""
.text
.globl _main

_main:
ret

{"".join(function_contents)}

.data
{"".join(data_contents)}

.subsections_via_symbols
"""
    )

with open(proftext_filepath, "w") as f:
    f.write(
        f"""
:ir
:temporal_prof_traces

# Num Traces
{num_traces}
# Trace Stream Size:
{num_traces}

{"".join(trace_contents)}

{"".join(profile_contents)}
"""
    )
123 changes: 123 additions & 0 deletions lld/test/MachO/bp-section-orderer.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
# REQUIRES: aarch64

# RUN: rm -rf %t && split-file %s %t
# RUN: llvm-mc -filetype=obj -triple=arm64-apple-darwin %t/a.s -o %t/a.o
# RUN: llvm-profdata merge %t/a.proftext -o %t/a.profdata

# RUN: %lld -arch arm64 -lSystem -e _main -o %t/a.out %t/a.o --irpgo-profile-sort=%t/a.profdata --verbose-bp-section-orderer 2>&1 | FileCheck %s --check-prefix=STARTUP
# RUN: %lld -arch arm64 -lSystem -e _main -o %t/a.out %t/a.o --irpgo-profile-sort=%t/a.profdata --verbose-bp-section-orderer --icf=all --compression-sort=none 2>&1 | FileCheck %s --check-prefix=STARTUP

# STARTUP: Ordered 3 sections using balanced partitioning

# RUN: %lld -arch arm64 -lSystem -e _main -o - %t/a.o --irpgo-profile-sort=%t/a.profdata -order_file %t/a.orderfile | llvm-nm --numeric-sort --format=just-symbols - | FileCheck %s --check-prefix=ORDERFILE

# ORDERFILE: A
# ORDERFILE: F
# ORDERFILE: E
# ORDERFILE: D
# ORDERFILE-DAG: _main
# ORDERFILE-DAG: _B
# ORDERFILE-DAG: l_C
# ORDERFILE-DAG: s1
# ORDERFILE-DAG: s2
# ORDERFILE-DAG: r1
# ORDERFILE-DAG: r2

# RUN: %lld -arch arm64 -lSystem -e _main -o %t/a.out %t/a.o --verbose-bp-section-orderer --compression-sort=function 2>&1 | FileCheck %s --check-prefix=COMPRESSION-FUNC
# RUN: %lld -arch arm64 -lSystem -e _main -o %t/a.out %t/a.o --verbose-bp-section-orderer --compression-sort=data 2>&1 | FileCheck %s --check-prefix=COMPRESSION-DATA
# RUN: %lld -arch arm64 -lSystem -e _main -o %t/a.out %t/a.o --verbose-bp-section-orderer --compression-sort=both 2>&1 | FileCheck %s --check-prefix=COMPRESSION-BOTH
# RUN: %lld -arch arm64 -lSystem -e _main -o %t/a.out %t/a.o --verbose-bp-section-orderer --compression-sort=both --irpgo-profile-sort=%t/a.profdata 2>&1 | FileCheck %s --check-prefix=COMPRESSION-BOTH

# COMPRESSION-FUNC: Ordered 7 sections using balanced partitioning
# COMPRESSION-DATA: Ordered 4 sections using balanced partitioning
# COMPRESSION-BOTH: Ordered 11 sections using balanced partitioning

#--- a.s
.text
.globl _main, A, _B, l_C.__uniq.111111111111111111111111111111111111111.llvm.2222222222222222222

_main:
ret
A:
ret
_B:
add w0, w0, #1
bl A
ret
l_C.__uniq.111111111111111111111111111111111111111.llvm.2222222222222222222:
add w0, w0, #2
bl A
ret
D:
add w0, w0, #2
bl _B
ret
E:
add w0, w0, #2
bl l_C.__uniq.111111111111111111111111111111111111111.llvm.2222222222222222222
ret
F:
add w0, w0, #3
bl l_C.__uniq.111111111111111111111111111111111111111.llvm.2222222222222222222
ret

.data
s1:
.ascii "hello world"
s2:
.ascii "i am a string"
r1:
.quad s1
r2:
.quad r1

.subsections_via_symbols

#--- a.proftext
:ir
:temporal_prof_traces
# Num Traces
1
# Trace Stream Size:
1
# Weight
1
A, B, C.__uniq.555555555555555555555555555555555555555.llvm.6666666666666666666

A
# Func Hash:
1111
# Num Counters:
1
# Counter Values:
1

B
# Func Hash:
2222
# Num Counters:
1
# Counter Values:
1

C.__uniq.555555555555555555555555555555555555555.llvm.6666666666666666666
# Func Hash:
3333
# Num Counters:
1
# Counter Values:
1

D
# Func Hash:
4444
# Num Counters:
1
# Counter Values:
1

#--- a.orderfile
A
F
E
D
Loading