Skip to content

Commit 8bce5a8

Browse files
authored
[wasm] Interpreter automatic PGO (#92981)
Add infrastructure for interpreter PGO, which immediately tiers methods on first compilation if they are in a table from previous runs. Add automatic interpreter PGO support for wasm which can be enabled via builder configuration. Add runtime option that enables basic measurement and logging of time spent generating code in the interp. Add a WBT test for interpreter automatic PGO.
1 parent c5cb147 commit 8bce5a8

File tree

21 files changed

+831
-69
lines changed

21 files changed

+831
-69
lines changed

eng/testing/scenarios/BuildWasmAppsJobsList.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ Wasm.Build.NativeRebuild.Tests.NoopNativeRebuildTest
33
Wasm.Build.NativeRebuild.Tests.OptimizationFlagChangeTests
44
Wasm.Build.NativeRebuild.Tests.ReferenceNewAssemblyRebuildTest
55
Wasm.Build.NativeRebuild.Tests.SimpleSourceChangeRebuildTest
6+
Wasm.Build.Templates.Tests.InterpPgoTests
67
Wasm.Build.Templates.Tests.NativeBuildTests
78
Wasm.Build.Tests.Blazor.AppsettingsTests
89
Wasm.Build.Tests.Blazor.BuildPublishTests

src/mono/mono/mini/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -300,7 +300,8 @@ set(interp_sources
300300
interp/transform.c
301301
interp/tiering.h
302302
interp/tiering.c
303-
interp/jiterpreter.c)
303+
interp/jiterpreter.c
304+
interp/interp-pgo.c)
304305
set(interp_simd_sources
305306
interp/interp-simd.c)
306307
set(interp_stub_sources
Lines changed: 368 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,368 @@
1+
// Licensed to the .NET Foundation under one or more agreements.
2+
// The .NET Foundation licenses this file to you under the MIT license.
3+
// This file contains icalls used in jitted interpreter traces and wrappers,
4+
// along with infrastructure to support code generation
5+
6+
// This file implements most of interpreter automatic PGO.
7+
// Loading/saving the actual table is your responsibility via mono_interp_pgo_(load|save)_table
8+
9+
#ifndef __USE_ISOC99
10+
#define __USE_ISOC99
11+
#endif
12+
#include "config.h"
13+
14+
// We start with a fixed-size table and then grow it by a given ratio when we run out of space
15+
// Generally speaking size doubling is suboptimal so we use a 1.5x ratio
16+
#define TABLE_MINIMUM_SIZE 4096
17+
#define TABLE_GROWTH_FACTOR 150
18+
#define INTERP_PGO_LOG_INTERVAL_MS 10
19+
20+
#include <mono/metadata/mono-config.h>
21+
#include <mono/utils/mono-threads.h>
22+
#include <mono/utils/mono-time.h>
23+
#include <mono/utils/bsearch.h>
24+
25+
#include "interp.h"
26+
#include "interp-internals.h"
27+
#include "transform.h"
28+
#include "interp-intrins.h"
29+
#include "tiering.h"
30+
31+
#include "interp-pgo.h"
32+
33+
#include <string.h>
34+
#include <stdlib.h>
35+
#include <math.h>
36+
37+
#include <mono/utils/options.h>
38+
#include <mono/utils/atomic.h>
39+
40+
41+
// MurmurHash3 was written by Austin Appleby, and is placed in the public
42+
// domain. The author hereby disclaims copyright to this source code.
43+
//
44+
// Implementation was copied from https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp
45+
// with changes around strict-aliasing/unaligned reads
46+
47+
#define MM3_HASH_BYTE_SIZE 16 // MurMurHash3 is 128-bit, so we need 16 bytes to store it
48+
#define MM3_HASH_BUFFER_SIZE 33 // MurMurHash3 is 128-bit, so we need 32 chars + 1 char to store null-terminator
49+
50+
// Rotates the 64-bit value x left by r bits.
// The count is reduced modulo 64 before shifting so that r == 0 (or any count outside
// 1..63) never turns into a shift by 64 bits or by a negative amount, both of which are
// undefined behavior in C. For 0 < r < 64 the result is exactly
// (x << r) | (x >> (64 - r)).
inline static uint64_t ROTL64(uint64_t x, int8_t r)
{
    const unsigned int left = (unsigned int)r & 63u;
    // When left is 0 the complementary shift must also be 0, not 64.
    const unsigned int right = (64u - left) & 63u;
    return (x << left) | (x >> right);
}
54+
55+
// Loads eight bytes starting at ptr into a uint64_t in host byte order.
// The bytes are copied with memcpy instead of dereferencing a casted pointer, so ptr
// may be unaligned and no strict-aliasing rule is violated.
inline static uint64_t getblock64(const uint8_t* ptr)
{
    uint64_t block = 0;
    memcpy(&block, ptr, sizeof block);
    return block;
}
61+
62+
// Stores val into the eight bytes starting at ptr in host byte order.
// Uses memcpy so the destination does not have to be 8-byte aligned.
inline static void setblock64(uint8_t* ptr, uint64_t val)
{
    const uint64_t block = val;
    memcpy(ptr, &block, sizeof block);
}
66+
67+
// Finalization mix - force all bits of a hash block to avalanche.
// Two rounds of "xor with the value shifted right by 33, then multiply by a constant",
// followed by one last xor-shift.
inline static uint64_t fmix64(uint64_t k)
{
    static const uint64_t multipliers[2] = {
        0xff51afd7ed558ccdULL,
        0xc4ceb9fe1a85ec53ULL,
    };

    for (int round = 0; round < 2; round++)
    {
        k ^= k >> 33;
        k *= multipliers[round];
    }

    return k ^ (k >> 33);
}
77+
78+
static void MurmurHash3_128(const void* key, const size_t len, const uint32_t seed, uint8_t out[MM3_HASH_BYTE_SIZE])
79+
{
80+
const uint8_t* data = (const uint8_t*)key;
81+
const size_t nblocks = len / MM3_HASH_BYTE_SIZE;
82+
uint64_t h1 = seed;
83+
uint64_t h2 = seed;
84+
const uint64_t c1 = 0x87c37b91114253d5LLU;
85+
const uint64_t c2 = 0x4cf5ad432745937fLLU;
86+
87+
// body
88+
for (size_t i = 0; i < nblocks; i++)
89+
{
90+
uint64_t k1 = getblock64(data + (i * 2 + 0) * sizeof(uint64_t));
91+
uint64_t k2 = getblock64(data + (i * 2 + 1) * sizeof(uint64_t));
92+
93+
k1 *= c1; k1 = ROTL64(k1, 31); k1 *= c2; h1 ^= k1;
94+
h1 = ROTL64(h1, 27); h1 += h2; h1 = h1 * 5 + 0x52dce729;
95+
k2 *= c2; k2 = ROTL64(k2, 33); k2 *= c1; h2 ^= k2;
96+
h2 = ROTL64(h2, 31); h2 += h1; h2 = h2 * 5 + 0x38495ab5;
97+
}
98+
99+
// tail
100+
const uint8_t* tail = data + nblocks * MM3_HASH_BYTE_SIZE;
101+
uint64_t k1 = 0;
102+
uint64_t k2 = 0;
103+
104+
switch (len & 15)
105+
{
106+
case 15: k2 ^= (uint64_t)(tail[14]) << 48;
107+
case 14: k2 ^= (uint64_t)(tail[13]) << 40;
108+
case 13: k2 ^= (uint64_t)(tail[12]) << 32;
109+
case 12: k2 ^= (uint64_t)(tail[11]) << 24;
110+
case 11: k2 ^= (uint64_t)(tail[10]) << 16;
111+
case 10: k2 ^= (uint64_t)(tail[9]) << 8;
112+
case 9: k2 ^= (uint64_t)(tail[8]) << 0;
113+
k2 *= c2; k2 = ROTL64(k2, 33); k2 *= c1; h2 ^= k2;
114+
115+
case 8: k1 ^= (uint64_t)(tail[7]) << 56;
116+
case 7: k1 ^= (uint64_t)(tail[6]) << 48;
117+
case 6: k1 ^= (uint64_t)(tail[5]) << 40;
118+
case 5: k1 ^= (uint64_t)(tail[4]) << 32;
119+
case 4: k1 ^= (uint64_t)(tail[3]) << 24;
120+
case 3: k1 ^= (uint64_t)(tail[2]) << 16;
121+
case 2: k1 ^= (uint64_t)(tail[1]) << 8;
122+
case 1: k1 ^= (uint64_t)(tail[0]) << 0;
123+
k1 *= c1; k1 = ROTL64(k1, 31); k1 *= c2; h1 ^= k1;
124+
break;
125+
}
126+
127+
// finalization
128+
h1 ^= len;
129+
h2 ^= len;
130+
h1 += h2;
131+
h2 += h1;
132+
h1 = fmix64(h1);
133+
h2 = fmix64(h2);
134+
h1 += h2;
135+
h2 += h1;
136+
137+
setblock64((uint8_t*)(out), h1);
138+
setblock64((uint8_t*)(out) + sizeof(uint64_t), h2);
139+
}
140+
141+
// end of murmurhash
142+
143+
144+
static gint64 generate_started, generate_total_time;
145+
static gint32 generate_depth;
146+
147+
static gint32
148+
ms_from_100ns_ticks (gint64 ticks) {
149+
return (int)((ticks + 500) / 1000);
150+
}
151+
152+
void
153+
mono_interp_pgo_generate_start (void) {
154+
if (!mono_opt_interp_codegen_timing)
155+
return;
156+
157+
if (mono_atomic_inc_i32 (&generate_depth) == 1)
158+
generate_started = mono_100ns_ticks ();
159+
}
160+
161+
void
162+
mono_interp_pgo_generate_end (void) {
163+
if (!mono_opt_interp_codegen_timing)
164+
return;
165+
if (mono_atomic_dec_i32 (&generate_depth) != 0)
166+
return;
167+
168+
gint64 elapsed = mono_100ns_ticks () - generate_started,
169+
new_total = mono_atomic_add_i64 (&generate_total_time, elapsed);
170+
gint32 total_ms = ms_from_100ns_ticks (new_total),
171+
prior_total_ms = ms_from_100ns_ticks (new_total - elapsed);
172+
173+
if ((total_ms / INTERP_PGO_LOG_INTERVAL_MS) != (prior_total_ms / INTERP_PGO_LOG_INTERVAL_MS))
174+
g_printf ("generate_code elapsed time: %dms\n", total_ms);
175+
}
176+
177+
178+
// A flat, growable byte buffer of MM3_HASH_BYTE_SIZE-byte method hashes stored back to back.
typedef struct {
	// Heap buffer holding the hashes
	uint8_t *data;
	// Number of bytes of data currently occupied by hashes
	uint32_t size;
	// Number of bytes allocated for data
	uint32_t capacity;
} interp_pgo_table;
182+
183+
// loaded_table is the table we loaded from persistent storage at startup (if any),
184+
// while building_table is the table we built during the current run. we store these
185+
// separately so that we don't have to maintain a sorted table (for bsearch) on an
186+
// ongoing basis.
187+
static interp_pgo_table *loaded_table, *building_table;
188+
// Loaded_table is immutable once loaded, so it has no mutex. Any access to building_table
189+
// needs to be performed while holding this mutex.
190+
static mono_mutex_t building_table_lock;
191+
192+
static int
193+
hash_comparer (const void *needle, const void *haystack)
194+
{
195+
return memcmp (needle, haystack, MM3_HASH_BYTE_SIZE);
196+
}
197+
198+
static gboolean
199+
table_lookup (interp_pgo_table *table, uint8_t hash[MM3_HASH_BYTE_SIZE]) {
200+
// Early out if no table is loaded or the table is empty.
201+
if (!table || !table->size)
202+
return FALSE;
203+
204+
g_assert (table->size <= table->capacity);
205+
206+
void * result = mono_binary_search (hash, table->data, table->size / MM3_HASH_BYTE_SIZE, MM3_HASH_BYTE_SIZE, hash_comparer);
207+
return (result != NULL);
208+
}
209+
210+
static void
211+
table_add_locked (interp_pgo_table **table_variable, uint8_t hash[MM3_HASH_BYTE_SIZE]) {
212+
interp_pgo_table *table = *table_variable;
213+
// If we don't have a table yet, allocate one
214+
if (!table)
215+
*table_variable = table = g_malloc0 (sizeof (interp_pgo_table));
216+
217+
const uint32_t required_size = table->size + MM3_HASH_BYTE_SIZE,
218+
required_capacity = MAX (required_size, TABLE_MINIMUM_SIZE);
219+
220+
// If we're out of space or haven't yet allocated a buffer for this table, calculate
221+
// an appropriate larger size and grow/allocate the buffer. We start at a fixed size,
222+
// then after that grow the current size by a set ratio per step.
223+
while (required_capacity >= table->capacity) {
224+
uint32_t new_capacity = MAX (required_capacity, (table->capacity * TABLE_GROWTH_FACTOR / 100));
225+
if (table->data)
226+
table->data = g_realloc (table->data, new_capacity);
227+
else
228+
table->data = g_malloc0 (new_capacity);
229+
table->capacity = new_capacity;
230+
}
231+
232+
// Copy the whole hash into the table at the end and update the size of the data
233+
memcpy (table->data + table->size, hash, MM3_HASH_BYTE_SIZE);
234+
table->size = required_size;
235+
}
236+
237+
static void
238+
table_sort_locked (interp_pgo_table *table) {
239+
mono_qsort (table->data, table->size / MM3_HASH_BYTE_SIZE, MM3_HASH_BYTE_SIZE, hash_comparer);
240+
}
241+
242+
static void
243+
compute_method_hash (MonoMethod *method, uint8_t outbuf[MM3_HASH_BYTE_SIZE]) {
244+
// method token + image guid
245+
size_t size = sizeof(uint32_t) + 16;
246+
uint32_t *inbuf = alloca (size);
247+
// method tokens are globally unique within a given assembly
248+
inbuf[0] = mono_method_get_token (method);
249+
// use the assembly guid as a unique id for the assembly
250+
MonoImage *image = m_class_get_image (mono_method_get_class (method));
251+
memcpy (inbuf + 1, mono_image_get_guid (image), 16);
252+
253+
MurmurHash3_128 (inbuf, size, 0x43219876, (uint8_t *)outbuf);
254+
}
255+
256+
gboolean
257+
mono_interp_pgo_should_tier_method (MonoMethod *method) {
258+
// If we didn't load a table, don't bother hashing the method.
259+
if (!loaded_table)
260+
return FALSE;
261+
262+
uint8_t hash[MM3_HASH_BYTE_SIZE];
263+
compute_method_hash (method, hash);
264+
265+
if (table_lookup (loaded_table, hash)) {
266+
if (mono_opt_interp_pgo_logging) {
267+
char * name = mono_method_full_name (method, TRUE);
268+
g_print ("Tiering %s because it was in the interp_pgo table\n", name);
269+
g_free (name);
270+
}
271+
272+
return TRUE;
273+
}
274+
275+
return FALSE;
276+
}
277+
278+
void
279+
mono_interp_pgo_method_was_tiered (MonoMethod *method) {
280+
if (!mono_opt_interp_pgo_recording)
281+
return;
282+
283+
// Wrappers are already tiered automatically, so we don't put them in the table
284+
if (method->wrapper_type != MONO_WRAPPER_NONE)
285+
return;
286+
287+
uint8_t hash[MM3_HASH_BYTE_SIZE] = {0};
288+
compute_method_hash (method, hash);
289+
290+
mono_os_mutex_lock (&building_table_lock);
291+
table_add_locked (&building_table, hash);
292+
mono_os_mutex_unlock (&building_table_lock);
293+
294+
if (mono_opt_interp_pgo_logging) {
295+
char * name = mono_method_full_name (method, TRUE);
296+
g_print ("added %s to table\n", name);
297+
g_free (name);
298+
}
299+
}
300+
301+
#if HOST_BROWSER
302+
303+
#include <emscripten.h>
304+
305+
// We disable this diagnostic because EMSCRIPTEN_KEEPALIVE makes it a false alarm, the keepalive
306+
// functions are being used externally. Having a bunch of prototypes is pointless since these
307+
// functions are not consumed by C anywhere else
308+
#pragma clang diagnostic ignored "-Wmissing-prototypes"
309+
310+
EMSCRIPTEN_KEEPALIVE int
311+
mono_interp_pgo_load_table (uint8_t * data, int data_size) {
312+
// Early-out if a table is already loaded.
313+
if (loaded_table)
314+
return 1;
315+
// If the data we were passed is too small then early out
316+
if (data_size < sizeof(uint32_t))
317+
return 3;
318+
319+
interp_pgo_table *result = g_malloc0 (sizeof (interp_pgo_table));
320+
// The table storage format is [uint32 size] [data...]
321+
uint32_t size = *(uint32_t *)data;
322+
323+
if (mono_opt_interp_pgo_logging)
324+
g_print ("Loading %d bytes of interp_pgo data (table size == %zu)\n", data_size, size);
325+
326+
result->data = g_malloc0 (data_size);
327+
g_assert ((int64_t)size < (int64_t)data_size);
328+
result->size = size;
329+
result->capacity = data_size;
330+
memcpy (result->data, data + sizeof (uint32_t), result->size);
331+
332+
// Atomically swap the new table in
333+
interp_pgo_table *old_table = mono_atomic_cas_ptr ((volatile gpointer*)&loaded_table, result, NULL);
334+
335+
if (old_table) {
336+
// We lost a race with another thread that also loaded a table, so destroy ours and leave
337+
// theirs in place.
338+
if (result->data)
339+
g_free (result->data);
340+
g_free (result);
341+
342+
return 2;
343+
}
344+
345+
return 0;
346+
}
347+
348+
EMSCRIPTEN_KEEPALIVE int
349+
mono_interp_pgo_save_table (uint8_t * data, int data_size) {
350+
if (!building_table)
351+
return 0;
352+
353+
mono_os_mutex_lock (&building_table_lock);
354+
interp_pgo_table *table = building_table;
355+
int expected_size = table->size + sizeof (uint32_t);
356+
if (data_size != expected_size) {
357+
mono_os_mutex_unlock (&building_table_lock);
358+
return expected_size;
359+
}
360+
table_sort_locked (table);
361+
// The table storage format is [uint32 size] [data...]
362+
memcpy (data, &table->size, sizeof (uint32_t));
363+
memcpy (data + sizeof (uint32_t), table->data, table->size);
364+
mono_os_mutex_unlock (&building_table_lock);
365+
return 0;
366+
}
367+
368+
#endif // HOST_BROWSER

0 commit comments

Comments
 (0)