// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

// This file implements most of the interpreter's automatic PGO, along with
// helpers for timing code generation.
// Loading/saving the actual table is the embedder's responsibility, via mono_interp_pgo_(load|save)_table.

#ifndef __USE_ISOC99
#define __USE_ISOC99
#endif
#include "config.h"

// We start with a fixed-size table and then grow it by a given ratio when we run out of space.
// Generally speaking, size doubling is suboptimal, so we use a 1.5x growth ratio.
#define TABLE_MINIMUM_SIZE 4096
#define TABLE_GROWTH_FACTOR 150
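// When interp_codegen_timing is enabled, cumulative code generation time is reported roughly
// once per this many milliseconds of accumulated time.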
#define INTERP_PGO_LOG_INTERVAL_MS 10

#include <mono/metadata/mono-config.h>
#include <mono/utils/mono-threads.h>
#include <mono/utils/mono-time.h>
#include <mono/utils/bsearch.h>

#include "interp.h"
#include "interp-internals.h"
#include "transform.h"
#include "interp-intrins.h"
#include "tiering.h"

#include "interp-pgo.h"

#include <string.h>
#include <stdlib.h>
#include <math.h>

#include <mono/utils/options.h>
#include <mono/utils/atomic.h>


// MurmurHash3 was written by Austin Appleby, and is placed in the public
// domain. The author hereby disclaims copyright to this source code.
//
// Implementation was copied from https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp
// with changes around strict-aliasing/unaligned reads.

#define MM3_HASH_BYTE_SIZE 16 // MurmurHash3 is 128-bit, so we need 16 bytes to store it
#define MM3_HASH_BUFFER_SIZE 33 // MurmurHash3 is 128-bit, so we need 32 hex chars plus 1 for the null terminator

inline static uint64_t ROTL64(uint64_t x, int8_t r)
{
	return (x << r) | (x >> (64 - r));
}

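// Read and write 64-bit blocks via memcpy so unaligned input and strict-aliasing rules are
// handled portably; compilers lower these to plain loads/stores where possible.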
inline static uint64_t getblock64(const uint8_t* ptr)
{
	uint64_t val = 0;
	memcpy(&val, ptr, sizeof(uint64_t));
	return val;
}

inline static void setblock64(uint8_t* ptr, uint64_t val)
{
	memcpy(ptr, &val, sizeof(uint64_t));
}

// Finalization mix - force all bits of a hash block to avalanche
inline static uint64_t fmix64(uint64_t k)
{
	k ^= k >> 33;
	k *= 0xff51afd7ed558ccdLLU;
	k ^= k >> 33;
	k *= 0xc4ceb9fe1a85ec53LLU;
	k ^= k >> 33;
	return k;
}

static void MurmurHash3_128(const void* key, const size_t len, const uint32_t seed, uint8_t out[MM3_HASH_BYTE_SIZE])
{
	const uint8_t* data = (const uint8_t*)key;
	const size_t nblocks = len / MM3_HASH_BYTE_SIZE;
	uint64_t h1 = seed;
	uint64_t h2 = seed;
	const uint64_t c1 = 0x87c37b91114253d5LLU;
	const uint64_t c2 = 0x4cf5ad432745937fLLU;

	// body
	for (size_t i = 0; i < nblocks; i++)
	{
		uint64_t k1 = getblock64(data + (i * 2 + 0) * sizeof(uint64_t));
		uint64_t k2 = getblock64(data + (i * 2 + 1) * sizeof(uint64_t));

		k1 *= c1; k1 = ROTL64(k1, 31); k1 *= c2; h1 ^= k1;
		h1 = ROTL64(h1, 27); h1 += h2; h1 = h1 * 5 + 0x52dce729;
		k2 *= c2; k2 = ROTL64(k2, 33); k2 *= c1; h2 ^= k2;
		h2 = ROTL64(h2, 31); h2 += h1; h2 = h2 * 5 + 0x38495ab5;
	}

	// tail
	const uint8_t* tail = data + nblocks * MM3_HASH_BYTE_SIZE;
	uint64_t k1 = 0;
	uint64_t k2 = 0;
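	// The case labels below intentionally fall through, consuming the remaining 1-15 tail bytes.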
	switch (len & 15)
	{
	case 15: k2 ^= (uint64_t)(tail[14]) << 48;
	case 14: k2 ^= (uint64_t)(tail[13]) << 40;
	case 13: k2 ^= (uint64_t)(tail[12]) << 32;
	case 12: k2 ^= (uint64_t)(tail[11]) << 24;
	case 11: k2 ^= (uint64_t)(tail[10]) << 16;
	case 10: k2 ^= (uint64_t)(tail[9]) << 8;
	case 9: k2 ^= (uint64_t)(tail[8]) << 0;
		k2 *= c2; k2 = ROTL64(k2, 33); k2 *= c1; h2 ^= k2;

	case 8: k1 ^= (uint64_t)(tail[7]) << 56;
	case 7: k1 ^= (uint64_t)(tail[6]) << 48;
	case 6: k1 ^= (uint64_t)(tail[5]) << 40;
	case 5: k1 ^= (uint64_t)(tail[4]) << 32;
	case 4: k1 ^= (uint64_t)(tail[3]) << 24;
	case 3: k1 ^= (uint64_t)(tail[2]) << 16;
	case 2: k1 ^= (uint64_t)(tail[1]) << 8;
	case 1: k1 ^= (uint64_t)(tail[0]) << 0;
		k1 *= c1; k1 = ROTL64(k1, 31); k1 *= c2; h1 ^= k1;
		break;
	}

	// finalization
	h1 ^= len;
	h2 ^= len;
	h1 += h2;
	h2 += h1;
	h1 = fmix64(h1);
	h2 = fmix64(h2);
	h1 += h2;
	h2 += h1;

	setblock64((uint8_t*)(out), h1);
	setblock64((uint8_t*)(out) + sizeof(uint64_t), h2);
}

// end of murmurhash


static gint64 generate_started, generate_total_time;
static gint32 generate_depth;

// mono_100ns_ticks () reports time in 100ns units, so there are 10,000 ticks per millisecond.
static gint32
ms_from_100ns_ticks (gint64 ticks) {
	return (gint32)((ticks + 5000) / 10000);
}

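// Code generation timing: generate_depth counts nested start/end pairs so that only the
// outermost pair is timed, and the accumulated total is logged as it crosses each interval.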
void
mono_interp_pgo_generate_start (void) {
	if (!mono_opt_interp_codegen_timing)
		return;

	if (mono_atomic_inc_i32 (&generate_depth) == 1)
		generate_started = mono_100ns_ticks ();
}

void
mono_interp_pgo_generate_end (void) {
	if (!mono_opt_interp_codegen_timing)
		return;
	if (mono_atomic_dec_i32 (&generate_depth) != 0)
		return;

	gint64 elapsed = mono_100ns_ticks () - generate_started,
		new_total = mono_atomic_add_i64 (&generate_total_time, elapsed);
	gint32 total_ms = ms_from_100ns_ticks (new_total),
		prior_total_ms = ms_from_100ns_ticks (new_total - elapsed);

	if ((total_ms / INTERP_PGO_LOG_INTERVAL_MS) != (prior_total_ms / INTERP_PGO_LOG_INTERVAL_MS))
		g_printf ("generate_code elapsed time: %dms\n", total_ms);
}


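// A table is a flat, packed array of 16-byte method hashes; size and capacity are byte counts.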
typedef struct {
	uint8_t *data;
	uint32_t size, capacity;
} interp_pgo_table;

// loaded_table is the table we loaded from persistent storage at startup (if any),
// while building_table is the table we build during the current run. We store these
// separately so that we don't have to maintain a sorted table (for bsearch) on an
// ongoing basis.
static interp_pgo_table *loaded_table, *building_table;
// loaded_table is immutable once loaded, so it has no mutex. Any access to building_table
// must be performed while holding this mutex.
static mono_mutex_t building_table_lock;

static int
hash_comparer (const void *needle, const void *haystack)
{
	return memcmp (needle, haystack, MM3_HASH_BYTE_SIZE);
}

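// Binary search requires the entries to be sorted; tables are sorted by
// mono_interp_pgo_save_table before being persisted, so a loaded table is always sorted.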
static gboolean
table_lookup (interp_pgo_table *table, uint8_t hash[MM3_HASH_BYTE_SIZE]) {
	// Early out if no table is loaded or the table is empty.
	if (!table || !table->size)
		return FALSE;

	g_assert (table->size <= table->capacity);

	void * result = mono_binary_search (hash, table->data, table->size / MM3_HASH_BYTE_SIZE, MM3_HASH_BYTE_SIZE, hash_comparer);
	return (result != NULL);
}

static void
table_add_locked (interp_pgo_table **table_variable, uint8_t hash[MM3_HASH_BYTE_SIZE]) {
	interp_pgo_table *table = *table_variable;
	// If we don't have a table yet, allocate one
	if (!table)
		*table_variable = table = g_malloc0 (sizeof (interp_pgo_table));

	const uint32_t required_size = table->size + MM3_HASH_BYTE_SIZE,
		required_capacity = MAX (required_size, TABLE_MINIMUM_SIZE);

	// If we're out of space or haven't yet allocated a buffer for this table, calculate
	// an appropriate larger size and grow/allocate the buffer. We start at a fixed size,
	// then after that grow the current size by a set ratio per step.
	while (required_capacity >= table->capacity) {
		uint32_t new_capacity = MAX (required_capacity, (table->capacity * TABLE_GROWTH_FACTOR / 100));
		if (table->data)
			table->data = g_realloc (table->data, new_capacity);
		else
			table->data = g_malloc0 (new_capacity);
		table->capacity = new_capacity;
	}

	// Copy the whole hash into the table at the end and update the size of the data
	memcpy (table->data + table->size, hash, MM3_HASH_BYTE_SIZE);
	table->size = required_size;
}

static void
table_sort_locked (interp_pgo_table *table) {
	mono_qsort (table->data, table->size / MM3_HASH_BYTE_SIZE, MM3_HASH_BYTE_SIZE, hash_comparer);
}


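// A method's identity for PGO purposes is the 128-bit MurmurHash3 of its metadata token
// plus the containing image's GUID, which stays stable across runs of the same assembly build.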
static void
compute_method_hash (MonoMethod *method, uint8_t outbuf[MM3_HASH_BYTE_SIZE]) {
	// method token + image guid
	size_t size = sizeof(uint32_t) + 16;
	uint32_t *inbuf = alloca (size);
	// method tokens are unique within a given assembly
	inbuf[0] = mono_method_get_token (method);
	// use the assembly guid as a unique id for the assembly
	MonoImage *image = m_class_get_image (mono_method_get_class (method));
	memcpy (inbuf + 1, mono_image_get_guid (image), 16);

	MurmurHash3_128 (inbuf, size, 0x43219876, (uint8_t *)outbuf);
}

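// Returns TRUE when the table loaded from a previous run records this method as tiered,
// so the caller can tier it eagerly.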
gboolean
mono_interp_pgo_should_tier_method (MonoMethod *method) {
	// If we didn't load a table, don't bother hashing the method.
	if (!loaded_table)
		return FALSE;

	uint8_t hash[MM3_HASH_BYTE_SIZE];
	compute_method_hash (method, hash);

	if (table_lookup (loaded_table, hash)) {
		if (mono_opt_interp_pgo_logging) {
			char * name = mono_method_full_name (method, TRUE);
			g_print ("Tiering %s because it was in the interp_pgo table\n", name);
			g_free (name);
		}

		return TRUE;
	}

	return FALSE;
}

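// Records that a method was tiered during this run, so that a future run which loads the
// saved table can tier it immediately (see mono_interp_pgo_should_tier_method).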
void
mono_interp_pgo_method_was_tiered (MonoMethod *method) {
	if (!mono_opt_interp_pgo_recording)
		return;

	// Wrappers are already tiered automatically, so we don't put them in the table
	if (method->wrapper_type != MONO_WRAPPER_NONE)
		return;

	uint8_t hash[MM3_HASH_BYTE_SIZE] = {0};
	compute_method_hash (method, hash);

	mono_os_mutex_lock (&building_table_lock);
	table_add_locked (&building_table, hash);
	mono_os_mutex_unlock (&building_table_lock);

	if (mono_opt_interp_pgo_logging) {
		char * name = mono_method_full_name (method, TRUE);
		g_print ("added %s to table\n", name);
		g_free (name);
	}
}

#if HOST_BROWSER

#include <emscripten.h>

// We disable this diagnostic because EMSCRIPTEN_KEEPALIVE makes it a false alarm: the keepalive
// functions are consumed externally, not from other C code, so declaring prototypes for them
// here would be pointless.
#pragma clang diagnostic ignored "-Wmissing-prototypes"

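// Loads a previously saved table (storage format: [uint32 size] [hash data...]).
// Returns 0 on success, 1 if a table is already loaded, 2 if another thread won the race
// to install a table first, and 3 if the buffer is too small to contain the size header.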
EMSCRIPTEN_KEEPALIVE int
mono_interp_pgo_load_table (uint8_t * data, int data_size) {
	// Early-out if a table is already loaded.
	if (loaded_table)
		return 1;
	// If the data we were passed is too small then early out
	if (data_size < (int)sizeof (uint32_t))
		return 3;

	interp_pgo_table *result = g_malloc0 (sizeof (interp_pgo_table));
	// The table storage format is [uint32 size] [data...]
	uint32_t size = *(uint32_t *)data;

	if (mono_opt_interp_pgo_logging)
		g_print ("Loading %d bytes of interp_pgo data (table size == %u)\n", data_size, size);

	result->data = g_malloc0 (data_size);
	// The stored table data must fit inside the buffer we were passed.
	g_assert ((int64_t)size + (int64_t)sizeof (uint32_t) <= (int64_t)data_size);
	result->size = size;
	result->capacity = data_size;
	memcpy (result->data, data + sizeof (uint32_t), result->size);

	// Atomically swap the new table in
	interp_pgo_table *old_table = mono_atomic_cas_ptr ((volatile gpointer*)&loaded_table, result, NULL);

	if (old_table) {
		// We lost a race with another thread that also loaded a table, so destroy ours and leave
		// theirs in place.
		if (result->data)
			g_free (result->data);
		g_free (result);

		return 2;
	}

	return 0;
}

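// Serializes the building table into the caller-provided buffer, using the same
// [uint32 size] [hash data...] format that mono_interp_pgo_load_table expects.
// If data_size does not match the required size, the required size is returned so the
// caller can allocate an appropriately sized buffer and call again; returns 0 once the
// data has been written (or when there is no table to save).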
EMSCRIPTEN_KEEPALIVE int
mono_interp_pgo_save_table (uint8_t * data, int data_size) {
	if (!building_table)
		return 0;

	mono_os_mutex_lock (&building_table_lock);
	interp_pgo_table *table = building_table;
	int expected_size = table->size + sizeof (uint32_t);
	if (data_size != expected_size) {
		mono_os_mutex_unlock (&building_table_lock);
		return expected_size;
	}
	table_sort_locked (table);
	// The table storage format is [uint32 size] [data...]
	memcpy (data, &table->size, sizeof (uint32_t));
	memcpy (data + sizeof (uint32_t), table->data, table->size);
	mono_os_mutex_unlock (&building_table_lock);
	return 0;
}

#endif // HOST_BROWSER