|
| 1 | +# This file is a part of Julia. License is MIT: https://julialang.org/license |
| 2 | + |
| 3 | +module HeapSnapshot |
| 4 | + |
"""
    assemble_snapshot(in_prefix, out_file::AbstractString = in_prefix)
    assemble_snapshot(in_prefix, io::IO)

Assemble a .heapsnapshot file from the `.metadata.json`, `.nodes`, `.edges` and `.strings`
files (all sharing the path prefix `in_prefix`) produced by `Profile.take_snapshot`.
"""
| 10 | + |
# Struct-of-arrays storage for edges: one vector per field, which avoids the padding an
# array of per-edge structs would incur.
struct Edges
    type::Vector{Int8}          # index into `snapshot.meta.edge_types`
    name_or_index::Vector{UInt} # Either an index into `snapshot.strings`, or the index in an array, depending on edge_type
    to_pos::Vector{UInt}        # index into `snapshot.nodes`
end

# Allocate room for `n` edges. The contents of every vector are uninitialized.
function Edges(n::Int)
    edge_types = Vector{Int8}(undef, n)
    names_or_indices = Vector{UInt}(undef, n)
    target_positions = Vector{UInt}(undef, n)
    return Edges(edge_types, names_or_indices, target_positions)
end

# The edge count is taken from the `type` vector.
Base.length(edges::Edges) = length(edges.type)
| 25 | + |
# Struct-of-arrays storage for nodes, plus the edges that leave them.
# trace_node_id and detachedness are always 0 in the snapshots Julia produces so we don't store them
struct Nodes
    type::Vector{Int8}       # index into `snapshot.meta.node_types`
    name_idx::Vector{UInt32} # index into `snapshot.strings`
    id::Vector{UInt}         # unique id, in julia it is the address of the object
    self_size::Vector{Int}   # size of the object itself, not including the size of its fields
    edge_count::Vector{UInt} # number of outgoing edges
    edges::Edges             # outgoing edges
    # This is the main complexity of the .heapsnapshot format, and it's the reason we need
    # to read in all the data before writing it out. The edges vector contains all edges,
    # but organized by which node they came from. First, it contains all the edges coming
    # out of node 0, then all edges leaving node 1, etc. So we need to have visited all
    # edges, and assigned them to their corresponding nodes, before we can emit the file.
    edge_idxs::Vector{Vector{UInt}} # indexes into edges, keeping per-node outgoing edge ids
end

# Allocate room for `n` nodes and `e` edges. The per-node vectors are uninitialized and
# every node starts with its own empty list of outgoing edge indexes.
function Nodes(n::Int, e::Int)
    return Nodes(
        Vector{Int8}(undef, n),
        Vector{UInt32}(undef, n),
        Vector{UInt}(undef, n),
        Vector{Int}(undef, n),
        # Allocate exactly the declared field type (`Vector{UInt}`). Passing a
        # `Vector{UInt32}` here only works because the default constructor converts it,
        # which costs a second allocation and a copy.
        Vector{UInt}(undef, n),
        Edges(e),
        # Take care to construct n separate empty vectors. The explicit element type keeps
        # the result a `Vector{Vector{UInt}}` even when `n == 0`.
        Vector{UInt}[Vector{UInt}() for _ in 1:n],
    )
end

# The node count is taken from the `type` vector.
Base.length(nodes::Nodes) = length(nodes.type)
| 53 | + |
# Number of values written per node in the "nodes" array of the output (type, name index,
# id, self size, edge count and two trailing zeros). Edge targets (`to_pos`) are node
# indexes multiplied by this value.
const k_node_number_of_fields = 7
| 55 | + |
# Decimal formatter in the spirit of `Base.dec`, except that no String is allocated: the
# digits are staged in a caller-supplied byte buffer and written straight to `io`.
# Every number this module writes is non-negative and fits into a UInt.
# Returns whatever `write` returns for the staged digits.
let digit_pairs = UInt16[(0x30 + k % 10) << 0x8 + (0x30 + k ÷ 10) for k = 0:99]
    global _write_decimal_number

    # Signed inputs are reinterpreted as unsigned before formatting.
    _write_decimal_number(io, x::Integer, buf) = _write_decimal_number(io, unsigned(x), buf)

    function _write_decimal_number(io, x::Unsigned, digits_buf)
        total_digits = ndigits(x)
        pos = total_digits
        remaining = x
        # Fill the buffer from the right, two digits at a time. Each table entry packs
        # the tens digit in its low byte and the ones digit in its high byte.
        @inbounds while pos >= 2
            quotient, pair = divrem(remaining, 0x64)
            packed = digit_pairs[(pair % Int)::Int + 1]
            digits_buf[pos - 1] = packed % UInt8
            digits_buf[pos] = (packed >> 0x8) % UInt8
            remaining = oftype(remaining, quotient)
            pos -= 2
        end
        # An odd digit count leaves exactly one leading digit to emit.
        if pos == 1
            @inbounds digits_buf[1] = 0x30 + (rem(remaining, 0xa) % UInt8)::UInt8
        end
        # `pos` is now 0 or 1, so the digits always occupy `1:total_digits`.
        return write(io, @view digits_buf[1:total_digits])
    end
end
| 79 | + |
"""
    assemble_snapshot(in_prefix, out_file::AbstractString = in_prefix)

Assemble a heap snapshot from the files that share the path prefix `in_prefix` and write
it to the file `out_file`, which is opened for writing. When `out_file` is omitted, the
output path is `in_prefix` itself.
"""
function assemble_snapshot(in_prefix, out_file::AbstractString = in_prefix)
    # The function form of `open` closes the output file once the snapshot is written.
    return open(out_io -> assemble_snapshot(in_prefix, out_io), out_file, "w")
end
| 85 | + |
# Manually parse and write the .json files, given that we don't have JSON import/export in
# julia's stdlibs.
#
# Reads `<in_prefix>.metadata.json`, `<in_prefix>.nodes`, `<in_prefix>.edges` and
# `<in_prefix>.strings`, and writes the assembled snapshot JSON to `io`. Returns `nothing`.
#
# Throws an `AssertionError` when a node record has a non-zero trace_node_id or
# detachedness, when a node's edge count disagrees with its recorded edge indexes, or when
# a node other than node 0 has no incoming edge. The orphan check runs only after the
# output has been written.
function assemble_snapshot(in_prefix, io::IO)
    preamble = read(string(in_prefix, ".metadata.json"), String)
    # node_count is terminated by ',' and the edge_count that follows it by '}'.
    pos = last(findfirst("node_count\":", preamble)) + 1
    endpos = findnext(==(','), preamble, pos) - 1
    node_count = parse(Int, String(@view preamble[pos:endpos]))

    pos = last(findnext("edge_count\":", preamble, endpos)) + 1
    endpos = findnext(==('}'), preamble, pos) - 1
    edge_count = parse(Int, String(@view preamble[pos:endpos]))

    nodes = Nodes(node_count, edge_count)

    orphans = Set{UInt}() # nodes that have no incoming edges
    # Parse nodes with empty edge counts that we need to fill later.
    # The do-block form of `open` closes the file even if parsing throws.
    open(string(in_prefix, ".nodes"), "r") do nodes_file
        for i in 1:length(nodes)
            node_type = read(nodes_file, Int8)
            node_name_idx = read(nodes_file, UInt)
            id = read(nodes_file, UInt)
            self_size = read(nodes_file, Int)
            # Read into locals first: a `read` placed inside `@assert` would be skipped,
            # and the stream position corrupted, if assertions were ever elided.
            trace_node_id = read(nodes_file, Int)
            @assert trace_node_id == 0
            detachedness = read(nodes_file, Int8)
            @assert detachedness == 0

            nodes.type[i] = node_type
            nodes.name_idx[i] = node_name_idx
            nodes.id[i] = id
            nodes.self_size[i] = self_size
            nodes.edge_count[i] = 0 # filled in while parsing the edges
            # populate the orphans set with the 0-based node index
            push!(orphans, i - 1)
        end
    end

    # Parse the edges to fill in the edge counts for nodes and correct the to_node offsets
    open(string(in_prefix, ".edges"), "r") do edges_file
        for i in 1:length(nodes.edges)
            edge_type = read(edges_file, Int8)
            edge_name_or_index = read(edges_file, UInt)
            from_node = read(edges_file, UInt)
            to_node = read(edges_file, UInt)

            nodes.edges.type[i] = edge_type
            nodes.edges.name_or_index[i] = edge_name_or_index
            # 7 fields per node, the streaming format doesn't multiply the offset by 7
            nodes.edges.to_pos[i] = to_node * k_node_number_of_fields
            nodes.edge_count[from_node + 1] += 1     # C and JSON use 0-based indexing
            push!(nodes.edge_idxs[from_node + 1], i) # Index into nodes.edges
            # A node with at least one incoming edge is not an orphan. `delete!` is a
            # no-op when the node has already been removed.
            delete!(orphans, to_node)
        end
    end

    _digits_buf = zeros(UInt8, ndigits(typemax(UInt)))
    println(io, @view(preamble[1:end-2]), ",") # remove trailing "}\n", we don't end the snapshot here
    println(io, "\"nodes\":[")
    for i in 1:length(nodes)
        i > 1 && println(io, ",")
        _write_decimal_number(io, nodes.type[i], _digits_buf)
        print(io, ",")
        _write_decimal_number(io, nodes.name_idx[i], _digits_buf)
        print(io, ",")
        _write_decimal_number(io, nodes.id[i], _digits_buf)
        print(io, ",")
        _write_decimal_number(io, nodes.self_size[i], _digits_buf)
        print(io, ",")
        _write_decimal_number(io, nodes.edge_count[i], _digits_buf)
        print(io, ",0,0") # trace_node_id and detachedness
    end
    print(io, "],\"edges\":[")
    # Edges are emitted grouped by source node, in node order.
    e = 1
    for n in 1:length(nodes)
        count = nodes.edge_count[n]
        len_edges = length(nodes.edge_idxs[n])
        @assert count == len_edges "For node $n: $count != $len_edges"
        for i in nodes.edge_idxs[n]
            e > 1 && print(io, ",")
            println(io)
            _write_decimal_number(io, nodes.edges.type[i], _digits_buf)
            print(io, ",")
            _write_decimal_number(io, nodes.edges.name_or_index[i], _digits_buf)
            print(io, ",")
            _write_decimal_number(io, nodes.edges.to_pos[i], _digits_buf)
            if !(nodes.edges.to_pos[i] % k_node_number_of_fields == 0)
                @warn "Bug in to_pos for edge $i from node $n: $(nodes.edges.to_pos[i])"
            end
            e += 1
        end
    end
    println(io, "],")

    println(io, "\"strings\":[")
    # Each record in the strings file is a UInt byte length followed by that many bytes.
    open(string(in_prefix, ".strings"), "r") do strings_io
        is_first = true
        while !eof(strings_io)
            str_size = read(strings_io, UInt)
            str_bytes = read(strings_io, str_size)
            str = String(str_bytes)
            is_first || print(io, ",\n")
            print_str_escape_json(io, str)
            is_first = false
        end
    end
    print(io, "]}")

    # remove the uber node from the orphans
    delete!(orphans, 0)

    @assert isempty(orphans) "Orphaned nodes: $(orphans), node count: $(length(nodes)), orphan node count: $(length(orphans))"

    return nothing
end
| 204 | + |
# Write `s` to `stream` as a double-quoted JSON string literal. Quotes, backslashes and the
# control characters with a short JSON escape use that escape; any other character in
# '\x00'..'\x1f' is written as a four-digit `\u` escape; everything else is copied as is.
function print_str_escape_json(stream::IO, s::AbstractString)
    print(stream, '"')
    for ch in s
        short_escape = ch == '"'  ? "\\\"" :
                       ch == '\\' ? "\\\\" :
                       ch == '\b' ? "\\b"  :
                       ch == '\f' ? "\\f"  :
                       ch == '\n' ? "\\n"  :
                       ch == '\r' ? "\\r"  :
                       ch == '\t' ? "\\t"  : nothing
        if short_escape !== nothing
            print(stream, short_escape)
        elseif '\x00' <= ch <= '\x1f'
            # Remaining control characters: zero-padded lowercase hex code point.
            print(stream, "\\u", string(UInt16(ch), base=16, pad=4))
        else
            print(stream, ch)
        end
    end
    print(stream, '"')
    return nothing
end
| 230 | + |
| 231 | +end |
0 commit comments