Skip to content

Commit caecb6c

Browse files
jeffhostetler authored and mjcheetham committed
survey: show some commits/trees/blobs histograms
With this commit, we gather statistics about the sizes of commits, trees, and blobs in the repository, and then present them in the form of "hexbins", i.e. log(16) histograms that show how many objects fall into the 0..15 bytes range, the 16..255 range, the 256..4095 range, etc. For commits, we also show the total count grouped by the number of parents, and for trees we additionally show the total count grouped by number of entries in the form of "qbins", i.e. log(4) histograms. Signed-off-by: Jeff Hostetler <jeffhostetler@github.com> Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de>
1 parent 45fa069 commit caecb6c

File tree

1 file changed

+334
-4
lines changed

1 file changed

+334
-4
lines changed

builtin/survey.c

Lines changed: 334 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
#include "strbuf.h"
1717
#include "strvec.h"
1818
#include "trace2.h"
19+
#include "tree.h"
20+
#include "tree-walk.h"
1921

2022
static const char * const survey_usage[] = {
2123
N_("(EXPERIMENTAL!) git survey <options>"),
@@ -69,11 +71,162 @@ struct survey_report_ref_summary {
6971
size_t len_sum_remote_refnames;
7072
};
7173

74+
/*
75+
* HBIN -- hex binning (histogram bucketing).
76+
*
77+
* We create histograms for various counts and sums. Since we have a
78+
* wide range of values (objects range in size from 1 to 4G bytes), a
79+
* linear bucketing is not interesting. Instead, let's use a
80+
* log16()-based bucketing. This gives us a better spread on the low
81+
* and middle range and a coarse bucketing on the high end.
82+
*
83+
* The idea here is that it doesn't matter if you have n 1GB blobs or
84+
* n/2 1GB blobs and n/2 1.5GB blobs -- either way you have a scaling
85+
* problem that we want to report on.
86+
*/
87+
/* One bin per hex digit (nibble) of an unsigned long. */
#define HBIN_LEN (sizeof(unsigned long) * 2)
#define HBIN_MASK (0xF)
#define HBIN_SHIFT (4)

/*
 * Return the log16 histogram bin for `value`.
 *
 * Bin 0 holds 0..15, bin 1 holds 16..255, bin 2 holds 256..4095, and
 * so on; in general the result is the index of the most significant
 * non-zero hex digit of `value` (0 when `value` is 0).  The result is
 * always in the range [0, HBIN_LEN).
 */
static int hbin(unsigned long value)
{
	size_t k;

	for (k = 0; k < HBIN_LEN; k++) {
		/* Nothing above the low nibble: `value` belongs to bin k. */
		if (!(value & ~(unsigned long)HBIN_MASK))
			return (int)k;
		value >>= HBIN_SHIFT;
	}

	/*
	 * Not reachable: HBIN_LEN nibbles cover every bit of an unsigned
	 * long.  Clamp to the last bin so that a large value could never
	 * be mistaken for a tiny one.
	 */
	return (int)(HBIN_LEN - 1);
}
101+
102+
/*
103+
* QBIN -- base4 binning (histogram bucketing).
104+
*
105+
* This is the same idea as the above, but we want better granularity
106+
* in the low end and don't expect as many large values.
107+
*/
108+
/* One bin per base-4 digit (bit pair) of an unsigned long. */
#define QBIN_LEN (sizeof(unsigned long) * 4)
#define QBIN_MASK (0x3)
#define QBIN_SHIFT (2)

/*
 * Return the log4 histogram bin for `value`.
 *
 * Bin 0 holds 0..3, bin 1 holds 4..15, bin 2 holds 16..63, and so on;
 * in general the result is the index of the most significant non-zero
 * base-4 digit of `value` (0 when `value` is 0).  The result is always
 * in the range [0, QBIN_LEN).
 */
static int qbin(unsigned long value)
{
	size_t k;

	for (k = 0; k < QBIN_LEN; k++) {
		/* Nothing above the low bit pair: `value` belongs to bin k. */
		if (!(value & ~(unsigned long)QBIN_MASK))
			return (int)k;
		value >>= QBIN_SHIFT;
	}

	/*
	 * Not reachable: QBIN_LEN bit pairs cover every bit of an unsigned
	 * long.  Clamp to the last bin so that a large value could never
	 * be mistaken for a tiny one.
	 */
	return (int)(QBIN_LEN - 1);
}
122+
123+
/*
124+
* histogram bin for objects.
125+
*/
126+
struct obj_hist_bin {
	/* Running total of the object sizes recorded in this bin. */
	uint64_t sum_size;

	/* Running total of the on-disk sizes recorded in this bin. */
	uint64_t sum_disk_size;

	/* Number of objects recorded in this bin. */
	uint32_t cnt_seen;
};
131+
132+
static void incr_obj_hist_bin(struct obj_hist_bin *pbin,
133+
unsigned long object_length,
134+
off_t disk_sizep)
135+
{
136+
pbin->sum_size += object_length;
137+
pbin->sum_disk_size += disk_sizep;
138+
pbin->cnt_seen++;
139+
}
140+
141+
/*
142+
* Common fields for any type of object.
143+
*/
144+
struct survey_stats_base_object {
145+
uint32_t cnt_seen;
146+
147+
uint32_t cnt_missing; /* we may have a partial clone. */
148+
149+
/*
150+
* Number of objects grouped by where they are stored on disk.
151+
* This is a function of how the ODB is packed.
152+
*/
153+
uint32_t cnt_cached; /* see oi.whence */
154+
uint32_t cnt_loose; /* see oi.whence */
155+
uint32_t cnt_packed; /* see oi.whence */
156+
uint32_t cnt_dbcached; /* see oi.whence */
157+
158+
uint64_t sum_size; /* sum(object_size) */
159+
uint64_t sum_disk_size; /* sum(disk_size) */
160+
161+
/*
162+
* A histogram of the count of objects, the observed size, and
163+
* the on-disk size grouped by the observed size.
164+
*/
165+
struct obj_hist_bin size_hbin[HBIN_LEN];
166+
};
167+
168+
/*
169+
* PBIN -- parent vector binning (histogram bucketing).
170+
*
171+
* We create a histogram based upon the number of parents
172+
* in a commit. This is a simple linear vector. It starts
173+
* at zero for "initial" commits.
174+
*
175+
* If a commit has more parents, just put it in the last bin.
176+
*/
177+
#define PBIN_VEC_LEN (32)
178+
179+
struct survey_stats_commits {
180+
struct survey_stats_base_object base;
181+
182+
/*
183+
* Count of commits with k parents.
184+
*/
185+
uint32_t parent_cnt_pbin[PBIN_VEC_LEN];
186+
};
187+
188+
/*
189+
* Stats for reachable trees.
190+
*/
191+
struct survey_stats_trees {
192+
struct survey_stats_base_object base;
193+
194+
/*
195+
* In the following, nr_entries refers to the number of files or
196+
* subdirectories in a tree. We are interested in how wide the
197+
* tree is and if the repo has gigantic directories.
198+
*/
199+
uint64_t max_entries; /* max(nr_entries) -- the width of the largest tree */
200+
201+
/*
202+
* Computing the sum of the number of entries across all trees
203+
* is probably not that interesting.
204+
*/
205+
uint64_t sum_entries; /* sum(nr_entries) -- sum across all trees */
206+
207+
/*
208+
* A histogram of the count of trees, the observed size, and
209+
* the on-disk size grouped by the number of entries in the tree.
210+
*/
211+
struct obj_hist_bin entry_qbin[QBIN_LEN];
212+
};
213+
214+
/*
215+
* Stats for reachable blobs.
216+
*/
217+
struct survey_stats_blobs {
218+
struct survey_stats_base_object base;
219+
};
220+
72221
struct survey_report_object_summary {
73222
size_t commits_nr;
74223
size_t tags_nr;
75224
size_t trees_nr;
76225
size_t blobs_nr;
226+
227+
struct survey_stats_commits commits;
228+
struct survey_stats_trees trees;
229+
struct survey_stats_blobs blobs;
77230
};
78231

79232
/**
@@ -363,6 +516,98 @@ static void print_table_plaintext(struct survey_table *table)
363516
free(column_widths);
364517
}
365518

519+
static void pretty_print_bin_table(const char *title_caption,
520+
const char *bucket_header,
521+
struct obj_hist_bin *bin,
522+
uint64_t bin_len, int bin_shift, uint64_t bin_mask)
523+
{
524+
struct survey_table table = SURVEY_TABLE_INIT;
525+
struct strbuf bucket = STRBUF_INIT, cnt_seen = STRBUF_INIT;
526+
struct strbuf sum_size = STRBUF_INIT, sum_disk_size = STRBUF_INIT;
527+
uint64_t lower = 0;
528+
uint64_t upper = bin_mask;
529+
530+
table.table_name = title_caption;
531+
strvec_pushl(&table.header, bucket_header, "Count", "Size", "Disk Size", NULL);
532+
533+
for (int k = 0; k < bin_len; k++) {
534+
struct obj_hist_bin *p = bin + k;
535+
uintmax_t lower_k = lower;
536+
uintmax_t upper_k = upper;
537+
538+
lower = upper+1;
539+
upper = (upper << bin_shift) + bin_mask;
540+
541+
if (!p->cnt_seen)
542+
continue;
543+
544+
strbuf_reset(&bucket);
545+
strbuf_addf(&bucket, "%"PRIuMAX"..%"PRIuMAX, lower_k, upper_k);
546+
547+
strbuf_reset(&cnt_seen);
548+
strbuf_addf(&cnt_seen, "%"PRIuMAX, (uintmax_t)p->cnt_seen);
549+
550+
strbuf_reset(&sum_size);
551+
strbuf_addf(&sum_size, "%"PRIuMAX, (uintmax_t)p->sum_size);
552+
553+
strbuf_reset(&sum_disk_size);
554+
strbuf_addf(&sum_disk_size, "%"PRIuMAX, (uintmax_t)p->sum_disk_size);
555+
556+
insert_table_rowv(&table, bucket.buf,
557+
cnt_seen.buf, sum_size.buf, sum_disk_size.buf, NULL);
558+
}
559+
strbuf_release(&bucket);
560+
strbuf_release(&cnt_seen);
561+
strbuf_release(&sum_size);
562+
strbuf_release(&sum_disk_size);
563+
564+
print_table_plaintext(&table);
565+
clear_table(&table);
566+
}
567+
568+
static void survey_report_hbin(const char *title_caption,
569+
struct obj_hist_bin *bin)
570+
{
571+
pretty_print_bin_table(title_caption,
572+
"Byte Range",
573+
bin,
574+
HBIN_LEN, HBIN_SHIFT, HBIN_MASK);
575+
}
576+
577+
static void survey_report_tree_lengths(struct survey_context *ctx)
578+
{
579+
pretty_print_bin_table(_("TREE HISTOGRAM BY NUMBER OF ENTRIES"),
580+
"Entry Range",
581+
ctx->report.reachable_objects.trees.entry_qbin,
582+
QBIN_LEN, QBIN_SHIFT, QBIN_MASK);
583+
}
584+
585+
static void survey_report_commit_parents(struct survey_context *ctx)
586+
{
587+
struct survey_stats_commits *psc = &ctx->report.reachable_objects.commits;
588+
struct survey_table table = SURVEY_TABLE_INIT;
589+
struct strbuf parents = STRBUF_INIT, counts = STRBUF_INIT;
590+
591+
table.table_name = _("HISTOGRAM BY NUMBER OF COMMIT PARENTS");
592+
strvec_pushl(&table.header, "Parents", "Counts", NULL);
593+
594+
for (int k = 0; k < PBIN_VEC_LEN; k++)
595+
if (psc->parent_cnt_pbin[k]) {
596+
strbuf_reset(&parents);
597+
strbuf_addf(&parents, "%02d", k);
598+
599+
strbuf_reset(&counts);
600+
strbuf_addf(&counts, "%14"PRIuMAX, (uintmax_t)psc->parent_cnt_pbin[k]);
601+
602+
insert_table_rowv(&table, parents.buf, counts.buf, NULL);
603+
}
604+
strbuf_release(&parents);
605+
strbuf_release(&counts);
606+
607+
print_table_plaintext(&table);
608+
clear_table(&table);
609+
}
610+
366611
static void survey_report_plaintext_refs(struct survey_context *ctx)
367612
{
368613
struct survey_report_ref_summary *refs = &ctx->report.refs;
@@ -515,6 +760,19 @@ static void survey_report_plaintext(struct survey_context *ctx)
515760
ctx->report.by_type,
516761
REPORT_TYPE_COUNT);
517762

763+
survey_report_commit_parents(ctx);
764+
765+
survey_report_hbin(_("COMMITS HISTOGRAM BY SIZE IN BYTES"),
766+
ctx->report.reachable_objects.commits.base.size_hbin);
767+
768+
survey_report_tree_lengths(ctx);
769+
770+
survey_report_hbin(_("TREES HISTOGRAM BY SIZE IN BYTES"),
771+
ctx->report.reachable_objects.trees.base.size_hbin);
772+
773+
survey_report_hbin(_("BLOBS HISTOGRAM BY SIZE IN BYTES"),
774+
ctx->report.reachable_objects.blobs.base.size_hbin);
775+
518776
survey_report_plaintext_sorted_size(
519777
&ctx->report.top_paths_by_count[REPORT_TYPE_TREE]);
520778
survey_report_plaintext_sorted_size(
@@ -783,6 +1041,8 @@ static void increment_totals(struct survey_context *ctx,
7831041
unsigned long object_length = 0;
7841042
off_t disk_sizep = 0;
7851043
enum object_type type;
1044+
struct survey_stats_base_object *base;
1045+
int hb;
7861046

7871047
oi.typep = &type;
7881048
oi.sizep = &object_length;
@@ -791,11 +1051,81 @@ static void increment_totals(struct survey_context *ctx,
7911051
if (oid_object_info_extended(ctx->repo, &oids->oid[i],
7921052
&oi, oi_flags) < 0) {
7931053
summary->num_missing++;
794-
} else {
795-
summary->nr++;
796-
summary->disk_size += disk_sizep;
797-
summary->inflated_size += object_length;
1054+
continue;
1055+
}
1056+
1057+
summary->nr++;
1058+
summary->disk_size += disk_sizep;
1059+
summary->inflated_size += object_length;
1060+
1061+
switch (type) {
1062+
case OBJ_COMMIT: {
1063+
struct commit *commit = lookup_commit(ctx->repo, &oids->oid[i]);
1064+
unsigned k = commit_list_count(commit->parents);
1065+
1066+
if (k >= PBIN_VEC_LEN)
1067+
k = PBIN_VEC_LEN - 1;
1068+
1069+
ctx->report.reachable_objects.commits.parent_cnt_pbin[k]++;
1070+
base = &ctx->report.reachable_objects.commits.base;
1071+
break;
7981072
}
1073+
case OBJ_TREE: {
1074+
struct tree *tree = lookup_tree(ctx->repo, &oids->oid[i]);
1075+
if (tree) {
1076+
struct survey_stats_trees *pst = &ctx->report.reachable_objects.trees;
1077+
struct tree_desc desc;
1078+
struct name_entry entry;
1079+
int nr_entries;
1080+
int qb;
1081+
1082+
parse_tree(tree);
1083+
init_tree_desc(&desc, &oids->oid[i], tree->buffer, tree->size);
1084+
nr_entries = 0;
1085+
while (tree_entry(&desc, &entry))
1086+
nr_entries++;
1087+
1088+
pst->sum_entries += nr_entries;
1089+
1090+
if (nr_entries > pst->max_entries)
1091+
pst->max_entries = nr_entries;
1092+
1093+
qb = qbin(nr_entries);
1094+
incr_obj_hist_bin(&pst->entry_qbin[qb], object_length, disk_sizep);
1095+
}
1096+
base = &ctx->report.reachable_objects.trees.base;
1097+
break;
1098+
}
1099+
case OBJ_BLOB:
1100+
base = &ctx->report.reachable_objects.blobs.base;
1101+
break;
1102+
default:
1103+
continue;
1104+
}
1105+
1106+
switch (oi.whence) {
1107+
case OI_CACHED:
1108+
base->cnt_cached++;
1109+
break;
1110+
case OI_LOOSE:
1111+
base->cnt_loose++;
1112+
break;
1113+
case OI_PACKED:
1114+
base->cnt_packed++;
1115+
break;
1116+
case OI_DBCACHED:
1117+
base->cnt_dbcached++;
1118+
break;
1119+
default:
1120+
break;
1121+
}
1122+
1123+
base->sum_size += object_length;
1124+
base->sum_disk_size += disk_sizep;
1125+
1126+
hb = hbin(object_length);
1127+
incr_obj_hist_bin(&base->size_hbin[hb], object_length, disk_sizep);
1128+
7991129
}
8001130
}
8011131

0 commit comments

Comments
 (0)