16
16
#include "strbuf.h"
17
17
#include "strvec.h"
18
18
#include "trace2.h"
19
+ #include "tree.h"
20
+ #include "tree-walk.h"
19
21
20
22
static const char * const survey_usage [] = {
21
23
N_ ("(EXPERIMENTAL!) git survey <options>" ),
@@ -69,11 +71,162 @@ struct survey_report_ref_summary {
69
71
size_t len_sum_remote_refnames ;
70
72
};
71
73
74
/*
 * HBIN -- hex binning (histogram bucketing).
 *
 * We create histograms for various counts and sums.  Since we have a
 * wide range of values (objects range in size from 1 to 4G bytes), a
 * linear bucketing is not interesting.  Instead, let's use a
 * log16()-based bucketing.  This gives us a better spread on the low
 * and middle range and a coarse bucketing on the high end.
 *
 * The idea here is that it doesn't matter if you have n 1GB blobs or
 * n/2 1GB blobs and n/2 1.5GB blobs -- either way you have a scaling
 * problem that we want to report on.
 *
 * There is one bin per hex digit of an `unsigned long`, so HBIN_LEN
 * bins are enough for every value that type can hold.
 */
#define HBIN_LEN (sizeof(unsigned long) * 2)
#define HBIN_MASK (0xF)
#define HBIN_SHIFT (4)

/*
 * Return the index of the hex bin that `value` falls into: bin 0 is
 * 0..15, bin 1 is 16..255, bin 2 is 256..4095, and so on.  The result
 * is always in [0, HBIN_LEN).
 */
static int hbin(unsigned long value)
{
	/*
	 * HBIN_LEN is a sizeof expression (size_t), so index with a
	 * size_t to avoid a signed/unsigned comparison.
	 */
	for (size_t k = 0; k < HBIN_LEN; k++) {
		/* Nothing left above the low hex digit: this is the bin. */
		if ((value & ~(unsigned long)HBIN_MASK) == 0)
			return (int)k;
		value >>= HBIN_SHIFT;
	}

	return 0; /* should not happen */
}
101
+
102
/*
 * QBIN -- base4 binning (histogram bucketing).
 *
 * This is the same idea as HBIN, but we want better granularity
 * in the low end and don't expect as many large values.
 *
 * There is one bin per base-4 digit (two bits) of an `unsigned long`,
 * so QBIN_LEN bins are enough for every value that type can hold.
 */
#define QBIN_LEN (sizeof(unsigned long) * 4)
#define QBIN_MASK (0x3)
#define QBIN_SHIFT (2)

/*
 * Return the index of the base-4 bin that `value` falls into: bin 0
 * is 0..3, bin 1 is 4..15, bin 2 is 16..63, and so on.  The result is
 * always in [0, QBIN_LEN).
 */
static int qbin(unsigned long value)
{
	/*
	 * QBIN_LEN is a sizeof expression (size_t), so index with a
	 * size_t to avoid a signed/unsigned comparison.
	 */
	for (size_t k = 0; k < QBIN_LEN; k++) {
		/* Nothing left above the low base-4 digit: this is the bin. */
		if ((value & ~(unsigned long)QBIN_MASK) == 0)
			return (int)k;
		value >>= QBIN_SHIFT;
	}

	return 0; /* should not happen */
}
122
+
123
/*
 * One bucket of an object histogram: running totals for every object
 * that was assigned to this bucket.
 */
struct obj_hist_bin {
	/* sum(object_size) for all objects in this bin */
	uint64_t sum_size;

	/* sum(on_disk_size) for all objects in this bin */
	uint64_t sum_disk_size;

	/* number seen in this bin */
	uint32_t cnt_seen;
};
131
+
132
+ static void incr_obj_hist_bin (struct obj_hist_bin * pbin ,
133
+ unsigned long object_length ,
134
+ off_t disk_sizep )
135
+ {
136
+ pbin -> sum_size += object_length ;
137
+ pbin -> sum_disk_size += disk_sizep ;
138
+ pbin -> cnt_seen ++ ;
139
+ }
140
+
141
+ /*
142
+ * Common fields for any type of object.
143
+ */
144
+ struct survey_stats_base_object {
145
+ uint32_t cnt_seen ;
146
+
147
+ uint32_t cnt_missing ; /* we may have a partial clone. */
148
+
149
+ /*
150
+ * Number of objects grouped by where they are stored on disk.
151
+ * This is a function of how the ODB is packed.
152
+ */
153
+ uint32_t cnt_cached ; /* see oi.whence */
154
+ uint32_t cnt_loose ; /* see oi.whence */
155
+ uint32_t cnt_packed ; /* see oi.whence */
156
+ uint32_t cnt_dbcached ; /* see oi.whence */
157
+
158
+ uint64_t sum_size ; /* sum(object_size) */
159
+ uint64_t sum_disk_size ; /* sum(disk_size) */
160
+
161
+ /*
162
+ * A histogram of the count of objects, the observed size, and
163
+ * the on-disk size grouped by the observed size.
164
+ */
165
+ struct obj_hist_bin size_hbin [HBIN_LEN ];
166
+ };
167
+
168
+ /*
169
+ * PBIN -- parent vector binning (histogram bucketing).
170
+ *
171
+ * We create a histogram based upon the number of parents
172
+ * in a commit. This is a simple linear vector. It starts
173
+ * at zero for "initial" commits.
174
+ *
175
+ * If a commit has more parents, just put it in the last bin.
176
+ */
177
+ #define PBIN_VEC_LEN (32)
178
+
179
+ struct survey_stats_commits {
180
+ struct survey_stats_base_object base ;
181
+
182
+ /*
183
+ * Count of commits with k parents.
184
+ */
185
+ uint32_t parent_cnt_pbin [PBIN_VEC_LEN ];
186
+ };
187
+
188
+ /*
189
+ * Stats for reachable trees.
190
+ */
191
+ struct survey_stats_trees {
192
+ struct survey_stats_base_object base ;
193
+
194
+ /*
195
+ * In the following, nr_entries refers to the number of files or
196
+ * subdirectories in a tree. We are interested in how wide the
197
+ * tree is and if the repo has gigantic directories.
198
+ */
199
+ uint64_t max_entries ; /* max(nr_entries) -- the width of the largest tree */
200
+
201
+ /*
202
+ * Computing the sum of the number of entries across all trees
203
+ * is probably not that interesting.
204
+ */
205
+ uint64_t sum_entries ; /* sum(nr_entries) -- sum across all trees */
206
+
207
+ /*
208
+ * A histogram of the count of trees, the observed size, and
209
+ * the on-disk size grouped by the number of entries in the tree.
210
+ */
211
+ struct obj_hist_bin entry_qbin [QBIN_LEN ];
212
+ };
213
+
214
+ /*
215
+ * Stats for reachable blobs.
216
+ */
217
+ struct survey_stats_blobs {
218
+ struct survey_stats_base_object base ;
219
+ };
220
+
72
221
struct survey_report_object_summary {
73
222
size_t commits_nr ;
74
223
size_t tags_nr ;
75
224
size_t trees_nr ;
76
225
size_t blobs_nr ;
226
+
227
+ struct survey_stats_commits commits ;
228
+ struct survey_stats_trees trees ;
229
+ struct survey_stats_blobs blobs ;
77
230
};
78
231
79
232
/**
@@ -363,6 +516,98 @@ static void print_table_plaintext(struct survey_table *table)
363
516
free (column_widths );
364
517
}
365
518
519
+ static void pretty_print_bin_table (const char * title_caption ,
520
+ const char * bucket_header ,
521
+ struct obj_hist_bin * bin ,
522
+ uint64_t bin_len , int bin_shift , uint64_t bin_mask )
523
+ {
524
+ struct survey_table table = SURVEY_TABLE_INIT ;
525
+ struct strbuf bucket = STRBUF_INIT , cnt_seen = STRBUF_INIT ;
526
+ struct strbuf sum_size = STRBUF_INIT , sum_disk_size = STRBUF_INIT ;
527
+ uint64_t lower = 0 ;
528
+ uint64_t upper = bin_mask ;
529
+
530
+ table .table_name = title_caption ;
531
+ strvec_pushl (& table .header , bucket_header , "Count" , "Size" , "Disk Size" , NULL );
532
+
533
+ for (int k = 0 ; k < bin_len ; k ++ ) {
534
+ struct obj_hist_bin * p = bin + k ;
535
+ uintmax_t lower_k = lower ;
536
+ uintmax_t upper_k = upper ;
537
+
538
+ lower = upper + 1 ;
539
+ upper = (upper << bin_shift ) + bin_mask ;
540
+
541
+ if (!p -> cnt_seen )
542
+ continue ;
543
+
544
+ strbuf_reset (& bucket );
545
+ strbuf_addf (& bucket , "%" PRIuMAX "..%" PRIuMAX , lower_k , upper_k );
546
+
547
+ strbuf_reset (& cnt_seen );
548
+ strbuf_addf (& cnt_seen , "%" PRIuMAX , (uintmax_t )p -> cnt_seen );
549
+
550
+ strbuf_reset (& sum_size );
551
+ strbuf_addf (& sum_size , "%" PRIuMAX , (uintmax_t )p -> sum_size );
552
+
553
+ strbuf_reset (& sum_disk_size );
554
+ strbuf_addf (& sum_disk_size , "%" PRIuMAX , (uintmax_t )p -> sum_disk_size );
555
+
556
+ insert_table_rowv (& table , bucket .buf ,
557
+ cnt_seen .buf , sum_size .buf , sum_disk_size .buf , NULL );
558
+ }
559
+ strbuf_release (& bucket );
560
+ strbuf_release (& cnt_seen );
561
+ strbuf_release (& sum_size );
562
+ strbuf_release (& sum_disk_size );
563
+
564
+ print_table_plaintext (& table );
565
+ clear_table (& table );
566
+ }
567
+
568
+ static void survey_report_hbin (const char * title_caption ,
569
+ struct obj_hist_bin * bin )
570
+ {
571
+ pretty_print_bin_table (title_caption ,
572
+ "Byte Range" ,
573
+ bin ,
574
+ HBIN_LEN , HBIN_SHIFT , HBIN_MASK );
575
+ }
576
+
577
+ static void survey_report_tree_lengths (struct survey_context * ctx )
578
+ {
579
+ pretty_print_bin_table (_ ("TREE HISTOGRAM BY NUMBER OF ENTRIES" ),
580
+ "Entry Range" ,
581
+ ctx -> report .reachable_objects .trees .entry_qbin ,
582
+ QBIN_LEN , QBIN_SHIFT , QBIN_MASK );
583
+ }
584
+
585
+ static void survey_report_commit_parents (struct survey_context * ctx )
586
+ {
587
+ struct survey_stats_commits * psc = & ctx -> report .reachable_objects .commits ;
588
+ struct survey_table table = SURVEY_TABLE_INIT ;
589
+ struct strbuf parents = STRBUF_INIT , counts = STRBUF_INIT ;
590
+
591
+ table .table_name = _ ("HISTOGRAM BY NUMBER OF COMMIT PARENTS" );
592
+ strvec_pushl (& table .header , "Parents" , "Counts" , NULL );
593
+
594
+ for (int k = 0 ; k < PBIN_VEC_LEN ; k ++ )
595
+ if (psc -> parent_cnt_pbin [k ]) {
596
+ strbuf_reset (& parents );
597
+ strbuf_addf (& parents , "%02d" , k );
598
+
599
+ strbuf_reset (& counts );
600
+ strbuf_addf (& counts , "%14" PRIuMAX , (uintmax_t )psc -> parent_cnt_pbin [k ]);
601
+
602
+ insert_table_rowv (& table , parents .buf , counts .buf , NULL );
603
+ }
604
+ strbuf_release (& parents );
605
+ strbuf_release (& counts );
606
+
607
+ print_table_plaintext (& table );
608
+ clear_table (& table );
609
+ }
610
+
366
611
static void survey_report_plaintext_refs (struct survey_context * ctx )
367
612
{
368
613
struct survey_report_ref_summary * refs = & ctx -> report .refs ;
@@ -515,6 +760,19 @@ static void survey_report_plaintext(struct survey_context *ctx)
515
760
ctx -> report .by_type ,
516
761
REPORT_TYPE_COUNT );
517
762
763
+ survey_report_commit_parents (ctx );
764
+
765
+ survey_report_hbin (_ ("COMMITS HISTOGRAM BY SIZE IN BYTES" ),
766
+ ctx -> report .reachable_objects .commits .base .size_hbin );
767
+
768
+ survey_report_tree_lengths (ctx );
769
+
770
+ survey_report_hbin (_ ("TREES HISTOGRAM BY SIZE IN BYTES" ),
771
+ ctx -> report .reachable_objects .trees .base .size_hbin );
772
+
773
+ survey_report_hbin (_ ("BLOBS HISTOGRAM BY SIZE IN BYTES" ),
774
+ ctx -> report .reachable_objects .blobs .base .size_hbin );
775
+
518
776
survey_report_plaintext_sorted_size (
519
777
& ctx -> report .top_paths_by_count [REPORT_TYPE_TREE ]);
520
778
survey_report_plaintext_sorted_size (
@@ -783,6 +1041,8 @@ static void increment_totals(struct survey_context *ctx,
783
1041
unsigned long object_length = 0 ;
784
1042
off_t disk_sizep = 0 ;
785
1043
enum object_type type ;
1044
+ struct survey_stats_base_object * base ;
1045
+ int hb ;
786
1046
787
1047
oi .typep = & type ;
788
1048
oi .sizep = & object_length ;
@@ -791,11 +1051,81 @@ static void increment_totals(struct survey_context *ctx,
791
1051
if (oid_object_info_extended (ctx -> repo , & oids -> oid [i ],
792
1052
& oi , oi_flags ) < 0 ) {
793
1053
summary -> num_missing ++ ;
794
- } else {
795
- summary -> nr ++ ;
796
- summary -> disk_size += disk_sizep ;
797
- summary -> inflated_size += object_length ;
1054
+ continue ;
1055
+ }
1056
+
1057
+ summary -> nr ++ ;
1058
+ summary -> disk_size += disk_sizep ;
1059
+ summary -> inflated_size += object_length ;
1060
+
1061
+ switch (type ) {
1062
+ case OBJ_COMMIT : {
1063
+ struct commit * commit = lookup_commit (ctx -> repo , & oids -> oid [i ]);
1064
+ unsigned k = commit_list_count (commit -> parents );
1065
+
1066
+ if (k >= PBIN_VEC_LEN )
1067
+ k = PBIN_VEC_LEN - 1 ;
1068
+
1069
+ ctx -> report .reachable_objects .commits .parent_cnt_pbin [k ]++ ;
1070
+ base = & ctx -> report .reachable_objects .commits .base ;
1071
+ break ;
798
1072
}
1073
+ case OBJ_TREE : {
1074
+ struct tree * tree = lookup_tree (ctx -> repo , & oids -> oid [i ]);
1075
+ if (tree ) {
1076
+ struct survey_stats_trees * pst = & ctx -> report .reachable_objects .trees ;
1077
+ struct tree_desc desc ;
1078
+ struct name_entry entry ;
1079
+ int nr_entries ;
1080
+ int qb ;
1081
+
1082
+ parse_tree (tree );
1083
+ init_tree_desc (& desc , & oids -> oid [i ], tree -> buffer , tree -> size );
1084
+ nr_entries = 0 ;
1085
+ while (tree_entry (& desc , & entry ))
1086
+ nr_entries ++ ;
1087
+
1088
+ pst -> sum_entries += nr_entries ;
1089
+
1090
+ if (nr_entries > pst -> max_entries )
1091
+ pst -> max_entries = nr_entries ;
1092
+
1093
+ qb = qbin (nr_entries );
1094
+ incr_obj_hist_bin (& pst -> entry_qbin [qb ], object_length , disk_sizep );
1095
+ }
1096
+ base = & ctx -> report .reachable_objects .trees .base ;
1097
+ break ;
1098
+ }
1099
+ case OBJ_BLOB :
1100
+ base = & ctx -> report .reachable_objects .blobs .base ;
1101
+ break ;
1102
+ default :
1103
+ continue ;
1104
+ }
1105
+
1106
+ switch (oi .whence ) {
1107
+ case OI_CACHED :
1108
+ base -> cnt_cached ++ ;
1109
+ break ;
1110
+ case OI_LOOSE :
1111
+ base -> cnt_loose ++ ;
1112
+ break ;
1113
+ case OI_PACKED :
1114
+ base -> cnt_packed ++ ;
1115
+ break ;
1116
+ case OI_DBCACHED :
1117
+ base -> cnt_dbcached ++ ;
1118
+ break ;
1119
+ default :
1120
+ break ;
1121
+ }
1122
+
1123
+ base -> sum_size += object_length ;
1124
+ base -> sum_disk_size += disk_sizep ;
1125
+
1126
+ hb = hbin (object_length );
1127
+ incr_obj_hist_bin (& base -> size_hbin [hb ], object_length , disk_sizep );
1128
+
799
1129
}
800
1130
}
801
1131
0 commit comments