1616#include "strbuf.h"
1717#include "strvec.h"
1818#include "trace2.h"
19+ #include "tree.h"
20+ #include "tree-walk.h"
1921
2022static const char * const survey_usage [] = {
2123 N_ ("(EXPERIMENTAL!) git survey <options>" ),
@@ -69,11 +71,162 @@ struct survey_report_ref_summary {
6971 size_t len_sum_remote_refnames ;
7072};
7173
/*
 * HBIN -- hex binning (histogram bucketing).
 *
 * Various counts and sums are collected into histograms.  Because the
 * values cover a very wide range (object sizes run from 1 byte up to
 * 4G bytes), linear buckets would not be interesting.  Instead, let's
 * bucket on log16(): one bin per hex digit of magnitude.  That gives a
 * good spread across the low and middle range and a coarse bucketing
 * at the high end.
 *
 * The reasoning is that n 1GB blobs, or n/2 1GB blobs plus n/2 1.5GB
 * blobs, are the same scaling problem as far as this report goes.
 */
#define HBIN_LEN (sizeof(unsigned long) * 2)
#define HBIN_MASK (0xF)
#define HBIN_SHIFT (4)

/*
 * Map `value` to its hex bin.  Bin 0 holds 0..15 and every later bin k
 * holds 16^k .. 16^(k+1)-1, i.e. the bin index is the number of 4-bit
 * shifts needed before the value fits inside HBIN_MASK.
 */
static int hbin(unsigned long value)
{
	int bin = 0;

	while (bin < (int)HBIN_LEN) {
		if (value <= HBIN_MASK)
			return bin;
		value >>= HBIN_SHIFT;
		bin++;
	}

	return 0; /* should not happen */
}
101+
/*
 * QBIN -- base4 binning (histogram bucketing).
 *
 * Same scheme as the hex binning, but with two bits per bin: this
 * gives finer granularity at the low end for quantities where large
 * values are not expected as often.
 */
#define QBIN_LEN (sizeof(unsigned long) * 4)
#define QBIN_MASK (0x3)
#define QBIN_SHIFT (2)

/*
 * Map `value` to its base4 bin.  Bin 0 holds 0..3 and every later bin
 * k holds 4^k .. 4^(k+1)-1.
 */
static int qbin(unsigned long value)
{
	int bin;

	for (bin = 0; bin < (int)QBIN_LEN; bin++, value >>= QBIN_SHIFT)
		if (value <= QBIN_MASK)
			return bin;

	return 0; /* should not happen */
}
122+
/*
 * One histogram bin for objects: running totals for every object that
 * was sorted into this bin.
 */
struct obj_hist_bin {
	uint64_t sum_size;      /* sum(object_size) over the bin */
	uint64_t sum_disk_size; /* sum(on_disk_size) over the bin */
	uint32_t cnt_seen;      /* how many objects landed in the bin */
};

/*
 * Account for one more object in `pbin`: bump the seen counter and add
 * the object's length and on-disk size to the bin's running sums.
 */
static void incr_obj_hist_bin(struct obj_hist_bin *pbin,
			      unsigned long object_length,
			      off_t disk_sizep)
{
	pbin->cnt_seen++;
	pbin->sum_disk_size += disk_sizep;
	pbin->sum_size += object_length;
}
140+
141+ /*
142+ * Common fields for any type of object.
143+ */
144+ struct survey_stats_base_object {
145+ uint32_t cnt_seen ;
146+
147+ uint32_t cnt_missing ; /* we may have a partial clone. */
148+
149+ /*
150+ * Number of objects grouped by where they are stored on disk.
151+ * This is a function of how the ODB is packed.
152+ */
153+ uint32_t cnt_cached ; /* see oi.whence */
154+ uint32_t cnt_loose ; /* see oi.whence */
155+ uint32_t cnt_packed ; /* see oi.whence */
156+ uint32_t cnt_dbcached ; /* see oi.whence */
157+
158+ uint64_t sum_size ; /* sum(object_size) */
159+ uint64_t sum_disk_size ; /* sum(disk_size) */
160+
161+ /*
162+ * A histogram of the count of objects, the observed size, and
163+ * the on-disk size grouped by the observed size.
164+ */
165+ struct obj_hist_bin size_hbin [HBIN_LEN ];
166+ };
167+
168+ /*
169+ * PBIN -- parent vector binning (histogram bucketing).
170+ *
171+ * We create a histogram based upon the number of parents
172+ * in a commit. This is a simple linear vector. It starts
173+ * at zero for "initial" commits.
174+ *
175+ * If a commit has more parents, just put it in the last bin.
176+ */
177+ #define PBIN_VEC_LEN (32)
178+
179+ struct survey_stats_commits {
180+ struct survey_stats_base_object base ;
181+
182+ /*
183+ * Count of commits with k parents.
184+ */
185+ uint32_t parent_cnt_pbin [PBIN_VEC_LEN ];
186+ };
187+
188+ /*
189+ * Stats for reachable trees.
190+ */
191+ struct survey_stats_trees {
192+ struct survey_stats_base_object base ;
193+
194+ /*
195+ * In the following, nr_entries refers to the number of files or
196+ * subdirectories in a tree. We are interested in how wide the
197+ * tree is and if the repo has gigantic directories.
198+ */
199+ uint64_t max_entries ; /* max(nr_entries) -- the width of the largest tree */
200+
201+ /*
202+ * Computing the sum of the number of entries across all trees
203+ * is probably not that interesting.
204+ */
205+ uint64_t sum_entries ; /* sum(nr_entries) -- sum across all trees */
206+
207+ /*
208+ * A histogram of the count of trees, the observed size, and
209+ * the on-disk size grouped by the number of entries in the tree.
210+ */
211+ struct obj_hist_bin entry_qbin [QBIN_LEN ];
212+ };
213+
214+ /*
215+ * Stats for reachable blobs.
216+ */
217+ struct survey_stats_blobs {
218+ struct survey_stats_base_object base ;
219+ };
220+
72221struct survey_report_object_summary {
73222 size_t commits_nr ;
74223 size_t tags_nr ;
75224 size_t trees_nr ;
76225 size_t blobs_nr ;
226+
227+ struct survey_stats_commits commits ;
228+ struct survey_stats_trees trees ;
229+ struct survey_stats_blobs blobs ;
77230};
78231
79232/**
@@ -363,6 +516,98 @@ static void print_table_plaintext(struct survey_table *table)
363516 free (column_widths );
364517}
365518
519+ static void pretty_print_bin_table (const char * title_caption ,
520+ const char * bucket_header ,
521+ struct obj_hist_bin * bin ,
522+ uint64_t bin_len , int bin_shift , uint64_t bin_mask )
523+ {
524+ struct survey_table table = SURVEY_TABLE_INIT ;
525+ struct strbuf bucket = STRBUF_INIT , cnt_seen = STRBUF_INIT ;
526+ struct strbuf sum_size = STRBUF_INIT , sum_disk_size = STRBUF_INIT ;
527+ uint64_t lower = 0 ;
528+ uint64_t upper = bin_mask ;
529+
530+ table .table_name = title_caption ;
531+ strvec_pushl (& table .header , bucket_header , "Count" , "Size" , "Disk Size" , NULL );
532+
533+ for (size_t k = 0 ; k < bin_len ; k ++ ) {
534+ struct obj_hist_bin * p = bin + k ;
535+ uintmax_t lower_k = lower ;
536+ uintmax_t upper_k = upper ;
537+
538+ lower = upper + 1 ;
539+ upper = (upper << bin_shift ) + bin_mask ;
540+
541+ if (!p -> cnt_seen )
542+ continue ;
543+
544+ strbuf_reset (& bucket );
545+ strbuf_addf (& bucket , "%" PRIuMAX "..%" PRIuMAX , lower_k , upper_k );
546+
547+ strbuf_reset (& cnt_seen );
548+ strbuf_addf (& cnt_seen , "%" PRIuMAX , (uintmax_t )p -> cnt_seen );
549+
550+ strbuf_reset (& sum_size );
551+ strbuf_addf (& sum_size , "%" PRIuMAX , (uintmax_t )p -> sum_size );
552+
553+ strbuf_reset (& sum_disk_size );
554+ strbuf_addf (& sum_disk_size , "%" PRIuMAX , (uintmax_t )p -> sum_disk_size );
555+
556+ insert_table_rowv (& table , bucket .buf ,
557+ cnt_seen .buf , sum_size .buf , sum_disk_size .buf , NULL );
558+ }
559+ strbuf_release (& bucket );
560+ strbuf_release (& cnt_seen );
561+ strbuf_release (& sum_size );
562+ strbuf_release (& sum_disk_size );
563+
564+ print_table_plaintext (& table );
565+ clear_table (& table );
566+ }
567+
568+ static void survey_report_hbin (const char * title_caption ,
569+ struct obj_hist_bin * bin )
570+ {
571+ pretty_print_bin_table (title_caption ,
572+ "Byte Range" ,
573+ bin ,
574+ HBIN_LEN , HBIN_SHIFT , HBIN_MASK );
575+ }
576+
577+ static void survey_report_tree_lengths (struct survey_context * ctx )
578+ {
579+ pretty_print_bin_table (_ ("TREE HISTOGRAM BY NUMBER OF ENTRIES" ),
580+ "Entry Range" ,
581+ ctx -> report .reachable_objects .trees .entry_qbin ,
582+ QBIN_LEN , QBIN_SHIFT , QBIN_MASK );
583+ }
584+
585+ static void survey_report_commit_parents (struct survey_context * ctx )
586+ {
587+ struct survey_stats_commits * psc = & ctx -> report .reachable_objects .commits ;
588+ struct survey_table table = SURVEY_TABLE_INIT ;
589+ struct strbuf parents = STRBUF_INIT , counts = STRBUF_INIT ;
590+
591+ table .table_name = _ ("HISTOGRAM BY NUMBER OF COMMIT PARENTS" );
592+ strvec_pushl (& table .header , "Parents" , "Counts" , NULL );
593+
594+ for (int k = 0 ; k < PBIN_VEC_LEN ; k ++ )
595+ if (psc -> parent_cnt_pbin [k ]) {
596+ strbuf_reset (& parents );
597+ strbuf_addf (& parents , "%02d" , k );
598+
599+ strbuf_reset (& counts );
600+ strbuf_addf (& counts , "%14" PRIuMAX , (uintmax_t )psc -> parent_cnt_pbin [k ]);
601+
602+ insert_table_rowv (& table , parents .buf , counts .buf , NULL );
603+ }
604+ strbuf_release (& parents );
605+ strbuf_release (& counts );
606+
607+ print_table_plaintext (& table );
608+ clear_table (& table );
609+ }
610+
366611static void survey_report_plaintext_refs (struct survey_context * ctx )
367612{
368613 struct survey_report_ref_summary * refs = & ctx -> report .refs ;
@@ -515,6 +760,19 @@ static void survey_report_plaintext(struct survey_context *ctx)
515760 ctx -> report .by_type ,
516761 REPORT_TYPE_COUNT );
517762
763+ survey_report_commit_parents (ctx );
764+
765+ survey_report_hbin (_ ("COMMITS HISTOGRAM BY SIZE IN BYTES" ),
766+ ctx -> report .reachable_objects .commits .base .size_hbin );
767+
768+ survey_report_tree_lengths (ctx );
769+
770+ survey_report_hbin (_ ("TREES HISTOGRAM BY SIZE IN BYTES" ),
771+ ctx -> report .reachable_objects .trees .base .size_hbin );
772+
773+ survey_report_hbin (_ ("BLOBS HISTOGRAM BY SIZE IN BYTES" ),
774+ ctx -> report .reachable_objects .blobs .base .size_hbin );
775+
518776 survey_report_plaintext_sorted_size (
519777 & ctx -> report .top_paths_by_count [REPORT_TYPE_TREE ]);
520778 survey_report_plaintext_sorted_size (
@@ -784,6 +1042,8 @@ static void increment_totals(struct survey_context *ctx,
7841042 unsigned long object_length = 0 ;
7851043 off_t disk_sizep = 0 ;
7861044 enum object_type type ;
1045+ struct survey_stats_base_object * base ;
1046+ int hb ;
7871047
7881048 oi .typep = & type ;
7891049 oi .sizep = & object_length ;
@@ -793,11 +1053,81 @@ static void increment_totals(struct survey_context *ctx,
7931053 & oids -> oid [i ],
7941054 & oi , oi_flags ) < 0 ) {
7951055 summary -> num_missing ++ ;
796- } else {
797- summary -> nr ++ ;
798- summary -> disk_size += disk_sizep ;
799- summary -> inflated_size += object_length ;
1056+ continue ;
1057+ }
1058+
1059+ summary -> nr ++ ;
1060+ summary -> disk_size += disk_sizep ;
1061+ summary -> inflated_size += object_length ;
1062+
1063+ switch (type ) {
1064+ case OBJ_COMMIT : {
1065+ struct commit * commit = lookup_commit (ctx -> repo , & oids -> oid [i ]);
1066+ unsigned k = commit_list_count (commit -> parents );
1067+
1068+ if (k >= PBIN_VEC_LEN )
1069+ k = PBIN_VEC_LEN - 1 ;
1070+
1071+ ctx -> report .reachable_objects .commits .parent_cnt_pbin [k ]++ ;
1072+ base = & ctx -> report .reachable_objects .commits .base ;
1073+ break ;
8001074 }
1075+ case OBJ_TREE : {
1076+ struct tree * tree = lookup_tree (ctx -> repo , & oids -> oid [i ]);
1077+ if (tree ) {
1078+ struct survey_stats_trees * pst = & ctx -> report .reachable_objects .trees ;
1079+ struct tree_desc desc ;
1080+ struct name_entry entry ;
1081+ uint64_t nr_entries ;
1082+ int qb ;
1083+
1084+ parse_tree (tree );
1085+ init_tree_desc (& desc , & oids -> oid [i ], tree -> buffer , tree -> size );
1086+ nr_entries = 0 ;
1087+ while (tree_entry (& desc , & entry ))
1088+ nr_entries ++ ;
1089+
1090+ pst -> sum_entries += nr_entries ;
1091+
1092+ if (nr_entries > pst -> max_entries )
1093+ pst -> max_entries = nr_entries ;
1094+
1095+ qb = qbin (nr_entries );
1096+ incr_obj_hist_bin (& pst -> entry_qbin [qb ], object_length , disk_sizep );
1097+ }
1098+ base = & ctx -> report .reachable_objects .trees .base ;
1099+ break ;
1100+ }
1101+ case OBJ_BLOB :
1102+ base = & ctx -> report .reachable_objects .blobs .base ;
1103+ break ;
1104+ default :
1105+ continue ;
1106+ }
1107+
1108+ switch (oi .whence ) {
1109+ case OI_CACHED :
1110+ base -> cnt_cached ++ ;
1111+ break ;
1112+ case OI_LOOSE :
1113+ base -> cnt_loose ++ ;
1114+ break ;
1115+ case OI_PACKED :
1116+ base -> cnt_packed ++ ;
1117+ break ;
1118+ case OI_DBCACHED :
1119+ base -> cnt_dbcached ++ ;
1120+ break ;
1121+ default :
1122+ break ;
1123+ }
1124+
1125+ base -> sum_size += object_length ;
1126+ base -> sum_disk_size += disk_sizep ;
1127+
1128+ hb = hbin (object_length );
1129+ incr_obj_hist_bin (& base -> size_hbin [hb ], object_length , disk_sizep );
1130+
8011131 }
8021132}
8031133
0 commit comments