1616#include "strbuf.h"
1717#include "strvec.h"
1818#include "trace2.h"
19+ #include "tree.h"
20+ #include "tree-walk.h"
1921
2022static const char * const survey_usage [] = {
2123 N_ ("(EXPERIMENTAL!) git survey <options>" ),
@@ -69,11 +71,162 @@ struct survey_report_ref_summary {
6971 size_t len_sum_remote_refnames ;
7072};
7173
/*
 * HBIN -- hex binning (histogram bucketing).
 *
 * Histograms are kept for various counts and sums.  The values span a
 * very wide range (object sizes run from 1 byte to 4G bytes), so
 * linear buckets would not be useful.  Instead, let's bucket by
 * log16(): bucket 0 holds 0..15, bucket 1 holds 16..255, and so on,
 * one bucket per hex digit.  This spreads out the low and middle of
 * the range and keeps the high end coarse.
 *
 * The reasoning is that n 1GB blobs and a mix of n/2 1GB blobs plus
 * n/2 1.5GB blobs are the same kind of scaling problem, and that is
 * what we want to report on.
 */
#define HBIN_LEN (sizeof(unsigned long) * 2)
#define HBIN_MASK (0xF)
#define HBIN_SHIFT (4)

/*
 * Return the index of the hex bucket that `value` falls into, i.e. the
 * number of hex digits that must be shifted out before only the lowest
 * digit remains.  The result is always less than HBIN_LEN because an
 * unsigned long has exactly HBIN_LEN hex digits.
 */
static int hbin(unsigned long value)
{
	int bucket = 0;

	while (value & ~(unsigned long)HBIN_MASK) {
		value >>= HBIN_SHIFT;
		bucket++;
	}

	return bucket;
}
101+
/*
 * QBIN -- base4 binning (histogram bucketing).
 *
 * Same scheme as the hex binning, but with one bucket per base-4
 * digit: bucket 0 holds 0..3, bucket 1 holds 4..15, and so on.  This
 * gives finer granularity at the low end for quantities where we do
 * not expect as many large values.
 */
#define QBIN_LEN (sizeof(unsigned long) * 4)
#define QBIN_MASK (0x3)
#define QBIN_SHIFT (2)

/*
 * Return the index of the base-4 bucket that `value` falls into, i.e.
 * the number of base-4 digits that must be shifted out before only the
 * lowest digit remains.  The result is always less than QBIN_LEN
 * because an unsigned long has exactly QBIN_LEN base-4 digits.
 */
static int qbin(unsigned long value)
{
	int bucket = 0;

	while (value & ~(unsigned long)QBIN_MASK) {
		value >>= QBIN_SHIFT;
		bucket++;
	}

	return bucket;
}
122+
/*
 * One bucket of an object histogram.  Each object that lands in the
 * bucket contributes to all three fields.
 */
struct obj_hist_bin {
	uint64_t sum_size;      /* total object_size of the objects in this bin */
	uint64_t sum_disk_size; /* total on_disk_size of the objects in this bin */
	uint32_t cnt_seen;      /* how many objects were put in this bin */
};

/*
 * Account for one more object in `pbin`: bump the object count and add
 * its length and on-disk size to the running sums.
 */
static void incr_obj_hist_bin(struct obj_hist_bin *pbin,
			      unsigned long object_length,
			      off_t disk_sizep)
{
	pbin->cnt_seen += 1;
	pbin->sum_size = pbin->sum_size + object_length;
	pbin->sum_disk_size = pbin->sum_disk_size + disk_sizep;
}
140+
141+ /*
142+ * Common fields for any type of object.
143+ */
144+ struct survey_stats_base_object {
145+ uint32_t cnt_seen ;
146+
147+ uint32_t cnt_missing ; /* we may have a partial clone. */
148+
149+ /*
150+ * Number of objects grouped by where they are stored on disk.
151+ * This is a function of how the ODB is packed.
152+ */
153+ uint32_t cnt_cached ; /* see oi.whence */
154+ uint32_t cnt_loose ; /* see oi.whence */
155+ uint32_t cnt_packed ; /* see oi.whence */
156+ uint32_t cnt_dbcached ; /* see oi.whence */
157+
158+ uint64_t sum_size ; /* sum(object_size) */
159+ uint64_t sum_disk_size ; /* sum(disk_size) */
160+
161+ /*
162+ * A histogram of the count of objects, the observed size, and
163+ * the on-disk size grouped by the observed size.
164+ */
165+ struct obj_hist_bin size_hbin [HBIN_LEN ];
166+ };
167+
168+ /*
169+ * PBIN -- parent vector binning (histogram bucketing).
170+ *
171+ * We create a histogram based upon the number of parents
172+ * in a commit. This is a simple linear vector. It starts
173+ * at zero for "initial" commits.
174+ *
175+ * If a commit has more parents, just put it in the last bin.
176+ */
177+ #define PBIN_VEC_LEN (32)
178+
179+ struct survey_stats_commits {
180+ struct survey_stats_base_object base ;
181+
182+ /*
183+ * Count of commits with k parents.
184+ */
185+ uint32_t parent_cnt_pbin [PBIN_VEC_LEN ];
186+ };
187+
188+ /*
189+ * Stats for reachable trees.
190+ */
191+ struct survey_stats_trees {
192+ struct survey_stats_base_object base ;
193+
194+ /*
195+ * In the following, nr_entries refers to the number of files or
196+ * subdirectories in a tree. We are interested in how wide the
197+ * tree is and if the repo has gigantic directories.
198+ */
199+ uint64_t max_entries ; /* max(nr_entries) -- the width of the largest tree */
200+
201+ /*
202+ * Computing the sum of the number of entries across all trees
203+ * is probably not that interesting.
204+ */
205+ uint64_t sum_entries ; /* sum(nr_entries) -- sum across all trees */
206+
207+ /*
208+ * A histogram of the count of trees, the observed size, and
209+ * the on-disk size grouped by the number of entries in the tree.
210+ */
211+ struct obj_hist_bin entry_qbin [QBIN_LEN ];
212+ };
213+
214+ /*
215+ * Stats for reachable blobs.
216+ */
217+ struct survey_stats_blobs {
218+ struct survey_stats_base_object base ;
219+ };
220+
72221struct survey_report_object_summary {
73222 size_t commits_nr ;
74223 size_t tags_nr ;
75224 size_t trees_nr ;
76225 size_t blobs_nr ;
226+
227+ struct survey_stats_commits commits ;
228+ struct survey_stats_trees trees ;
229+ struct survey_stats_blobs blobs ;
77230};
78231
79232/**
@@ -363,6 +516,98 @@ static void print_table_plaintext(struct survey_table *table)
363516 free (column_widths );
364517}
365518
519+ static void pretty_print_bin_table (const char * title_caption ,
520+ const char * bucket_header ,
521+ struct obj_hist_bin * bin ,
522+ uint64_t bin_len , int bin_shift , uint64_t bin_mask )
523+ {
524+ struct survey_table table = SURVEY_TABLE_INIT ;
525+ struct strbuf bucket = STRBUF_INIT , cnt_seen = STRBUF_INIT ;
526+ struct strbuf sum_size = STRBUF_INIT , sum_disk_size = STRBUF_INIT ;
527+ uint64_t lower = 0 ;
528+ uint64_t upper = bin_mask ;
529+
530+ table .table_name = title_caption ;
531+ strvec_pushl (& table .header , bucket_header , "Count" , "Size" , "Disk Size" , NULL );
532+
533+ for (size_t k = 0 ; k < bin_len ; k ++ ) {
534+ struct obj_hist_bin * p = bin + k ;
535+ uintmax_t lower_k = lower ;
536+ uintmax_t upper_k = upper ;
537+
538+ lower = upper + 1 ;
539+ upper = (upper << bin_shift ) + bin_mask ;
540+
541+ if (!p -> cnt_seen )
542+ continue ;
543+
544+ strbuf_reset (& bucket );
545+ strbuf_addf (& bucket , "%" PRIuMAX "..%" PRIuMAX , lower_k , upper_k );
546+
547+ strbuf_reset (& cnt_seen );
548+ strbuf_addf (& cnt_seen , "%" PRIuMAX , (uintmax_t )p -> cnt_seen );
549+
550+ strbuf_reset (& sum_size );
551+ strbuf_addf (& sum_size , "%" PRIuMAX , (uintmax_t )p -> sum_size );
552+
553+ strbuf_reset (& sum_disk_size );
554+ strbuf_addf (& sum_disk_size , "%" PRIuMAX , (uintmax_t )p -> sum_disk_size );
555+
556+ insert_table_rowv (& table , bucket .buf ,
557+ cnt_seen .buf , sum_size .buf , sum_disk_size .buf , NULL );
558+ }
559+ strbuf_release (& bucket );
560+ strbuf_release (& cnt_seen );
561+ strbuf_release (& sum_size );
562+ strbuf_release (& sum_disk_size );
563+
564+ print_table_plaintext (& table );
565+ clear_table (& table );
566+ }
567+
568+ static void survey_report_hbin (const char * title_caption ,
569+ struct obj_hist_bin * bin )
570+ {
571+ pretty_print_bin_table (title_caption ,
572+ "Byte Range" ,
573+ bin ,
574+ HBIN_LEN , HBIN_SHIFT , HBIN_MASK );
575+ }
576+
577+ static void survey_report_tree_lengths (struct survey_context * ctx )
578+ {
579+ pretty_print_bin_table (_ ("TREE HISTOGRAM BY NUMBER OF ENTRIES" ),
580+ "Entry Range" ,
581+ ctx -> report .reachable_objects .trees .entry_qbin ,
582+ QBIN_LEN , QBIN_SHIFT , QBIN_MASK );
583+ }
584+
585+ static void survey_report_commit_parents (struct survey_context * ctx )
586+ {
587+ struct survey_stats_commits * psc = & ctx -> report .reachable_objects .commits ;
588+ struct survey_table table = SURVEY_TABLE_INIT ;
589+ struct strbuf parents = STRBUF_INIT , counts = STRBUF_INIT ;
590+
591+ table .table_name = _ ("HISTOGRAM BY NUMBER OF COMMIT PARENTS" );
592+ strvec_pushl (& table .header , "Parents" , "Counts" , NULL );
593+
594+ for (int k = 0 ; k < PBIN_VEC_LEN ; k ++ )
595+ if (psc -> parent_cnt_pbin [k ]) {
596+ strbuf_reset (& parents );
597+ strbuf_addf (& parents , "%02d" , k );
598+
599+ strbuf_reset (& counts );
600+ strbuf_addf (& counts , "%14" PRIuMAX , (uintmax_t )psc -> parent_cnt_pbin [k ]);
601+
602+ insert_table_rowv (& table , parents .buf , counts .buf , NULL );
603+ }
604+ strbuf_release (& parents );
605+ strbuf_release (& counts );
606+
607+ print_table_plaintext (& table );
608+ clear_table (& table );
609+ }
610+
366611static void survey_report_plaintext_refs (struct survey_context * ctx )
367612{
368613 struct survey_report_ref_summary * refs = & ctx -> report .refs ;
@@ -515,6 +760,19 @@ static void survey_report_plaintext(struct survey_context *ctx)
515760 ctx -> report .by_type ,
516761 REPORT_TYPE_COUNT );
517762
763+ survey_report_commit_parents (ctx );
764+
765+ survey_report_hbin (_ ("COMMITS HISTOGRAM BY SIZE IN BYTES" ),
766+ ctx -> report .reachable_objects .commits .base .size_hbin );
767+
768+ survey_report_tree_lengths (ctx );
769+
770+ survey_report_hbin (_ ("TREES HISTOGRAM BY SIZE IN BYTES" ),
771+ ctx -> report .reachable_objects .trees .base .size_hbin );
772+
773+ survey_report_hbin (_ ("BLOBS HISTOGRAM BY SIZE IN BYTES" ),
774+ ctx -> report .reachable_objects .blobs .base .size_hbin );
775+
518776 survey_report_plaintext_sorted_size (
519777 & ctx -> report .top_paths_by_count [REPORT_TYPE_TREE ]);
520778 survey_report_plaintext_sorted_size (
@@ -783,6 +1041,8 @@ static void increment_totals(struct survey_context *ctx,
7831041 unsigned long object_length = 0 ;
7841042 off_t disk_sizep = 0 ;
7851043 enum object_type type ;
1044+ struct survey_stats_base_object * base ;
1045+ int hb ;
7861046
7871047 oi .typep = & type ;
7881048 oi .sizep = & object_length ;
@@ -791,11 +1051,81 @@ static void increment_totals(struct survey_context *ctx,
7911051 if (oid_object_info_extended (ctx -> repo , & oids -> oid [i ],
7921052 & oi , oi_flags ) < 0 ) {
7931053 summary -> num_missing ++ ;
794- } else {
795- summary -> nr ++ ;
796- summary -> disk_size += disk_sizep ;
797- summary -> inflated_size += object_length ;
1054+ continue ;
1055+ }
1056+
1057+ summary -> nr ++ ;
1058+ summary -> disk_size += disk_sizep ;
1059+ summary -> inflated_size += object_length ;
1060+
1061+ switch (type ) {
1062+ case OBJ_COMMIT : {
1063+ struct commit * commit = lookup_commit (ctx -> repo , & oids -> oid [i ]);
1064+ unsigned k = commit_list_count (commit -> parents );
1065+
1066+ if (k >= PBIN_VEC_LEN )
1067+ k = PBIN_VEC_LEN - 1 ;
1068+
1069+ ctx -> report .reachable_objects .commits .parent_cnt_pbin [k ]++ ;
1070+ base = & ctx -> report .reachable_objects .commits .base ;
1071+ break ;
7981072 }
1073+ case OBJ_TREE : {
1074+ struct tree * tree = lookup_tree (ctx -> repo , & oids -> oid [i ]);
1075+ if (tree ) {
1076+ struct survey_stats_trees * pst = & ctx -> report .reachable_objects .trees ;
1077+ struct tree_desc desc ;
1078+ struct name_entry entry ;
1079+ int nr_entries ;
1080+ int qb ;
1081+
1082+ parse_tree (tree );
1083+ init_tree_desc (& desc , & oids -> oid [i ], tree -> buffer , tree -> size );
1084+ nr_entries = 0 ;
1085+ while (tree_entry (& desc , & entry ))
1086+ nr_entries ++ ;
1087+
1088+ pst -> sum_entries += nr_entries ;
1089+
1090+ if (nr_entries > pst -> max_entries )
1091+ pst -> max_entries = nr_entries ;
1092+
1093+ qb = qbin (nr_entries );
1094+ incr_obj_hist_bin (& pst -> entry_qbin [qb ], object_length , disk_sizep );
1095+ }
1096+ base = & ctx -> report .reachable_objects .trees .base ;
1097+ break ;
1098+ }
1099+ case OBJ_BLOB :
1100+ base = & ctx -> report .reachable_objects .blobs .base ;
1101+ break ;
1102+ default :
1103+ continue ;
1104+ }
1105+
1106+ switch (oi .whence ) {
1107+ case OI_CACHED :
1108+ base -> cnt_cached ++ ;
1109+ break ;
1110+ case OI_LOOSE :
1111+ base -> cnt_loose ++ ;
1112+ break ;
1113+ case OI_PACKED :
1114+ base -> cnt_packed ++ ;
1115+ break ;
1116+ case OI_DBCACHED :
1117+ base -> cnt_dbcached ++ ;
1118+ break ;
1119+ default :
1120+ break ;
1121+ }
1122+
1123+ base -> sum_size += object_length ;
1124+ base -> sum_disk_size += disk_sizep ;
1125+
1126+ hb = hbin (object_length );
1127+ incr_obj_hist_bin (& base -> size_hbin [hb ], object_length , disk_sizep );
1128+
7991129 }
8001130}
8011131
0 commit comments