Skip to content

Commit 7c5095d

Browse files
derrickstolee authored and dscho committed
survey: add report of "largest" paths
Since we are already walking our reachable objects using the path-walk API, let's now collect lists of the paths that contribute most to different metrics. Specifically, we care about

 * Number of versions.
 * Total size on disk.
 * Total inflated size (no delta or zlib compression).

This information can be critical to discovering which parts of the repository are causing the most growth, especially on-disk size. Different packing strategies might help compress data more efficiently, but the total inflated size is a representation of the raw size of all snapshots of those paths. Even when stored efficiently on disk, that size represents how much information must be processed to complete a command such as 'git blame'.

The exact disk size seems to be not quite robust enough for testing, as could be seen by the `linux-musl-meson` job consistently failing, possibly because zlib-ng deflates differently: t8100.4 (git survey (default)) was failing with a symptom like this:

    TOTAL OBJECT SIZES BY TYPE
    ===============================================
    Object Type | Count | Disk Size | Inflated Size
    ------------+-------+-----------+--------------
   -Commits     |    10 |      1523 |          2153
   +Commits     |    10 |      1528 |          2153
    Trees       |    10 |       495 |          1706
    Blobs       |    10 |       191 |           101
   -Tags        |     4 |       510 |           528
   +Tags        |     4 |       547 |           528

This means: the disk size is unlikely to be something we can verify robustly. Since zlib-ng seems to increase the disk size of the tags from 510 to 547, above their inflated size of 528, we cannot even assume that the disk size is always smaller than the inflated size. We will most likely want to either skip verifying the disk size altogether, or go for some kind of fuzzy matching, say, by replacing `s/ 1[45][0-9][0-9] / ~1.5k /` and `s/ [45][0-9][0-9] / ~½k /` or something like that.

Signed-off-by: Derrick Stolee <stolee@gmail.com>
Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de>
1 parent 036fcf7 commit 7c5095d

File tree

2 files changed

+82
-9
lines changed

2 files changed

+82
-9
lines changed

builtin/survey.c

Lines changed: 71 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -75,7 +75,6 @@ struct survey_report_object_size_summary {
7575

7676
typedef int (*survey_top_cmp)(void *v1, void *v2);
7777

78-
MAYBE_UNUSED
7978
static int cmp_by_nr(void *v1, void *v2)
8079
{
8180
struct survey_report_object_size_summary *s1 = v1;
@@ -88,7 +87,6 @@ static int cmp_by_nr(void *v1, void *v2)
8887
return 0;
8988
}
9089

91-
MAYBE_UNUSED
9290
static int cmp_by_disk_size(void *v1, void *v2)
9391
{
9492
struct survey_report_object_size_summary *s1 = v1;
@@ -101,7 +99,6 @@ static int cmp_by_disk_size(void *v1, void *v2)
10199
return 0;
102100
}
103101

104-
MAYBE_UNUSED
105102
static int cmp_by_inflated_size(void *v1, void *v2)
106103
{
107104
struct survey_report_object_size_summary *s1 = v1;
@@ -132,7 +129,6 @@ struct survey_report_top_table {
132129
void *data;
133130
};
134131

135-
MAYBE_UNUSED
136132
static void init_top_sizes(struct survey_report_top_table *top,
137133
size_t limit, const char *name,
138134
survey_top_cmp cmp)
@@ -158,7 +154,6 @@ static void clear_top_sizes(struct survey_report_top_table *top)
158154
free(sz_array);
159155
}
160156

161-
MAYBE_UNUSED
162157
static void maybe_insert_into_top_size(struct survey_report_top_table *top,
163158
struct survey_report_object_size_summary *summary)
164159
{
@@ -195,6 +190,10 @@ struct survey_report {
195190
struct survey_report_object_summary reachable_objects;
196191

197192
struct survey_report_object_size_summary *by_type;
193+
194+
struct survey_report_top_table *top_paths_by_count;
195+
struct survey_report_top_table *top_paths_by_disk;
196+
struct survey_report_top_table *top_paths_by_inflate;
198197
};
199198

200199
#define REPORT_TYPE_COMMIT 0
@@ -446,6 +445,13 @@ static void survey_report_object_sizes(const char *title,
446445
clear_table(&table);
447446
}
448447

448+
static void survey_report_plaintext_sorted_size(
449+
struct survey_report_top_table *top)
450+
{
451+
survey_report_object_sizes(top->name, _("Path"),
452+
top->data, top->nr);
453+
}
454+
449455
static void survey_report_plaintext(struct survey_context *ctx)
450456
{
451457
printf("GIT SURVEY for \"%s\"\n", ctx->repo->worktree);
@@ -456,6 +462,21 @@ static void survey_report_plaintext(struct survey_context *ctx)
456462
_("Object Type"),
457463
ctx->report.by_type,
458464
REPORT_TYPE_COUNT);
465+
466+
survey_report_plaintext_sorted_size(
467+
&ctx->report.top_paths_by_count[REPORT_TYPE_TREE]);
468+
survey_report_plaintext_sorted_size(
469+
&ctx->report.top_paths_by_count[REPORT_TYPE_BLOB]);
470+
471+
survey_report_plaintext_sorted_size(
472+
&ctx->report.top_paths_by_disk[REPORT_TYPE_TREE]);
473+
survey_report_plaintext_sorted_size(
474+
&ctx->report.top_paths_by_disk[REPORT_TYPE_BLOB]);
475+
476+
survey_report_plaintext_sorted_size(
477+
&ctx->report.top_paths_by_inflate[REPORT_TYPE_TREE]);
478+
survey_report_plaintext_sorted_size(
479+
&ctx->report.top_paths_by_inflate[REPORT_TYPE_BLOB]);
459480
}
460481

461482
/*
@@ -698,7 +719,8 @@ static void increment_totals(struct survey_context *ctx,
698719

699720
static void increment_object_totals(struct survey_context *ctx,
700721
struct oid_array *oids,
701-
enum object_type type)
722+
enum object_type type,
723+
const char *path)
702724
{
703725
struct survey_report_object_size_summary *total;
704726
struct survey_report_object_size_summary summary = { 0 };
@@ -730,9 +752,30 @@ static void increment_object_totals(struct survey_context *ctx,
730752
total->disk_size += summary.disk_size;
731753
total->inflated_size += summary.inflated_size;
732754
total->num_missing += summary.num_missing;
755+
756+
if (type == OBJ_TREE || type == OBJ_BLOB) {
757+
int index = type == OBJ_TREE ?
758+
REPORT_TYPE_TREE : REPORT_TYPE_BLOB;
759+
struct survey_report_top_table *top;
760+
761+
/*
762+
* Temporarily store (const char *) here, but it will
763+
* be duped if inserted and will not be freed.
764+
*/
765+
summary.label = (char *)path;
766+
767+
top = ctx->report.top_paths_by_count;
768+
maybe_insert_into_top_size(&top[index], &summary);
769+
770+
top = ctx->report.top_paths_by_disk;
771+
maybe_insert_into_top_size(&top[index], &summary);
772+
773+
top = ctx->report.top_paths_by_inflate;
774+
maybe_insert_into_top_size(&top[index], &summary);
775+
}
733776
}
734777

735-
static int survey_objects_path_walk_fn(const char *path UNUSED,
778+
static int survey_objects_path_walk_fn(const char *path,
736779
struct oid_array *oids,
737780
enum object_type type,
738781
void *data)
@@ -741,7 +784,7 @@ static int survey_objects_path_walk_fn(const char *path UNUSED,
741784

742785
increment_object_counts(&ctx->report.reachable_objects,
743786
type, oids->nr);
744-
increment_object_totals(ctx, oids, type);
787+
increment_object_totals(ctx, oids, type, path);
745788

746789
ctx->progress_nr += oids->nr;
747790
display_progress(ctx->progress, ctx->progress_nr);
@@ -751,11 +794,31 @@ static int survey_objects_path_walk_fn(const char *path UNUSED,
751794

752795
static void initialize_report(struct survey_context *ctx)
753796
{
797+
const int top_limit = 100;
798+
754799
CALLOC_ARRAY(ctx->report.by_type, REPORT_TYPE_COUNT);
755800
ctx->report.by_type[REPORT_TYPE_COMMIT].label = xstrdup(_("Commits"));
756801
ctx->report.by_type[REPORT_TYPE_TREE].label = xstrdup(_("Trees"));
757802
ctx->report.by_type[REPORT_TYPE_BLOB].label = xstrdup(_("Blobs"));
758803
ctx->report.by_type[REPORT_TYPE_TAG].label = xstrdup(_("Tags"));
804+
805+
CALLOC_ARRAY(ctx->report.top_paths_by_count, REPORT_TYPE_COUNT);
806+
init_top_sizes(&ctx->report.top_paths_by_count[REPORT_TYPE_TREE],
807+
top_limit, _("TOP DIRECTORIES BY COUNT"), cmp_by_nr);
808+
init_top_sizes(&ctx->report.top_paths_by_count[REPORT_TYPE_BLOB],
809+
top_limit, _("TOP FILES BY COUNT"), cmp_by_nr);
810+
811+
CALLOC_ARRAY(ctx->report.top_paths_by_disk, REPORT_TYPE_COUNT);
812+
init_top_sizes(&ctx->report.top_paths_by_disk[REPORT_TYPE_TREE],
813+
top_limit, _("TOP DIRECTORIES BY DISK SIZE"), cmp_by_disk_size);
814+
init_top_sizes(&ctx->report.top_paths_by_disk[REPORT_TYPE_BLOB],
815+
top_limit, _("TOP FILES BY DISK SIZE"), cmp_by_disk_size);
816+
817+
CALLOC_ARRAY(ctx->report.top_paths_by_inflate, REPORT_TYPE_COUNT);
818+
init_top_sizes(&ctx->report.top_paths_by_inflate[REPORT_TYPE_TREE],
819+
top_limit, _("TOP DIRECTORIES BY INFLATED SIZE"), cmp_by_inflated_size);
820+
init_top_sizes(&ctx->report.top_paths_by_inflate[REPORT_TYPE_BLOB],
821+
top_limit, _("TOP FILES BY INFLATED SIZE"), cmp_by_inflated_size);
759822
}
760823

761824
static void survey_phase_objects(struct survey_context *ctx)

t/t8100-git-survey.sh

Lines changed: 11 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -92,7 +92,17 @@ test_expect_success 'git survey (default)' '
9292
EOF
9393
9494
approximate_sizes out >out-edited &&
95-
test_cmp expect out-edited
95+
lines=$(wc -l <expect) &&
96+
head -n "$lines" <out-edited >out-trimmed &&
97+
test_cmp expect out-trimmed &&
98+
99+
for type in "DIRECTORIES" "FILES"
100+
do
101+
for metric in "COUNT" "DISK SIZE" "INFLATED SIZE"
102+
do
103+
grep "TOP $type BY $metric" out || return 1
104+
done || return 1
105+
done
96106
'
97107

98108
test_done

0 commit comments

Comments
 (0)