Skip to content

Commit 11bce2c

Browse files
committed
pack-objects: create new name-hash algorithm (#5157)
This is an updated version of gitgitgadget#1785, intended for early consumption into Git for Windows. The idea here is to add a new `--full-name-hash` option to `git pack-objects` and `git repack`. This adjusts the name-hash value used for finding delta bases so that it is computed from the full path name, with a lower likelihood of collisions than the default name-hash algorithm. In many repositories with name-hash collisions and many versions of those paths, this can significantly reduce the size of a full repack. It can also help in certain cases of `git push`, but only if the pack is already artificially inflated by name-hash collisions; cases that find "sibling" deltas as better choices become worse with `--full-name-hash`. Thus, this option is currently recommended for full repacks of large repos, and on client machines without reachability bitmaps. Some care is taken to ignore this option when using bitmaps, either writing bitmaps or using a bitmap walk during reads. The bitmap file format contains name-hash values, but no way to indicate which function is used, so compatibility is a concern for bitmaps. Future work could explore this idea. After this PR is merged, the more-involved `--path-walk` option may be considered.
2 parents 7f0cdae + 131c260 commit 11bce2c

22 files changed

+311
-13
lines changed

Documentation/git-pack-objects.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@ SYNOPSIS
1515
[--revs [--unpacked | --all]] [--keep-pack=<pack-name>]
1616
[--cruft] [--cruft-expiration=<time>]
1717
[--stdout [--filter=<filter-spec>] | <base-name>]
18-
[--shallow] [--keep-true-parents] [--[no-]sparse] < <object-list>
18+
[--shallow] [--keep-true-parents] [--[no-]sparse]
19+
[--full-name-hash] < <object-list>
1920

2021

2122
DESCRIPTION

Documentation/git-repack.txt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@ git-repack - Pack unpacked objects in a repository
99
SYNOPSIS
1010
--------
1111
[verse]
12-
'git repack' [-a] [-A] [-d] [-f] [-F] [-l] [-n] [-q] [-b] [-m] [--window=<n>] [--depth=<n>] [--threads=<n>] [--keep-pack=<pack-name>] [--write-midx]
12+
'git repack' [-a] [-A] [-d] [-f] [-F] [-l] [-n] [-q] [-b] [-m]
13+
[--window=<n>] [--depth=<n>] [--threads=<n>] [--keep-pack=<pack-name>]
14+
[--write-midx] [--full-name-hash]
1315

1416
DESCRIPTION
1517
-----------

Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -817,6 +817,7 @@ TEST_BUILTINS_OBJS += test-lazy-init-name-hash.o
817817
TEST_BUILTINS_OBJS += test-match-trees.o
818818
TEST_BUILTINS_OBJS += test-mergesort.o
819819
TEST_BUILTINS_OBJS += test-mktemp.o
820+
TEST_BUILTINS_OBJS += test-name-hash.o
820821
TEST_BUILTINS_OBJS += test-online-cpus.o
821822
TEST_BUILTINS_OBJS += test-pack-mtimes.o
822823
TEST_BUILTINS_OBJS += test-parse-options.o

builtin/pack-objects.c

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -269,6 +269,14 @@ struct configured_exclusion {
269269
static struct oidmap configured_exclusions;
270270

271271
static struct oidset excluded_by_config;
272+
static int use_full_name_hash = -1;
273+
274+
static inline uint32_t pack_name_hash_fn(const char *name)
275+
{
276+
if (use_full_name_hash)
277+
return pack_full_name_hash(name);
278+
return pack_name_hash(name);
279+
}
272280

273281
/*
274282
* stats
@@ -1686,7 +1694,7 @@ static int add_object_entry(const struct object_id *oid, enum object_type type,
16861694
return 0;
16871695
}
16881696

1689-
create_object_entry(oid, type, pack_name_hash(name),
1697+
create_object_entry(oid, type, pack_name_hash_fn(name),
16901698
exclude, name && no_try_delta(name),
16911699
found_pack, found_offset);
16921700
return 1;
@@ -1900,7 +1908,7 @@ static void add_preferred_base_object(const char *name)
19001908
{
19011909
struct pbase_tree *it;
19021910
size_t cmplen;
1903-
unsigned hash = pack_name_hash(name);
1911+
unsigned hash = pack_name_hash_fn(name);
19041912

19051913
if (!num_preferred_base || check_pbase_path(hash))
19061914
return;
@@ -3410,7 +3418,7 @@ static void show_object_pack_hint(struct object *object, const char *name,
34103418
* here using a now in order to perhaps improve the delta selection
34113419
* process.
34123420
*/
3413-
oe->hash = pack_name_hash(name);
3421+
oe->hash = pack_name_hash_fn(name);
34143422
oe->no_try_delta = name && no_try_delta(name);
34153423

34163424
stdin_packs_hints_nr++;
@@ -3560,7 +3568,7 @@ static void add_cruft_object_entry(const struct object_id *oid, enum object_type
35603568
entry = packlist_find(&to_pack, oid);
35613569
if (entry) {
35623570
if (name) {
3563-
entry->hash = pack_name_hash(name);
3571+
entry->hash = pack_name_hash_fn(name);
35643572
entry->no_try_delta = no_try_delta(name);
35653573
}
35663574
} else {
@@ -3583,7 +3591,7 @@ static void add_cruft_object_entry(const struct object_id *oid, enum object_type
35833591
return;
35843592
}
35853593

3586-
entry = create_object_entry(oid, type, pack_name_hash(name),
3594+
entry = create_object_entry(oid, type, pack_name_hash_fn(name),
35873595
0, name && no_try_delta(name),
35883596
pack, offset);
35893597
}
@@ -4435,6 +4443,8 @@ int cmd_pack_objects(int argc,
44354443
OPT_STRING_LIST(0, "uri-protocol", &uri_protocols,
44364444
N_("protocol"),
44374445
N_("exclude any configured uploadpack.blobpackfileuri with this protocol")),
4446+
OPT_BOOL(0, "full-name-hash", &use_full_name_hash,
4447+
N_("(EXPERIMENTAL!) optimize delta compression across identical path names over time")),
44384448
OPT_END(),
44394449
};
44404450

@@ -4590,6 +4600,11 @@ int cmd_pack_objects(int argc,
45904600
if (pack_to_stdout || !rev_list_all)
45914601
write_bitmap_index = 0;
45924602

4603+
if (write_bitmap_index && use_full_name_hash > 0)
4604+
die(_("currently, the --full-name-hash option is incompatible with --write-bitmap-index"));
4605+
if (use_full_name_hash < 0)
4606+
use_full_name_hash = git_env_bool("GIT_TEST_FULL_NAME_HASH", 0);
4607+
45934608
if (use_delta_islands)
45944609
strvec_push(&rp, "--topo-order");
45954610

builtin/repack.c

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,9 @@ static int run_update_server_info = 1;
4141
static char *packdir, *packtmp_name, *packtmp;
4242

4343
static const char *const git_repack_usage[] = {
44-
N_("git repack [<options>]"),
44+
N_("git repack [-a] [-A] [-d] [-f] [-F] [-l] [-n] [-q] [-b] [-m]\n"
45+
"[--window=<n>] [--depth=<n>] [--threads=<n>] [--keep-pack=<pack-name>]\n"
46+
"[--write-midx] [--full-name-hash]"),
4547
NULL
4648
};
4749

@@ -60,6 +62,7 @@ struct pack_objects_args {
6062
int no_reuse_object;
6163
int quiet;
6264
int local;
65+
int full_name_hash;
6366
struct list_objects_filter_options filter_options;
6467
};
6568

@@ -308,6 +311,8 @@ static void prepare_pack_objects(struct child_process *cmd,
308311
strvec_pushf(&cmd->args, "--no-reuse-delta");
309312
if (args->no_reuse_object)
310313
strvec_pushf(&cmd->args, "--no-reuse-object");
314+
if (args->full_name_hash)
315+
strvec_pushf(&cmd->args, "--full-name-hash");
311316
if (args->local)
312317
strvec_push(&cmd->args, "--local");
313318
if (args->quiet)
@@ -1205,6 +1210,8 @@ int cmd_repack(int argc,
12051210
N_("pass --no-reuse-delta to git-pack-objects")),
12061211
OPT_BOOL('F', NULL, &po_args.no_reuse_object,
12071212
N_("pass --no-reuse-object to git-pack-objects")),
1213+
OPT_BOOL(0, "full-name-hash", &po_args.full_name_hash,
1214+
N_("(EXPERIMENTAL!) pass --full-name-hash to git-pack-objects")),
12081215
OPT_NEGBIT('n', NULL, &run_update_server_info,
12091216
N_("do not run git-update-server-info"), 1),
12101217
OPT__QUIET(&po_args.quiet, N_("be quiet")),

ci/run-build-and-tests.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ linux-TEST-vars)
2626
export GIT_TEST_NO_WRITE_REV_INDEX=1
2727
export GIT_TEST_CHECKOUT_WORKERS=2
2828
export GIT_TEST_PACK_USE_BITMAP_BOUNDARY_TRAVERSAL=1
29+
export GIT_TEST_FULL_NAME_HASH=1
2930
;;
3031
linux-clang)
3132
export GIT_TEST_DEFAULT_HASH=sha1

pack-objects.h

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,27 @@ static inline uint32_t pack_name_hash(const char *name)
208208
return hash;
209209
}
210210

211+
/*
 * Compute a 32-bit hash over every byte of the path `name`.
 *
 * Returns 0 when `name` is NULL. For an empty string the loop body never
 * runs and the initial seed value is returned unchanged.
 */
static inline uint32_t pack_full_name_hash(const char *name)
{
	const uint32_t bigp = 1234572167U;
	uint32_t c, hash = bigp;

	if (!name)
		return 0;

	/*
	 * Do the simplest thing that will resemble pseudo-randomness: add
	 * random multiples of a large prime number with a binary shift.
	 * The goal is not to be cryptographic, but to be generally
	 * uniformly distributed.
	 *
	 * Each byte is read as unsigned char: the signedness of plain char
	 * is implementation-defined, and a sign-extended byte >= 0x80 would
	 * make the hash of non-ASCII paths differ between platforms.
	 */
	while ((c = (unsigned char)*name++) != 0) {
		hash += c * bigp;
		/* Rotate right by 5 bits so earlier bytes keep contributing. */
		hash = (hash >> 5) | (hash << 27);
	}
	return hash;
}
231+
211232
static inline enum object_type oe_type(const struct object_entry *e)
212233
{
213234
return e->type_valid ? e->type_ : OBJ_BAD;

t/README

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -471,6 +471,10 @@ a test and then fails then the whole test run will abort. This can help to make
471471
sure the expected tests are executed and not silently skipped when their
472472
dependency breaks or is simply not present in a new environment.
473473

474+
GIT_TEST_FULL_NAME_HASH=<boolean>, when true, sets the default name-hash
475+
function in 'git pack-objects' to be the one used by the --full-name-hash
476+
option.
477+
474478
Naming Tests
475479
------------
476480

t/helper/meson.build

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ test_tool_sources = [
3434
'test-match-trees.c',
3535
'test-mergesort.c',
3636
'test-mktemp.c',
37+
'test-name-hash.c',
3738
'test-online-cpus.c',
3839
'test-pack-mtimes.c',
3940
'test-parse-options.c',

t/helper/test-name-hash.c

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
/*
2+
* test-name-hash.c: Read a list of paths over stdin and report on their
3+
* name-hash and full name-hash.
4+
*/
5+
6+
#include "test-tool.h"
7+
#include "git-compat-util.h"
8+
#include "pack-objects.h"
9+
#include "strbuf.h"
10+
11+
/*
 * Entry point of the name-hash test helper.
 *
 * Repeatedly fills a strbuf from stdin via strbuf_getline() and, for each
 * line obtained, prints three tab-separated columns: the pack_name_hash()
 * value, the pack_full_name_hash() value, and the line itself. The loop
 * ends as soon as strbuf_getline() returns non-zero. Both arguments are
 * unused. Always returns 0.
 */
int cmd__name_hash(int argc UNUSED, const char **argv UNUSED)
{
	struct strbuf path = STRBUF_INIT;

	for (;;) {
		if (strbuf_getline(&path, stdin))
			break;

		printf("%10"PRIu32"\t%10"PRIu32"\t%s\n",
		       pack_name_hash(path.buf),
		       pack_full_name_hash(path.buf),
		       path.buf);
	}

	strbuf_release(&path);
	return 0;
}

0 commit comments

Comments
 (0)