Skip to content

Commit df498b6

Browse files
committed
pack-objects: create new name-hash algorithm (#5157)
This is an updated version of gitgitgadget#1785, intended for early consumption into Git for Windows. The idea here is to add a new `--full-name-hash` option to `git pack-objects` and `git repack`. This adjusts the name-hash value used for finding delta bases in such a way that uses the full path name with a lower likelihood of collisions than the default name-hash algorithm. In many repositories with name-hash collisions and many versions of those paths, this can significantly reduce the size of a full repack. It can also help in certain cases of `git push`, but only if the pack is already artificially inflated by name-hash collisions; cases that find "sibling" deltas as better choices become worse with `--full-name-hash`. Thus, this option is currently recommended for full repacks of large repos, and on client machines without reachability bitmaps. Some care is taken to ignore this option when using bitmaps, either writing bitmaps or using a bitmap walk during reads. The bitmap file format contains name-hash values, but no way to indicate which function is used, so compatibility is a concern for bitmaps. Future work could explore this idea. After this PR is merged, then the more-involved `--path-walk` option may be considered.
2 parents 1050bd5 + c34be83 commit df498b6

22 files changed

+311
-13
lines changed

Documentation/git-pack-objects.adoc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@ SYNOPSIS
1515
[--revs [--unpacked | --all]] [--keep-pack=<pack-name>]
1616
[--cruft] [--cruft-expiration=<time>]
1717
[--stdout [--filter=<filter-spec>] | <base-name>]
18-
[--shallow] [--keep-true-parents] [--[no-]sparse] < <object-list>
18+
[--shallow] [--keep-true-parents] [--[no-]sparse]
19+
[--full-name-hash] < <object-list>
1920

2021

2122
DESCRIPTION

Documentation/git-repack.adoc

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@ git-repack - Pack unpacked objects in a repository
99
SYNOPSIS
1010
--------
1111
[verse]
12-
'git repack' [-a] [-A] [-d] [-f] [-F] [-l] [-n] [-q] [-b] [-m] [--window=<n>] [--depth=<n>] [--threads=<n>] [--keep-pack=<pack-name>] [--write-midx]
12+
'git repack' [-a] [-A] [-d] [-f] [-F] [-l] [-n] [-q] [-b] [-m]
13+
[--window=<n>] [--depth=<n>] [--threads=<n>] [--keep-pack=<pack-name>]
14+
[--write-midx] [--full-name-hash]
1315

1416
DESCRIPTION
1517
-----------

Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -817,6 +817,7 @@ TEST_BUILTINS_OBJS += test-lazy-init-name-hash.o
817817
TEST_BUILTINS_OBJS += test-match-trees.o
818818
TEST_BUILTINS_OBJS += test-mergesort.o
819819
TEST_BUILTINS_OBJS += test-mktemp.o
820+
TEST_BUILTINS_OBJS += test-name-hash.o
820821
TEST_BUILTINS_OBJS += test-online-cpus.o
821822
TEST_BUILTINS_OBJS += test-pack-mtimes.o
822823
TEST_BUILTINS_OBJS += test-parse-options.o

builtin/pack-objects.c

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -269,6 +269,14 @@ struct configured_exclusion {
269269
static struct oidmap configured_exclusions;
270270

271271
static struct oidset excluded_by_config;
272+
static int use_full_name_hash = -1;
273+
274+
static inline uint32_t pack_name_hash_fn(const char *name)
275+
{
276+
if (use_full_name_hash)
277+
return pack_full_name_hash(name);
278+
return pack_name_hash(name);
279+
}
272280

273281
/*
274282
* stats
@@ -1687,7 +1695,7 @@ static int add_object_entry(const struct object_id *oid, enum object_type type,
16871695
return 0;
16881696
}
16891697

1690-
create_object_entry(oid, type, pack_name_hash(name),
1698+
create_object_entry(oid, type, pack_name_hash_fn(name),
16911699
exclude, name && no_try_delta(name),
16921700
found_pack, found_offset);
16931701
return 1;
@@ -1901,7 +1909,7 @@ static void add_preferred_base_object(const char *name)
19011909
{
19021910
struct pbase_tree *it;
19031911
size_t cmplen;
1904-
unsigned hash = pack_name_hash(name);
1912+
unsigned hash = pack_name_hash_fn(name);
19051913

19061914
if (!num_preferred_base || check_pbase_path(hash))
19071915
return;
@@ -3413,7 +3421,7 @@ static void show_object_pack_hint(struct object *object, const char *name,
34133421
* here using a name now in order to perhaps improve the delta selection
34143422
* process.
34153423
*/
3416-
oe->hash = pack_name_hash(name);
3424+
oe->hash = pack_name_hash_fn(name);
34173425
oe->no_try_delta = name && no_try_delta(name);
34183426

34193427
stdin_packs_hints_nr++;
@@ -3563,7 +3571,7 @@ static void add_cruft_object_entry(const struct object_id *oid, enum object_type
35633571
entry = packlist_find(&to_pack, oid);
35643572
if (entry) {
35653573
if (name) {
3566-
entry->hash = pack_name_hash(name);
3574+
entry->hash = pack_name_hash_fn(name);
35673575
entry->no_try_delta = no_try_delta(name);
35683576
}
35693577
} else {
@@ -3586,7 +3594,7 @@ static void add_cruft_object_entry(const struct object_id *oid, enum object_type
35863594
return;
35873595
}
35883596

3589-
entry = create_object_entry(oid, type, pack_name_hash(name),
3597+
entry = create_object_entry(oid, type, pack_name_hash_fn(name),
35903598
0, name && no_try_delta(name),
35913599
pack, offset);
35923600
}
@@ -4441,6 +4449,8 @@ int cmd_pack_objects(int argc,
44414449
OPT_STRING_LIST(0, "uri-protocol", &uri_protocols,
44424450
N_("protocol"),
44434451
N_("exclude any configured uploadpack.blobpackfileuri with this protocol")),
4452+
OPT_BOOL(0, "full-name-hash", &use_full_name_hash,
4453+
N_("(EXPERIMENTAL!) optimize delta compression across identical path names over time")),
44444454
OPT_END(),
44454455
};
44464456

@@ -4596,6 +4606,11 @@ int cmd_pack_objects(int argc,
45964606
if (pack_to_stdout || !rev_list_all)
45974607
write_bitmap_index = 0;
45984608

4609+
if (write_bitmap_index && use_full_name_hash > 0)
4610+
die(_("currently, the --full-name-hash option is incompatible with --write-bitmap-index"));
4611+
if (use_full_name_hash < 0)
4612+
use_full_name_hash = git_env_bool("GIT_TEST_FULL_NAME_HASH", 0);
4613+
45994614
if (use_delta_islands)
46004615
strvec_push(&rp, "--topo-order");
46014616

builtin/repack.c

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,9 @@ static int run_update_server_info = 1;
4141
static char *packdir, *packtmp_name, *packtmp;
4242

4343
static const char *const git_repack_usage[] = {
44-
N_("git repack [<options>]"),
44+
N_("git repack [-a] [-A] [-d] [-f] [-F] [-l] [-n] [-q] [-b] [-m]\n"
45+
"[--window=<n>] [--depth=<n>] [--threads=<n>] [--keep-pack=<pack-name>]\n"
46+
"[--write-midx] [--full-name-hash]"),
4547
NULL
4648
};
4749

@@ -60,6 +62,7 @@ struct pack_objects_args {
6062
int no_reuse_object;
6163
int quiet;
6264
int local;
65+
int full_name_hash;
6366
struct list_objects_filter_options filter_options;
6467
};
6568

@@ -308,6 +311,8 @@ static void prepare_pack_objects(struct child_process *cmd,
308311
strvec_pushf(&cmd->args, "--no-reuse-delta");
309312
if (args->no_reuse_object)
310313
strvec_pushf(&cmd->args, "--no-reuse-object");
314+
if (args->full_name_hash)
315+
strvec_pushf(&cmd->args, "--full-name-hash");
311316
if (args->local)
312317
strvec_push(&cmd->args, "--local");
313318
if (args->quiet)
@@ -1205,6 +1210,8 @@ int cmd_repack(int argc,
12051210
N_("pass --no-reuse-delta to git-pack-objects")),
12061211
OPT_BOOL('F', NULL, &po_args.no_reuse_object,
12071212
N_("pass --no-reuse-object to git-pack-objects")),
1213+
OPT_BOOL(0, "full-name-hash", &po_args.full_name_hash,
1214+
N_("(EXPERIMENTAL!) pass --full-name-hash to git-pack-objects")),
12081215
OPT_NEGBIT('n', NULL, &run_update_server_info,
12091216
N_("do not run git-update-server-info"), 1),
12101217
OPT__QUIET(&po_args.quiet, N_("be quiet")),

ci/run-build-and-tests.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ linux-TEST-vars)
2626
export GIT_TEST_NO_WRITE_REV_INDEX=1
2727
export GIT_TEST_CHECKOUT_WORKERS=2
2828
export GIT_TEST_PACK_USE_BITMAP_BOUNDARY_TRAVERSAL=1
29+
export GIT_TEST_FULL_NAME_HASH=1
2930
;;
3031
linux-clang)
3132
export GIT_TEST_DEFAULT_HASH=sha1

pack-objects.h

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -208,6 +208,27 @@ static inline uint32_t pack_name_hash(const char *name)
208208
return hash;
209209
}
210210

211+
/*
 * Hash the complete path 'name' into a 32-bit value used when looking
 * for delta bases. Returns 0 when 'name' is NULL; an empty string hashes
 * to the seed value.
 *
 * Do the simplest thing that will resemble pseudo-randomness: for each
 * byte, add a multiple of a large prime number and rotate the result
 * right by five bits. The goal is not to be cryptographic, but to be
 * generally uniformly distributed.
 */
static inline uint32_t pack_full_name_hash(const char *name)
{
	const uint32_t bigp = 1234572167U;
	uint32_t c, hash = bigp;

	if (!name)
		return 0;

	/*
	 * Read each byte as unsigned: whether plain 'char' is signed is
	 * implementation-defined, and a sign-extended byte >= 0x80 would
	 * make the hash of non-ASCII paths differ between platforms.
	 */
	while ((c = (unsigned char)*name++) != 0) {
		hash += c * bigp;
		hash = (hash >> 5) | (hash << 27);
	}
	return hash;
}
231+
211232
static inline enum object_type oe_type(const struct object_entry *e)
212233
{
213234
return e->type_valid ? e->type_ : OBJ_BAD;

t/README

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -471,6 +471,10 @@ a test and then fails then the whole test run will abort. This can help to make
471471
sure the expected tests are executed and not silently skipped when their
472472
dependency breaks or is simply not present in a new environment.
473473

474+
GIT_TEST_FULL_NAME_HASH=<boolean>, when true, sets the default name-hash
475+
function in 'git pack-objects' to be the one used by the --full-name-hash
476+
option.
477+
474478
Naming Tests
475479
------------
476480

t/helper/meson.build

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ test_tool_sources = [
3434
'test-match-trees.c',
3535
'test-mergesort.c',
3636
'test-mktemp.c',
37+
'test-name-hash.c',
3738
'test-online-cpus.c',
3839
'test-pack-mtimes.c',
3940
'test-parse-options.c',

t/helper/test-name-hash.c

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
/*
2+
* test-name-hash.c: Read a list of paths over stdin and report on their
3+
* name-hash and full name-hash.
4+
*/
5+
6+
#include "test-tool.h"
7+
#include "git-compat-util.h"
8+
#include "pack-objects.h"
9+
#include "strbuf.h"
10+
11+
/*
 * Entry point of the name-hash test helper. For each line obtained from
 * stdin, print the default name-hash, the full name-hash and the line
 * itself, separated by tabs. The arguments are ignored and the return
 * value is always 0.
 */
int cmd__name_hash(int argc UNUSED, const char **argv UNUSED)
{
	struct strbuf path = STRBUF_INIT;

	for (;;) {
		/* A non-zero result means there is nothing more to read. */
		if (strbuf_getline(&path, stdin))
			break;

		printf("%10"PRIu32"\t%10"PRIu32"\t%s\n",
		       pack_name_hash(path.buf),
		       pack_full_name_hash(path.buf),
		       path.buf);
	}

	strbuf_release(&path);
	return 0;
}

0 commit comments

Comments
 (0)