Skip to content

Commit

Permalink
feature: switch to xxHash for digests
Browse files Browse the repository at this point in the history
xxHash is a far faster algorithm with md5.

Signed-off-by: Rudi Grinberg <me@rgrinberg.com>

<!-- ps-id: 4b7a67ae-5e18-49aa-b9ea-12bdf7ca649f -->
  • Loading branch information
rgrinberg committed Mar 23, 2023
1 parent 9bd6070 commit 0405e79
Show file tree
Hide file tree
Showing 13 changed files with 6,696 additions and 15 deletions.
7 changes: 5 additions & 2 deletions boot/duneboot.ml
Original file line number Diff line number Diff line change
Expand Up @@ -710,6 +710,7 @@ end
module Library = struct
module File_kind = struct
type t =
| Header
| C
| Ml
| Mli
Expand All @@ -721,6 +722,7 @@ module Library = struct
let fn = Filename.basename fn in
let i = try String.index fn '.' with Not_found -> String.length fn in
match String.sub fn ~pos:i ~len:(String.length fn - i) with
| ".h" -> Some Header
| ".c" -> Some C
| ".ml" -> Some Ml
| ".mli" -> Some Mli
Expand Down Expand Up @@ -756,6 +758,7 @@ module Library = struct
in
match kind with
| C -> base ^ ".c"
| Header -> base ^ ".h"
| _ -> (
let ext =
match kind with
Expand Down Expand Up @@ -812,7 +815,7 @@ module Library = struct
let modules =
List.fold_left files ~init:StringSet.empty ~f:(fun acc (fn, kind) ->
match (kind : File_kind.t) with
| C -> acc
| Header | C -> acc
| Ml | Mli | Mll | Mly ->
let module_name =
let fn = Filename.basename fn in
Expand All @@ -834,7 +837,7 @@ module Library = struct
let mangled = Wrapper.mangle_filename wrapper fn kind in
let dst = build_dir ^/ mangled in
match kind with
| C ->
| Header | C ->
copy fn dst;
Fiber.return [ mangled ]
| Ml | Mli ->
Expand Down
1 change: 1 addition & 0 deletions boot/libs.ml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ let local_libraries =
; ("src/dune_console", Some "Dune_console", false, None)
; ("src/memo", Some "Memo", false, None)
; ("src/dune_metrics", Some "Dune_metrics", false, None)
; ("vendor/xxh", Some "Dune_xxh", false, None)
; ("src/dune_digest", Some "Dune_digest", false, None)
; ("src/dune_sexp", Some "Dune_sexp", false, None)
; ("src/ocaml-config", Some "Ocaml_config", false, None)
Expand Down
2 changes: 1 addition & 1 deletion src/dune_digest/dune
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
(library
(name dune_digest)
(libraries dune_metrics stdune)
(libraries dune_metrics dune_xxh stdune)
(instrumentation
(backend bisect_ppx)))
29 changes: 19 additions & 10 deletions src/dune_digest/dune_digest.ml
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
open Stdune
open Dune_xxh

type t = string

module D = Stdlib.Digest
module Set = String.Set
module Map = String.Map
module Metrics = Dune_metrics
Expand All @@ -14,15 +14,24 @@ module type Digest_impl = sig
end

module Direct_impl : Digest_impl = struct
let file = D.file

let string = D.string
let file f =
let f = Unix.openfile f [ O_RDONLY ] 0 in
try
let res = Xxh.XXH3_128bits.file f in
Unix.close f;
res
with exn ->
let exn = Exn_with_backtrace.capture exn in
Unix.close f;
Exn_with_backtrace.reraise exn

let string s = Xxh.XXH3_128bits.string s
end

module Mutable_impl = struct
let file_ref = ref D.file
let file_ref = ref Direct_impl.file

let string_ref = ref D.string
let string_ref = ref Direct_impl.string

let file f = !file_ref f

Expand All @@ -41,14 +50,14 @@ let equal = String.equal

let file p = Impl.file (Path.to_string p)

let compare x y = Ordering.of_int (D.compare x y)
let compare = String.compare

let to_string = D.to_hex
let to_string = Stdlib.Digest.to_hex

let to_dyn s = Dyn.variant "digest" [ String (to_string s) ]

let from_hex s =
match D.from_hex s with
match Stdlib.Digest.from_hex s with
| s -> Some s
| exception Invalid_argument _ -> None

Expand Down Expand Up @@ -100,7 +109,7 @@ module Path_digest_result = struct

let equal x y =
match (x, y) with
| Ok x, Ok y -> D.equal x y
| Ok x, Ok y -> String.equal x y
| Ok _, _ | _, Ok _ -> false
| Unexpected_kind, Unexpected_kind -> true
| Unexpected_kind, _ | _, Unexpected_kind -> false
Expand Down
2 changes: 1 addition & 1 deletion src/dune_digest/dune_digest.mli
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
open Stdune

(** Digests (MD5) *)
(** Digests *)

type t

Expand Down
2 changes: 1 addition & 1 deletion src/dune_engine/build_system.ml
Original file line number Diff line number Diff line change
Expand Up @@ -350,7 +350,7 @@ end = struct

(* The current version of the rule digest scheme. We should increment it when
making any changes to the scheme, to avoid collisions. *)
let rule_digest_version = 13
let rule_digest_version = 14

let compute_rule_digest (rule : Rule.t) ~deps ~action ~sandbox_mode
~execution_parameters =
Expand Down
27 changes: 27 additions & 0 deletions vendor/update-xxh.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#!/bin/bash

version=5dba1d3c2e1e5e1c6083f62eaed672c85944d7f7

pkg=xxh

set -e -o pipefail

TMP="$(mktemp -d)"
trap "rm -rf $TMP" EXIT

rm -rf $pkg
mkdir -p $pkg

(
cd $TMP
git clone https://github.com/rgrinberg/$pkg.git
cd $pkg
git checkout $version
)

SRC=$TMP/$pkg

cp -v $SRC/src/*.{ml,mli} $SRC/src/{xxhash.c,xxhash.h,xxh_stubs.c} $pkg/

git checkout $pkg/dune
git add -A .
7 changes: 7 additions & 0 deletions vendor/xxh/dune
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
(library
(name dune_xxh)
(libraries unix)
(foreign_stubs
(language c)
(names xxh_stubs xxhash)
(flags -O3)))
34 changes: 34 additions & 0 deletions vendor/xxh/xxh.ml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
module type S = sig
type hash

val string : string -> hash
val file : Unix.file_descr -> string

module Stream : sig
type t

val create : unit -> t
val feed_bytes : t -> Bytes.t -> pos:int -> len:int -> unit
val hash : t -> hash
val reset : t -> unit
end
end

module XXH3_128bits = struct
external string : string -> string = "xxh3_128_string"
(* external hash_64bits : string -> Int64.t = "xxh3_64_string" *)

external file : Unix.file_descr -> string = "xxh_128_fd"

module Stream = struct
type t

external create : unit -> t = "xxh_create"

external feed_bytes : t -> Bytes.t -> pos:int -> len:int -> unit
= "xxh_feed_bytes"

external hash : t -> string = "xxh_128bits"
external reset : t -> unit = "xxh_reset"
end
end
25 changes: 25 additions & 0 deletions vendor/xxh/xxh.mli
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
(** Bindings to the xxHash non-cryptographic hash algorithm
https://cyan4973.github.io/xxHash/
*)

module type S = sig
type hash

val string : string -> hash
val file : Unix.file_descr -> string

module Stream : sig
type t

val create : unit -> t
val feed_bytes : t -> Bytes.t -> pos:int -> len:int -> unit
val hash : t -> hash
val reset : t -> unit
end
end

module XXH3_128bits : S with type hash := string
(* TODO *)
(* module XXH3_64bit : S with type hash := int64 *)
(* module XXH64 : S with type hash := int64 *)
117 changes: 117 additions & 0 deletions vendor/xxh/xxh_stubs.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
#include <caml/alloc.h>
#include <caml/custom.h>
#include <caml/fail.h>
#include <caml/memory.h>
#include <caml/mlvalues.h>
#include <caml/threads.h>
#include <caml/unixsupport.h>

#include <unistd.h>

#include "xxhash.h"

static inline value alloc_128(XXH128_hash_t hash) {
value v_ret = caml_alloc_string(16);
uint64_t *ret = (uint64_t *)&Byte(v_ret, 0);
uint64_t low = hash.low64;
uint64_t high = hash.high64;
ret[0] = hash.low64;
ret[1] = hash.high64;
return v_ret;
}

CAMLprim value xxh3_128_string(value v_s) {
CAMLparam1(v_s);
CAMLlocal1(v_ret);
int len = caml_string_length(v_s);
XXH128_hash_t hash = XXH3_128bits(&Byte(v_s, 0), len);
v_ret = alloc_128(hash);
CAMLreturn(v_ret);
}

CAMLprim value xxh3_64_string(value v_s) {
CAMLparam1(v_s);
CAMLlocal1(v_ret);
int len = caml_string_length(v_s);
uint64_t hash = XXH3_64bits(&Byte(v_s, 0), len);
v_ret = caml_copy_int64(hash);
CAMLreturn(v_ret);
}

CAMLprim value xxh_128_fd(value v_fd) {
CAMLparam1(v_fd);
CAMLlocal1(v_ret);
int fd = Int_val(v_fd);
caml_release_runtime_system();
XXH3_state_t *state = XXH3_createState();
if (state == NULL)
caml_raise_out_of_memory();
if (XXH3_128bits_reset(state) == XXH_ERROR) {
caml_failwith("xxh_128_fd: failed to reset");
}
char buffer[UNIX_BUFFER_SIZE];
size_t count;
while ((count = read(fd, buffer, sizeof(buffer))) != 0) {
XXH3_128bits_update(state, buffer, count);
}
XXH128_hash_t hash = XXH3_128bits_digest(state);
XXH3_freeState(state);
caml_acquire_runtime_system();
v_ret = alloc_128(hash);
CAMLreturn(v_ret);
}

#define Stream_val(v) (*(XXH3_state_t **)Data_custom_val(v))

static void xxh_stream_finalize(value v_state) {
XXH3_state_t *state = Stream_val(v_state);
XXH3_freeState(state);
}

static struct custom_operations xxh_stream_t_ops = {
"xxh.stream",
xxh_stream_finalize,
custom_compare_default,
custom_hash_default,
custom_serialize_default,
custom_deserialize_default,
custom_compare_ext_default,
custom_fixed_length_default};

CAMLprim value xxh_create(value v_unit) {
CAMLparam1(v_unit);
XXH3_state_t *state = XXH3_createState();
if (XXH3_128bits_reset(state) == XXH_ERROR) {
caml_failwith("xxh_128bits_reset: failed to reset");
}
value v_state =
caml_alloc_custom(&xxh_stream_t_ops, sizeof(XXH3_state_t *), 0, 1);
Stream_val(v_state) = state;
CAMLreturn(v_state);
}

CAMLprim value xxh_feed_bytes(value v_state, value v_bytes, value v_pos,
value v_len) {
CAMLparam4(v_state, v_bytes, v_pos, v_len);
XXH3_state_t *state = Stream_val(v_state);
XXH3_128bits_update(state, &Byte(v_bytes, Int_val(v_pos)), Int_val(v_len));
CAMLreturn(Val_unit);
}

CAMLprim value xxh_128bits(value v_state) {
CAMLparam1(v_state);
CAMLlocal1(v_ret);
XXH3_state_t *state = Stream_val(v_state);
XXH128_hash_t hash = XXH3_128bits_digest(state);
v_ret = alloc_128(hash);
CAMLreturn(v_ret);
}

CAMLprim value xxh_reset(value v_state) {
CAMLparam1(v_state);
XXH3_state_t *state = Stream_val(v_state);
if (XXH3_128bits_reset(state) == XXH_ERROR) {
caml_failwith("xxh_128bits_reset: failed to reset");
}
CAMLreturn(Val_unit);
}
43 changes: 43 additions & 0 deletions vendor/xxh/xxhash.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
/*
* xxHash - Extremely Fast Hash algorithm
* Copyright (C) 2012-2021 Yann Collet
*
* BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following disclaimer
* in the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* You can contact the author at:
* - xxHash homepage: https://www.xxhash.com
* - xxHash source repository: https://github.com/Cyan4973/xxHash
*/


/*
* xxhash.c instantiates functions defined in xxhash.h
*/

#define XXH_STATIC_LINKING_ONLY /* access advanced declarations */
#define XXH_IMPLEMENTATION /* access definitions */

#include "xxhash.h"
Loading

0 comments on commit 0405e79

Please sign in to comment.