diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..fd97b91a --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,17 @@ +repos: + - repo: local + hooks: + - id: dune-build-fmt + name: dune build @fmt + entry: dune build @fmt --auto-promote + pass_filenames: false + language: system + types: [text] + stages: [pre-push] + - id: dune-build-opam + name: dune build + entry: dune build @install + pass_filenames: false + language: system + types: [text] + stages: [pre-push] diff --git a/Makefile b/Makefile index f6190bbd..8a81b8b2 100644 --- a/Makefile +++ b/Makefile @@ -24,6 +24,8 @@ format: format-check: @dune build $(DUNE_OPTS) @fmt --display=quiet +install-pre-commit-hook: + uvx pre-commit install --hook-type pre-push VERSION=$(shell awk '/^version:/ {print $$2}' containers.opam) diff --git a/benchs/bench_hash.ml b/benchs/bench_hash.ml new file mode 100644 index 00000000..4fc12bf2 --- /dev/null +++ b/benchs/bench_hash.ml @@ -0,0 +1,78 @@ +(** Benchmarks for CCHash primitives. +*) + +[@@@warning "-5"] + +module B = Benchmark + +let repeat = 3 +let n_ints = 1_000 +let ints = Array.init n_ints (fun i -> i * 2654435761) + +let bench_int_hash ~time () = + B.throughputN time ~repeat + [ + ( "CCHash.int", + (fun () -> + Array.iter + (fun x -> ignore @@ Sys.opaque_identity (CCHash.int x)) + ints), + () ); + ( "Hashtbl.hash (poly)", + (fun () -> + Array.iter + (fun x -> ignore @@ Sys.opaque_identity (Hashtbl.hash x)) + ints), + () ); + ( "CCHash.int64", + (fun () -> + Array.iter + (fun x -> + ignore @@ Sys.opaque_identity (CCHash.int64 (Int64.of_int x))) + ints), + () ); + ] + +let bench_combine64 ~time () = + B.throughputN time ~repeat + [ + ( "combine64 chain x5", + (fun () -> + Array.iter + (fun x -> + ignore + @@ Sys.opaque_identity + CCHash64.( + combine2 + (combine2 + (combine2 + (combine2 (Int64.of_int x) + (Int64.of_int (x lxor 0xaaaa))) + (Int64.of_int (x + 1))) + (Int64.of_int (x * 3))) + (Int64.of_int (x lxor (x lsr 7))))) + ints), + () ); + ( "CCHash.list int [1..5]", + (fun () -> + Array.iter + (fun x -> + ignore + @@ Sys.opaque_identity + (Int64.of_int + CCHash.(list int [ x + 1; x + 2; x + 3; x + 4; x + 5 ]))) + ints), + () ); + ] + +(* --- tree for run_global ------------------------------------------------- *) + +let () = + B.Tree.( + register @@ "hash" + @>>> [ + "int" @> lazy (bench_int_hash ~time:2 ()); + "combine64" @> lazy (bench_combine64 ~time:2 ()); + ]) + +let () = try B.Tree.run_global () with Arg.Help msg -> print_endline msg diff --git a/benchs/dune b/benchs/dune index f0d36ef7..ac88101d 100644 --- a/benchs/dune +++ b/benchs/dune @@ -4,6 +4,7 @@ containers containers_pvec containers-data + containers.xxhash benchmark gen iter @@ -22,3 +23,17 @@ -unbox-closures -unbox-closures-factor 20)) + +(executable + (name bench_hash) + (libraries containers benchmark) + (flags :standard -warn-error -3-5 -w -60 -safe-string -color always) + (optional) + (ocamlopt_flags + :standard + -O3 + -color + always + -unbox-closures + -unbox-closures-factor + 20)) diff --git a/benchs/run_bench_hash.ml b/benchs/run_bench_hash.ml index 14cf8830..18f212ba 100644 --- a/benchs/run_bench_hash.ml +++ b/benchs/run_bench_hash.ml @@ -21,7 +21,8 @@ let rec eq t1 t2 = let rec hash_tree t = match t with | Empty -> CCHash.string "empty" - | Node (i, l) -> CCHash.(combine2 (int i) (list hash_tree l)) + | Node (i, l) -> + CCHash.((combine2 [@alert "-deprecated"]) (int i) (list hash_tree l)) module H = Hashtbl.Make (struct type t = tree diff --git a/benchs/run_benchs.ml b/benchs/run_benchs.ml index db23a3c4..dddf908e 100644 --- a/benchs/run_benchs.ml +++ b/benchs/run_benchs.ml @@ -1535,7 +1535,7 @@ module Str = struct let rand_str_ ?(among = "abcdefgh") n = let module Q = QCheck in let st = Random.State.make [| n + 17 |] in - let gen_c = QCheck.Gen.oneofl (CCString.to_list among) in + let gen_c = QCheck.Gen.oneof_list (CCString.to_list among) in QCheck.Gen.string_size ~gen:gen_c (QCheck.Gen.return n) st let find ?(start = 0) ~sub s = @@ -1824,4 +1824,60 @@ module Str = struct ]) end +module Hash = struct + (* Old FNV-based string hash (from before the rrmxmx+fmix64 C implementation) *) + let fnv_offset_basis = 0xcbf29ce484222325L + let fnv_prime = 0x100000001b3L + + let string_fnv (s : string) = + let h = ref fnv_offset_basis in + for i = 0 to String.length s - 1 do + let c = String.unsafe_get s i in + (h := Int64.(mul !h fnv_prime)); + h := Int64.(logxor !h (of_int (Char.code c))) + done; + Int64.to_int !h land max_int + + let rand = Random.State.make [| 42 |] + + let mk_strings n len = + Array.init n (fun _ -> + String.init len (fun _ -> Char.chr (65 + Random.State.int rand 26))) + + let bench_string_hash ?(time = 2) ~len n = + let strings = mk_strings n len in + let bench_new () = + Array.iter (fun s -> opaque_ignore (CCHash.string s)) strings + and bench_fnv () = + Array.iter (fun s -> opaque_ignore (string_fnv s)) strings + and bench_poly () = + Array.iter (fun s -> opaque_ignore (Hashtbl.hash s)) strings + and bench_xxhash () = + Array.iter + (fun s -> + opaque_ignore (Int64.to_int (Containers_xxhash.hash_string s))) + strings + in + B.throughputN time ~repeat + [ + "CCHash.string (new)", bench_new, (); + "string_fnv (old)", bench_fnv, (); + "Hashtbl.hash (poly)", bench_poly, (); + "xxhash", bench_xxhash, (); + ] + + let () = + B.Tree.register + ("hash" + @>>> [ + "string" + @>> B.Tree.concat + [ + app_int (bench_string_hash ~time:2 ~len:16) 1_000; + app_int (bench_string_hash ~time:2 ~len:64) 1_000; + app_int (bench_string_hash ~time:2 ~len:256) 1_000; + ]; + ]) +end + let () = try B.Tree.run_global () with Arg.Help msg -> print_endline msg diff --git a/run_bench_hash.sh b/run_bench_hash.sh index 90114057..cfe3612d 100755 --- a/run_bench_hash.sh +++ b/run_bench_hash.sh @@ -1,4 +1,4 @@ #!/bin/sh OPTS="--profile=release --display=quiet" -exec dune exec $OPTS -- benchs/run_benchs_hash.exe $@ +exec dune exec $OPTS -- ./benchs/run_bench_hash.exe $@ diff --git a/src/core/CCHash.ml b/src/core/CCHash.ml index b4fdc384..99078f04 100644 --- a/src/core/CCHash.ml +++ b/src/core/CCHash.ml @@ -7,130 +7,107 @@ type 'a t = 'a -> hash type 'a iter = ('a -> unit) -> unit type 'a gen = unit -> 'a option -(* FNV hashing - https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function -*) -let fnv_offset_basis = 0xcbf29ce484222325L -let fnv_prime = 0x100000001b3L +let[@inline] combine2 a b = + Hash_impl_.(finalize (combine_int (combine_int seed a) b)) -(* hash an integer *) -let hash_int_ n = - let h = ref fnv_offset_basis in - for k = 0 to 7 do - (h := Int64.(mul !h fnv_prime)); - h := Int64.(logxor !h (of_int ((n lsr (k * 8)) land 0xff))) - done; - (* truncate back to int and remove sign *) - Int64.to_int !h land max_int - -let combine2 a b = - let h = ref fnv_offset_basis in - (* we only do one loop, where we mix bytes of [a] and [b], so as - to simplify control flow *) - for k = 0 to 7 do - (h := Int64.(mul !h fnv_prime)); - (h := Int64.(logxor !h (of_int ((a lsr (k * 8)) land 0xff)))); - (h := Int64.(mul !h fnv_prime)); - h := Int64.(logxor !h (of_int ((b lsr (k * 8)) land 0xff))) - done; - Int64.to_int !h land max_int - -let[@inline] combine f s x = combine2 s (f x) +let[@inline] combine f s x = + Hash_impl_.(finalize (combine_int (combine_int seed s) (f x))) let combine3 a b c = - let h = ref fnv_offset_basis in - (* we only do one loop, where we mix bytes of [a] [b] and [c], so as - to simplify control flow *) - for k = 0 to 7 do - (h := Int64.(mul !h fnv_prime)); - (h := Int64.(logxor !h (of_int ((a lsr (k * 8)) land 0xff)))); - (h := Int64.(mul !h fnv_prime)); - (h := Int64.(logxor !h (of_int ((b lsr (k * 8)) land 0xff)))); - (h := Int64.(mul !h fnv_prime)); - h := Int64.(logxor !h (of_int ((c lsr (k * 8)) land 0xff))) - done; - Int64.to_int !h land max_int + Hash_impl_.( + let s = combine_int (combine_int seed a) b in + finalize (combine_int s c)) let combine4 a b c d = - let h = ref fnv_offset_basis in - for k = 0 to 7 do - (h := Int64.(mul !h fnv_prime)); - (h := Int64.(logxor !h (of_int ((a lsr (k * 8)) land 0xff)))); - (h := Int64.(mul !h fnv_prime)); - (h := Int64.(logxor !h (of_int ((b lsr (k * 8)) land 0xff)))); - (h := Int64.(mul !h fnv_prime)); - (h := Int64.(logxor !h (of_int ((c lsr (k * 8)) land 0xff)))); - (h := Int64.(mul !h fnv_prime)); - h := Int64.(logxor !h (of_int ((d lsr (k * 8)) land 0xff))) - done; - Int64.to_int !h land max_int + Hash_impl_.( + let s = combine_int (combine_int seed a) b in + let s = combine_int s c in + finalize (combine_int s d)) -let combine5 a b c d e = combine3 a b (combine3 c d e) -let combine6 a b c d e f = combine4 a b c (combine3 d e f) +let combine5 a b c d e = + Hash_impl_.( + let s = combine_int (combine_int seed a) b in + let s = combine_int s c in + let s = combine_int s d in + finalize (combine_int s e)) -(** {2 Combinators} *) +let combine6 a b c d e f = + Hash_impl_.( + let s = combine_int (combine_int seed a) b in + let s = combine_int s c in + let s = combine_int s d in + let s = combine_int s e in + finalize (combine_int s f)) + +(** {2 Primitive hashers} *) let const h _ = h let const0 _ = 0 -let int = hash_int_ +let int n = Hash_impl_.(finalize (combine_int seed n)) let bool b = - hash_int_ + int (if b then 1 else 2) -let char x = hash_int_ (Char.code x) +let char x = Hash_impl_.(finalize (combine_char seed (Char.code x))) +let int64 (n : int64) : int = Hash_impl_.(finalize (combine_i64 seed n)) +let int32 (x : int32) : int = Hash_impl_.(finalize (combine_i32 seed x)) -(* hash an integer *) -let int64 n : int = - let h = ref fnv_offset_basis in - for k = 0 to 7 do - (h := Int64.(mul !h fnv_prime)); - h := Int64.(logxor !h (logand (shift_right_logical n (k * 8)) 0xffL)) - done; - (* truncate back to int and remove sign *) - Int64.to_int !h land max_int - -let int32 (x : int32) = int64 (Int64.of_int32 x) -let nativeint (x : nativeint) = int64 (Int64.of_nativeint x) - -(* do not hash more than 128 bytes in strings/bytes *) -let max_len_b_ = 128 +let nativeint (x : nativeint) = + Hash_impl_.(finalize (combine_i64 seed (Int64.of_nativeint x))) let bytes (x : bytes) = - let h = ref fnv_offset_basis in - for i = 0 to min max_len_b_ (Bytes.length x - 1) do - (h := Int64.(mul !h fnv_prime)); - let byte = Char.code (Bytes.unsafe_get x i) in - h := Int64.(logxor !h (of_int byte)) - done; - Int64.to_int !h land max_int + Hash_impl_.(finalize (combine_string seed (Bytes.unsafe_to_string x))) -let string (x : string) = bytes (Bytes.unsafe_of_string x) +let string (x : string) = Hash_impl_.(finalize (combine_string seed x)) let slice x i len = let j = i + len in - let rec aux i s = - if i = j then - s + let rec aux k s = + if k = j then + Hash_impl_.finalize s else - aux (i + 1) (combine2 (Char.code x.[i]) s) + aux (k + 1) + (Hash_impl_.combine_char s (Char.code (String.unsafe_get x k))) in - aux i 0 + aux i Hash_impl_.seed let opt f = function | None -> 42 - | Some x -> combine2 43 (f x) + | Some x -> Hash_impl_.(finalize (combine_int (combine_int seed 43) (f x))) -let list f l = List.fold_left (combine f) 0x42 l -let array f l = Array.fold_left (combine f) 0x42 l -let pair f g (x, y) = combine2 (f x) (g y) -let triple f g h (x, y, z) = combine2 (combine2 (f x) (g y)) (h z) +let list f l = + let s = + List.fold_left (fun s x -> Hash_impl_.combine_int s (f x)) Hash_impl_.seed l + in + Hash_impl_.finalize s + +let array f a = + let s = + Array.fold_left + (fun s x -> Hash_impl_.combine_int s (f x)) + Hash_impl_.seed a + in + Hash_impl_.finalize s + +let pair f g (x, y) = + Hash_impl_.(finalize (combine_int (combine_int seed (f x)) (g y))) + +let triple f g h (x, y, z) = + Hash_impl_.( + let s = combine_int seed (f x) in + let s = combine_int s (g y) in + finalize (combine_int s (h z))) let quad f g h i (x, y, z, w) = - combine2 (combine2 (f x) (g y)) (combine2 (h z) (i w)) + Hash_impl_.( + let s = combine_int seed (f x) in + let s = combine_int s (g y) in + let s = combine_int s (h z) in + finalize (combine_int s (i w))) let map f h x = h (f x) @@ -144,8 +121,10 @@ let poly x = Hashtbl.hash x let array_of_hashes_ arr = Array.sort CCInt.compare arr; - (* sort the hashes, so their order does not matter *) - Array.fold_left combine2 0x42 arr + let s = + Array.fold_left (fun s h -> Hash_impl_.combine_int s h) Hash_impl_.seed arr + in + Hash_impl_.finalize s let array_comm f a = let arr = Array.init (Array.length a) (fun i -> f a.(i)) in @@ -157,19 +136,19 @@ let list_comm f l = array_of_hashes_ arr let iter f seq = - let h = ref 0x43 in - seq (fun x -> h := combine f !h x); - !h + let s = ref Hash_impl_.seed in + seq (fun x -> s := Hash_impl_.combine_int !s (f x)); + Hash_impl_.finalize !s -let seq f seq = - let h = ref 0x43 in - Seq.iter (fun x -> h := combine f !h x) seq; - !h +let seq f sq = + let s = ref Hash_impl_.seed in + Seq.iter (fun x -> s := Hash_impl_.combine_int !s (f x)) sq; + Hash_impl_.finalize !s let gen f g = let rec aux s = match g () with - | None -> s - | Some x -> aux (combine2 s (f x)) + | None -> Hash_impl_.finalize s + | Some x -> aux (Hash_impl_.combine_int s (f x)) in - aux 0x42 + aux Hash_impl_.seed diff --git a/src/core/CCHash.mli b/src/core/CCHash.mli index cae1bf89..46140b35 100644 --- a/src/core/CCHash.mli +++ b/src/core/CCHash.mli @@ -9,6 +9,15 @@ across versions of OCaml and Containers). *) +(* TODO: for 4.xx: + + {[type state = int64 + val seed : state + type 'a t = state -> 'a -> state + val finalize : state -> int64 + ]} + *) + (** {2 Definitions} *) type hash = int diff --git a/src/core/CCHash64.ml b/src/core/CCHash64.ml new file mode 100644 index 00000000..4cabb288 --- /dev/null +++ b/src/core/CCHash64.ml @@ -0,0 +1,103 @@ +(* This file is free software, part of containers. See file "license" for more details. *) + +(** {1 Hash combinators with 64-bit state} *) + +type state = int64 + +let seed : state = Hash_impl_.seed +let[@inline] finalize64 (s : state) : int64 = Hash_impl_.fmix64 s +let[@inline] finalize (s : state) : int = Hash_impl_.finalize s + +type 'a t = state -> 'a -> state + +let[@inline] apply h x = finalize64 (h seed x) +let apply_int h x = Int64.to_int (finalize64 (h seed x)) +let[@inline] int s x = Hash_impl_.combine_int s x + +let[@inline] bool s b = + Hash_impl_.combine_int s + (if b then + 1 + else + 2) + +let[@inline] char s c = Hash_impl_.combine_char s (Char.code c) +let[@inline] int64 s (x : int64) = Hash_impl_.combine_i64 s x +let[@inline] int32 s (x : int32) = Hash_impl_.combine_i32 s x + +let[@inline] nativeint s (x : nativeint) = + Hash_impl_.combine_i64 s (Int64.of_nativeint x) + +let[@inline] string s x = Hash_impl_.combine_string s x +let[@inline] bytes s x = Hash_impl_.combine_string s (Bytes.unsafe_to_string x) + +let slice str ofs s len = + let j = ofs + len in + let rec loop k st = + if k = j then + st + else + loop (k + 1) + (Hash_impl_.combine_char st (Char.code (String.unsafe_get str k))) + in + loop ofs s + +let opt f s = function + | None -> Hash_impl_.combine_int s 0 + | Some x -> f (Hash_impl_.combine_int s 1) x + +let list f s l = List.fold_left f s l +let array f s a = Array.fold_left f s a +let pair f g s (x, y) = g (f s x) y +let triple f g h s (x, y, z) = h (g (f s x) y) z +let quad f g h k s (x, y, z, w) = k (h (g (f s x) y) z) w +let map proj f s x = f s (proj x) + +let if_ b then_ else_ s x = + if b then + then_ s x + else + else_ s x + +let poly s x = Hash_impl_.combine_int s (Hashtbl.hash x) + +type 'a iter = ('a -> unit) -> unit +type 'a gen = unit -> 'a option + +let iter f s seq = + let st = ref s in + seq (fun x -> st := f !st x); + !st + +let seq f s sq = Seq.fold_left f s sq + +let gen f s g = + let rec aux st = + match g () with + | None -> st + | Some x -> aux (f st x) + in + aux s + +let[@inline] combine2 a b = + Hash_impl_.(finalize64 (combine_i64 (combine_i64 seed a) b)) + +let combine3 a b c = + Hash_impl_.( + let s = combine_i64 (combine_i64 seed a) b in + finalize64 (combine_i64 s c)) + +let combine4 a b c d = + Hash_impl_.( + let s = combine_i64 (combine_i64 seed a) b in + let s = combine_i64 s c in + finalize64 (combine_i64 s d)) + +let array_comm f s a = + let hashes = Array.map (fun x -> finalize64 (f seed x)) a in + Array.sort Int64.compare hashes; + Array.fold_left Hash_impl_.combine_i64 s hashes + +let list_comm f s l = + let arr = Array.of_list l in + array_comm f s arr diff --git a/src/core/CCHash64.mli b/src/core/CCHash64.mli new file mode 100644 index 00000000..2909e824 --- /dev/null +++ b/src/core/CCHash64.mli @@ -0,0 +1,95 @@ +(** Hash combinators with 64-bit state threading. + + State is threaded explicitly as a private [int64] through combinators. + Finalize with {!finalize} (returns a positive [int]) or {!finalize64} + (returns the raw [int64]). + + Typical use: + {[ + let hash_my_record r = + CCHash64.(finalize (string (int seed r.id) r.name)) + + let hash_pair (a, b) = + CCHash64.(finalize (pair int string seed (a, b))) + ]} + + {b Implementation}: xorshift+multiply combiner with fmix64 (Murmur3) + finalizer, via C stubs. Unboxed in native code. + + @since NEXT_RELEASE +*) + +type state = private int64 +(** Hash accumulator (64-bit). Create with {!seed}; finish with {!finalize} + or {!finalize64}. *) + +val seed : state +(** Initial hash state (golden-ratio constant). *) + +val finalize64 : state -> int64 +(** Apply fmix64 (Murmur3) and return the full 64-bit result. + The result may be negative as a signed [int64]. *) + +val finalize : state -> int +(** Apply fmix64 and return a non-negative [int] (strips sign bit). *) + +type 'a t = state -> 'a -> state +(** A hash combiner: takes the current state, mixes in a value, returns the + updated state. *) + +val apply : 'a t -> 'a -> int64 +(** Hash the input *) + +val apply_int : 'a t -> 'a -> int +(** Hash the input and truncate to [int] *) + +val int : int t +val bool : bool t +val char : char t +val int32 : int32 t +val int64 : int64 t +val nativeint : nativeint t +val string : string t + +val bytes : bytes t +(** @since 3.5 *) + +val slice : string -> int -> int t +(** [slice str ofs s len] mixes the byte slice [str[ofs .. ofs+len-1]] into [s]. *) + +val opt : 'a t -> 'a option t +val list : 'a t -> 'a list t +val array : 'a t -> 'a array t +val pair : 'a t -> 'b t -> ('a * 'b) t +val triple : 'a t -> 'b t -> 'c t -> ('a * 'b * 'c) t +val quad : 'a t -> 'b t -> 'c t -> 'd t -> ('a * 'b * 'c * 'd) t + +val map : ('a -> 'b) -> 'b t -> 'a t +(** [map proj f] applies [proj] before hashing with [f]. + Example: [map fst int] hashes only the first element of a pair. + @since 3.5 *) + +val if_ : bool -> 'a t -> 'a t -> 'a t +(** [if_ b t e] uses hasher [t] when [b] is true, [e] otherwise. *) + +val combine2 : int64 -> int64 -> int64 +val combine3 : int64 -> int64 -> int64 -> int64 +val combine4 : int64 -> int64 -> int64 -> int64 -> int64 + +val poly : 'a t +(** Uses [Hashtbl.hash] internally. *) + +val list_comm : 'a t -> 'a list t +(** Commutative: lists equal up to permutation hash the same. *) + +val array_comm : 'a t -> 'a array t +(** Commutative: arrays equal up to permutation hash the same. *) + +(** {2 Iterators} *) + +type 'a iter = ('a -> unit) -> unit +type 'a gen = unit -> 'a option + +val seq : 'a t -> 'a Seq.t t +val iter : 'a t -> 'a iter t +val gen : 'a t -> 'a gen t diff --git a/src/core/CCInt.ml b/src/core/CCInt.ml index 801b0857..edc6fa6b 100644 --- a/src/core/CCInt.ml +++ b/src/core/CCInt.ml @@ -4,20 +4,8 @@ include Int type 'a iter = ('a -> unit) -> unit -(* use FNV: - https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function *) let hash (n : int) : int = - let offset_basis = 0xcbf29ce484222325L in - let prime = 0x100000001b3L in - - let h = ref offset_basis in - for k = 0 to 7 do - (h := Int64.(mul !h prime)); - (* h := h xor (k-th byte of n) *) - h := Int64.(logxor !h (of_int ((n lsr (k * 8)) land 0xff))) - done; - (* truncate back to int and remove sign *) - Int64.to_int !h land max_int + Hash_impl_.(finalize (combine_i64 seed (Int64.of_int n))) let range i j yield = let rec up i j yield = diff --git a/src/core/CCInt32.ml b/src/core/CCInt32.ml index 45b3f4a5..e808cfaa 100644 --- a/src/core/CCInt32.ml +++ b/src/core/CCInt32.ml @@ -7,13 +7,9 @@ include Int32 let min : t -> t -> t = Stdlib.min let max : t -> t -> t = Stdlib.max -[@@@endif] -[@@@iflt 5.1] - -let hash x = Stdlib.abs (to_int x) - [@@@endif] +let hash (x : t) : int = Hash_impl_.(finalize (combine_i32 seed x)) let sign i = compare i zero let pow a b = diff --git a/src/core/CCInt64.ml b/src/core/CCInt64.ml index 8bb45ead..4a2446cf 100644 --- a/src/core/CCInt64.ml +++ b/src/core/CCInt64.ml @@ -10,22 +10,8 @@ let max : t -> t -> t = Stdlib.max [@@@endif] let sign i = compare i zero - -(* use FNV: - https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function *) -let hash_to_int64 (n : t) = - let offset_basis = 0xcbf29ce484222325L in - let prime = 0x100000001b3L in - - let h = ref offset_basis in - for k = 0 to 7 do - h := mul !h prime; - (* h := h xor (k-th byte of n) *) - h := logxor !h (logand (shift_right n (k * 8)) 0xffL) - done; - logand !h max_int - -let[@inline] hash (n : t) : int = to_int (hash_to_int64 n) land Stdlib.max_int +let hash_to_int64 (n : t) : t = Hash_impl_.(fmix64 (combine_i64 seed n)) +let[@inline] hash (n : t) : int = Hash_impl_.(finalize (combine_i64 seed n)) (* see {!CCInt.popcount} for more details *) let[@inline] popcount (b : t) : int = diff --git a/src/core/CCString.ml b/src/core/CCString.ml index 9188b2a2..048e06f1 100644 --- a/src/core/CCString.ml +++ b/src/core/CCString.ml @@ -11,7 +11,7 @@ include String let compare_int (a : int) b = Stdlib.compare a b let compare = String.compare -let hash s = Hashtbl.hash s +let hash (s : string) : int = Hash_impl_.(finalize (combine_string seed s)) let length = String.length let is_empty s = equal s "" diff --git a/src/core/Hash_impl_.ml b/src/core/Hash_impl_.ml new file mode 100644 index 00000000..1bc8d8d2 --- /dev/null +++ b/src/core/Hash_impl_.ml @@ -0,0 +1,58 @@ +(* This file is free software, part of containers. See file "license" for more details. *) + +(** Internal hash implementation. + + Combiner: [state ^= chunk; state ^= state >> 32; state *= 0xd6e8feb86659fd93] + Finalizer: fmix64 (Murmur3). + + Multiplicative constant 0xd6e8feb86659fd93 (rrmxmx family, Pelle Evensen, 2018): + https://mostlymangling.blogspot.com/2018/07/on-mixing-functions-in-fast-hashing.html + Also evaluated in Chris Wellons' hash-prospector: + https://github.com/skeeto/hash-prospector + + fmix64 constants (Murmur3, Austin Appleby): + https://github.com/aappleby/smhasher + + Not part of the public API; use {!CCHash} instead. *) + +(** Initial hash state (golden-ratio constant). *) +let seed : int64 = 0x9e3779b97f4a7c15L + +external combine_int : + (int64[@unboxed]) -> (int[@untagged]) -> (int64[@unboxed]) + = "caml_cc_hash_combine_int_byte" "caml_cc_hash_combine_int" +[@@noalloc] +(** [combine_int state x] mixes OCaml int [x] into [state]. *) + +external combine_i64 : + (int64[@unboxed]) -> (int64[@unboxed]) -> (int64[@unboxed]) + = "caml_cc_hash_combine_i64_byte" "caml_cc_hash_combine_i64" +[@@noalloc] +(** [combine_i64 state chunk] mixes [chunk] into [state]. *) + +external combine_i32 : + (int64[@unboxed]) -> (int32[@unboxed]) -> (int64[@unboxed]) + = "caml_cc_hash_combine_i32_byte" "caml_cc_hash_combine_i32" +[@@noalloc] +(** [combine_i32 state chunk] mixes [chunk] into [state]. *) + +external combine_char : + (int64[@unboxed]) -> (int[@untagged]) -> (int64[@unboxed]) + = "caml_cc_hash_combine_char_byte" "caml_cc_hash_combine_char" +[@@noalloc] +(** [combine_char state c] mixes character code [c] into [state]. *) + +external combine_string : (int64[@unboxed]) -> string -> (int64[@unboxed]) + = "caml_cc_hash_combine_string_byte" "caml_cc_hash_combine_string" +[@@noalloc] +(** [combine_string state s] mixes all bytes of [s] into [state] in 8-byte chunks. *) + +external fmix64 : (int64[@unboxed]) -> (int64[@unboxed]) + = "caml_cc_hash_fmix64_byte" "caml_cc_hash_fmix64" +[@@noalloc] +(** [fmix64 state] applies the Murmur3 finalizer. Result may be negative. *) + +external finalize : (int64[@unboxed]) -> (int[@untagged]) + = "caml_cc_hash_finalize_byte" "caml_cc_hash_finalize" +[@@noalloc] +(** [finalize state] applies fmix64 and returns a non-negative [int]. *) diff --git a/src/core/dune b/src/core/dune index bd362200..28a110d1 100644 --- a/src/core/dune +++ b/src/core/dune @@ -6,7 +6,12 @@ (action (run %{project_root}/src/core/cpp/cpp.exe %{input-file}))) (flags :standard -nolabels -open CCMonomorphic) - (libraries either containers.monomorphic containers.domain)) + (libraries either containers.monomorphic containers.domain) + (private_modules Hash_impl_) + (foreign_stubs + (language c) + (flags :standard -O2) + (names hash_stubs))) (ocamllex (modules CCSexp_lex)) diff --git a/src/core/hash_stubs.c b/src/core/hash_stubs.c new file mode 100644 index 00000000..c0d217a3 --- /dev/null +++ b/src/core/hash_stubs.c @@ -0,0 +1,168 @@ +/* This file is free software, part of containers. See file "license" for more details. */ + +/* Hash implementation: xorshift+multiply combiner with fmix64 finalizer. + Combiner: state ^= chunk; state ^= state >> 32; state *= 0xd6e8feb86659fd93 + Finalizer (fmix64, Murmur3): three rounds of xorshift-multiply. + + Multiplicative constant 0xd6e8feb86659fd93 (rrmxmx family, Pelle Evensen, 2018): + https://mostlymangling.blogspot.com/2018/07/on-mixing-functions-in-fast-hashing.html + Also evaluated in Chris Wellons' hash-prospector: + https://github.com/skeeto/hash-prospector + + fmix64 constants 0xff51afd7ed558ccd / 0xc4ceb9fe1a85ec53 (Murmur3, Austin Appleby): + https://github.com/aappleby/smhasher +*/ + +#include +#include +#include +#include +#include + +// from murmur2: https://chromium.googlesource.com/external/smhasher/+/c8e8bf81bc6041d6d836365a501a0a96830d2d81/MurmurHash2.cpp +#define HASH_MUL UINT64_C(0xc6a4a7935bd1e995) + +// from murmur3: https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp#L81 +#define FMIX_C1 UINT64_C(0xff51afd7ed558ccd) +#define FMIX_C2 UINT64_C(0xc4ceb9fe1a85ec53) + +static inline uint64_t hash_combine(uint64_t state, uint64_t chunk) +{ + state ^= chunk; + state ^= state >> 32; + state *= HASH_MUL; + return state; +} + +// fmix64 from murmur3 +static inline uint64_t fmix64(uint64_t h) +{ + h ^= h >> 33; + h *= FMIX_C1; + h ^= h >> 33; + h *= FMIX_C2; + h ^= h >> 33; + return h; +} + +/* --- combine_i64 --------------------------------------------------------- */ + +CAMLprim int64_t caml_cc_hash_combine_i64(int64_t state, int64_t chunk) +{ + return (int64_t)hash_combine((uint64_t)state, (uint64_t)chunk); +} + +CAMLprim value caml_cc_hash_combine_i64_byte(value v_state, value v_chunk) +{ + CAMLparam2(v_state, v_chunk); + uint64_t r = hash_combine((uint64_t)Int64_val(v_state), + (uint64_t)Int64_val(v_chunk)); + CAMLreturn(caml_copy_int64((int64_t)r)); +} + +/* --- combine_i32 --------------------------------------------------------- */ + +CAMLprim int64_t caml_cc_hash_combine_i32(int64_t state, int32_t chunk) +{ + return (int64_t)hash_combine((uint64_t)state, (uint64_t)(uint32_t)chunk); +} + +CAMLprim value caml_cc_hash_combine_i32_byte(value v_state, value v_chunk) +{ + CAMLparam2(v_state, v_chunk); + uint64_t r = hash_combine((uint64_t)Int64_val(v_state), + (uint64_t)(uint32_t)Int32_val(v_chunk)); + CAMLreturn(caml_copy_int64((int64_t)r)); +} + +/* --- combine_int --------------------------------------------------------- */ + +/* chunk is an OCaml int (intnat), passed untagged */ +CAMLprim int64_t caml_cc_hash_combine_int(int64_t state, intnat chunk) +{ + return (int64_t)hash_combine((uint64_t)state, (uint64_t)chunk); +} + +CAMLprim value caml_cc_hash_combine_int_byte(value v_state, value v_chunk) +{ + CAMLparam2(v_state, v_chunk); + uint64_t r = hash_combine((uint64_t)Int64_val(v_state), + (uint64_t)Long_val(v_chunk)); + CAMLreturn(caml_copy_int64((int64_t)r)); +} + +/* --- combine_char -------------------------------------------------------- */ + +/* c is passed as untagged int (Char.code) */ +CAMLprim int64_t caml_cc_hash_combine_char(int64_t state, intnat c) +{ + return (int64_t)hash_combine((uint64_t)state, (uint64_t)(unsigned char)c); +} + +CAMLprim value caml_cc_hash_combine_char_byte(value v_state, value v_c) +{ + CAMLparam2(v_state, v_c); + uint64_t r = hash_combine((uint64_t)Int64_val(v_state), + (uint64_t)(unsigned char)Long_val(v_c)); + CAMLreturn(caml_copy_int64((int64_t)r)); +} + +/* --- combine_string ------------------------------------------------------ */ + +/* Hashes all bytes of [str] into [state] using 8-byte chunks where possible. + [str] is a regular OCaml value; [state] is unboxed int64. */ +CAMLprim int64_t caml_cc_hash_combine_string(int64_t state, value str) +{ + const char *data = String_val(str); + mlsize_t len = caml_string_length(str); + uint64_t s = (uint64_t)state; + mlsize_t i = 0; + + for (; i + 8 <= len; i += 8) { + uint64_t chunk; + memcpy(&chunk, data + i, 8); + s = hash_combine(s, chunk); + } + if (i < len) { + uint64_t chunk = 0; + memcpy(&chunk, data + i, len - i); + s = hash_combine(s, chunk); + } + return (int64_t)s; +} + +CAMLprim value caml_cc_hash_combine_string_byte(value v_state, value str) +{ + CAMLparam2(v_state, str); + int64_t r = caml_cc_hash_combine_string(Int64_val(v_state), str); + CAMLreturn(caml_copy_int64(r)); +} + +/* --- fmix64 -------------------------------------------------------------- */ + +/* Returns full 64-bit fmix64 result; may be "negative" as signed int64. */ +CAMLprim int64_t caml_cc_hash_fmix64(int64_t state) +{ + return (int64_t)fmix64((uint64_t)state); +} + +CAMLprim value caml_cc_hash_fmix64_byte(value v_state) +{ + CAMLparam1(v_state); + CAMLreturn(caml_copy_int64((int64_t)fmix64((uint64_t)Int64_val(v_state)))); +} + +/* --- finalize ------------------------------------------------------------ */ + +/* Applies fmix64 and masks to Max_long (positive OCaml int). */ +CAMLprim intnat caml_cc_hash_finalize(int64_t state) +{ + return (intnat)(fmix64((uint64_t)state) & (uint64_t)Max_long); +} + +CAMLprim value caml_cc_hash_finalize_byte(value v_state) +{ + CAMLparam1(v_state); + intnat r = (intnat)(fmix64((uint64_t)Int64_val(v_state)) & (uint64_t)Max_long); + CAMLreturn(Val_long(r)); +} diff --git a/src/core/tests/dune b/src/core/tests/dune index be3eb1b3..82cc0899 100644 --- a/src/core/tests/dune +++ b/src/core/tests/dune @@ -8,7 +8,14 @@ (name test_hash) (modules test_hash) (flags :standard -warn-error -a+8) - (libraries containers iter)) + (libraries containers iter containers_xxhash)) + +(rule + (alias runtest) + (locks /ctest) + (package containers) + (action + (run ./test_hash.exe))) (executable (name test_random) diff --git a/src/core/tests/test_hash.ml b/src/core/tests/test_hash.ml index f27b9d7d..667cad21 100644 --- a/src/core/tests/test_hash.ml +++ b/src/core/tests/test_hash.ml @@ -1,54 +1,59 @@ (* test hash functions a bit *) -module H = CCHash +module H64 = CCHash64 +module XXH = Containers_xxhash -module Hist = struct - type t = { - tbl: (int, int) Hashtbl.t; - mutable n_samples: int; - } +let n = ref 100_000 +let verbose = ref false - let create () : t = { tbl = Hashtbl.create 32; n_samples = 0 } +let check_bit_proba name hash_fn n_samples = + let rand = Random.State.make [| 42 |] in + let bits = Array.make 64 0 in - let add_n self x n = - Hashtbl.replace self.tbl x (n + try Hashtbl.find self.tbl x with _ -> 0); - self.n_samples <- n + self.n_samples - - let pp out (self : t) : unit = - let max = Hashtbl.fold (fun k _ n -> max k n) self.tbl 0 in - let min = Hashtbl.fold (fun k _ n -> min k n) self.tbl max in - for i = min to max do - let n = try Hashtbl.find self.tbl i with _ -> 0 in - Format.fprintf out "[v=%-4d, n-inputs %-6d] %s@." i n - (String.make (int_of_float @@ ceil (log (float n))) '#') + let n_loops = 30 in + for _i = 1 to n_loops do + let base = Random.State.int64 rand Int64.(pred max_int) |> Int64.to_int in + for i = 1 to n_samples do + let h = hash_fn (base + i) in + for b = 0 to 63 do + if Int64.(logand h (shift_left 1L b)) <> 0L then + bits.(b) <- bits.(b) + 1 + done done -end - -let reset_line = "\x1b[2K\r" - -let t_int n1 n2 = - Printf.printf "test hash_int on %d--%d\n" n1 n2; - let count = Hashtbl.create 128 in - for i = n1 to n2 do - Printf.printf "%shash %d…%!" reset_line i; - let h = H.int i in - Hashtbl.replace count h (1 + CCHashtbl.get_or count h ~default:0); - if i mod 1024 * 1024 * 1024 = 0 then Gc.major () done; - Printf.printf "%s%!" reset_line; - (* reverse table *) - let by_count = - CCHashtbl.to_iter count - |> Iter.map (fun (_h, n) -> n) - |> Iter.count ~hash:H.int - in - let hist = Hist.create () in - by_count (fun (n, i) -> Hist.add_n hist n i); - Format.printf "histogram:@.%a@." Hist.pp hist; - (*assert (Hist.check_uniform hist);*) - () + let n_samples = n_loops * n_samples in + + if !verbose then ( + Format.printf "%s bit probabilities after %d samples:@." name n_samples; + for b = 0 to 63 do + let prob = float bits.(b) /. float n_samples in + Format.printf "bit %2d: %.4f@." b prob + done + ); + let ok = ref true in + for b = 0 to 63 do + let prob = float bits.(b) /. float n_samples in + if prob < 0.48 || prob > 0.52 then ( + Format.printf "FAIL: bit %d has proba %.4f (outside 0.48-0.52)@." b prob; + ok := false + ) + done; + if !ok then + Format.printf "%s: OK@." name + else + (); + !ok + +let speclist = + [ + "-v", Arg.Set verbose, " verbose mode"; + "-n", Arg.Set_int n, " size of the range"; + ] let () = - t_int 0 2_000_000; - t_int (-4_000_000) (-3_500_000); - () + Arg.parse (Arg.align speclist) (fun _ -> ()) "test_hash.exe"; + let ok1 = + check_bit_proba "CCHash64" (fun i -> H64.finalize64 (H64.int H64.seed i)) !n + in + let ok2 = check_bit_proba "XXH" (fun i -> XXH.hash_int i) !n in + if (not ok1) || not ok2 then exit 1 diff --git a/tests/core/t_hash.ml b/tests/core/t_hash.ml index 37b1d6d1..37dce0f4 100644 --- a/tests/core/t_hash.ml +++ b/tests/core/t_hash.ml @@ -17,3 +17,66 @@ t @@ fun () -> string "abc" <> string "abcd";; q Q.int (fun i -> Q.assume (i >= 0); int i = int64 (Int64.of_int i)) +;; + +(* --- stress tests -------------------------------------------------------- *) + +(* Chi-squared distribution test over [count] consecutive integers in [buckets] buckets. + A uniform hash gives chi2 ~ buckets-1; we allow 4 standard deviations of slack. *) +t ~name:"int hash distribution chi2" @@ fun () -> +let count = 50_000 and buckets = 500 in +let counts = Array.make buckets 0 in +for i = 0 to count - 1 do + let b = CCHash.int i mod buckets in + counts.(b) <- counts.(b) + 1 +done; +let expected = float count /. float buckets in +let c2 = + Array.fold_left + (fun acc c -> acc +. (((float c -. expected) ** 2.0) /. expected)) + 0.0 counts +in +let df = float (buckets - 1) in +c2 < df +. (4.0 *. sqrt (2.0 *. df)) +;; + +(* Strict avalanche criterion: flip one input bit, expect ~50% output bits to change. *) +t ~name:"int hash avalanche" @@ fun () -> +let bits = Sys.int_size - 1 in +let total_flips = ref 0 in +let total = ref 0 in +let rng = Random.State.make [| 42; 17; 99 |] in +for _ = 1 to 300 do + let x = Random.State.bits rng in + let hx = CCHash.int x in + for b = 0 to bits - 1 do + let hx' = CCHash.int (x lxor (1 lsl b)) in + total_flips := !total_flips + CCInt.popcount (hx lxor hx'); + total := !total + bits + done +done; +let frac = float !total_flips /. float !total in +frac >= 0.45 && frac <= 0.55 +;; + +(* String hash: no collisions among distinct keys. *) +t ~name:"string hash no collisions" @@ fun () -> +let n = 50_000 in +let tbl = Hashtbl.create n in +let ok = ref true in +for i = 0 to n - 1 do + let h = CCHash.string (Printf.sprintf "key:%d" i) in + if Hashtbl.mem tbl h then ok := false; + Hashtbl.replace tbl h () +done; +!ok +;; + +(* CCHash64 pipeline matches CCHash.pair combiner. *) +q Q.int (fun i -> + let j = i lxor 0xdeadbeef in + let h_pair = CCHash.pair CCHash.int CCHash.int (i, j) in + let h_manual = + CCHash64.(finalize (int (int seed (CCHash.int i)) (CCHash.int j))) + in + h_pair = h_manual)