Compare commits

...

7 commits

Author SHA1 Message Date
Simon Cruanes
cc0679688d
make the hash test stronger 2026-03-19 21:16:55 -04:00
Simon Cruanes
4cafc2d2c5
change cchash constant
from murmur2
2026-03-19 21:16:46 -04:00
Simon Cruanes
f9ba2e8997
cleanup hash bench 2026-03-19 21:05:41 -04:00
Simon Cruanes
501ba3da9a
makefile 2026-03-19 21:05:32 -04:00
Simon Cruanes
76d8ee79e1
cchash64: add apply and combine* 2026-03-19 21:05:23 -04:00
Simon Cruanes
e404dd26ab hash tests: count bit probability 2026-03-17 21:36:22 -04:00
Simon Cruanes
1d5b529730 test hash: add a per-bit distribution check 2026-03-16 13:49:26 +00:00
7 changed files with 133 additions and 115 deletions

View file

@ -24,6 +24,8 @@ format:
format-check:
@dune build $(DUNE_OPTS) @fmt --display=quiet
install-pre-commit-hook:
uvx pre-commit install --hook-type pre-push
VERSION=$(shell awk '/^version:/ {print $$2}' containers.opam)

View file

@ -1,7 +1,4 @@
(** Benchmarks for CCHash primitives.
Run with: dune exec benchs/bench_hash.exe -- [options]
See: dune exec benchs/bench_hash.exe -- --help
*)
[@@@warning "-5"]
@ -9,73 +6,47 @@
module B = Benchmark
let repeat = 3
(* --- data setup ---------------------------------------------------------- *)
let n_ints = 1_000
let ints =
Array.init n_ints (fun i -> i * 2654435761 (* knuth multiplicative *))
let short_str = String.make 16 'x'
let medium_str = String.make 64 'x'
let long_str = String.make 512 'x'
(* Strings of various lengths with distinct content *)
let strings_short = Array.init n_ints (fun i -> Printf.sprintf "%016d" i)
let strings_medium = Array.init n_ints (fun i -> Printf.sprintf "%064d" i)
(* --- benchmarks ---------------------------------------------------------- *)
let ints = Array.init n_ints (fun i -> i * 2654435761)
let bench_int_hash ~time () =
let r = ref 0 in
B.throughputN time ~repeat
[
"CCHash.int", (fun () -> Array.iter (fun x -> r := CCHash.int x) ints), ();
( "CCHash.int",
(fun () ->
Array.iter
(fun x -> ignore @@ Sys.opaque_identity (CCHash.int x))
ints),
() );
( "Hashtbl.hash (poly)",
(fun () -> Array.iter (fun x -> r := Hashtbl.hash x) ints),
(fun () ->
Array.iter
(fun x -> ignore @@ Sys.opaque_identity (Hashtbl.hash x))
ints),
() );
( "CCHash.int64",
(fun () ->
Array.iter (fun x -> r := CCHash.int64 (Int64.of_int x)) ints),
Array.iter
(fun x ->
ignore @@ Sys.opaque_identity (CCHash.int64 (Int64.of_int x)))
ints),
() );
];
ignore !r
let bench_string_hash ~time () =
let r = ref 0 in
B.throughputN time ~repeat
[
( "CCHash.string/16",
(fun () -> Array.iter (fun s -> r := CCHash.string s) strings_short),
() );
( "CCHash.string/64",
(fun () -> Array.iter (fun s -> r := CCHash.string s) strings_medium),
() );
"CCHash.string literal/16", (fun () -> r := CCHash.string short_str), ();
"CCHash.string literal/64", (fun () -> r := CCHash.string medium_str), ();
"CCHash.string literal/512", (fun () -> r := CCHash.string long_str), ();
( "Hashtbl.hash/16",
(fun () -> Array.iter (fun s -> r := Hashtbl.hash s) strings_short),
() );
];
ignore !r
]
let bench_combine64 ~time () =
let r = ref 0L in
B.throughputN time ~repeat
[
( "combine64 chain x5",
(fun () ->
Array.iter
(fun x ->
r :=
CCHash.(
combine64
(combine64
(combine64
(combine64
(combine64 seed (Int64.of_int x))
ignore
@@ Sys.opaque_identity
CCHash64.(
combine2
(combine2
(combine2
(combine2 (Int64.of_int x)
(Int64.of_int (x lxor 0xaaaa)))
(Int64.of_int (x + 1)))
(Int64.of_int (x * 3)))
@ -83,22 +54,25 @@ let bench_combine64 ~time () =
ints),
() );
( "CCHash.list int [1..5]",
(fun () -> r := Int64.of_int CCHash.(list int [ 1; 2; 3; 4; 5 ])),
(fun () ->
Array.iter
(fun x ->
ignore
@@ Sys.opaque_identity
(Int64.of_int
CCHash.(list int [ x + 1; x + 2; x + 3; x + 4; x + 5 ])))
ints),
() );
];
ignore !r
]
(* --- tree for run_global ------------------------------------------------- *)
let () =
B.Tree.add_global "hash"
B.Tree.(
"int"
@>> (fun () -> bench_int_hash ~time:2 ())
@> "string"
@>> (fun () -> bench_string_hash ~time:2 ())
@> "combine64"
@>> (fun () -> bench_combine64 ~time:2 ())
@> nil)
register @@ "hash"
@>>> [
"int" @> lazy (bench_int_hash ~time:2 ());
"combine64" @> lazy (bench_combine64 ~time:2 ());
])
let () = try B.Tree.run_global () with Arg.Help msg -> print_endline msg

View file

@ -10,6 +10,8 @@ let[@inline] finalize (s : state) : int = Hash_impl_.finalize s
type 'a t = state -> 'a -> state
(** [apply h x] feeds [x] to the hasher [h], starting from [seed], and
    finalizes the resulting state into a 64-bit hash. *)
let[@inline] apply h x = h seed x |> finalize64
(** [apply_int h x] hashes [x] with [h] starting from [seed], finalizes the
    state, and converts the 64-bit result to a native [int] with
    [Int64.to_int]. *)
let apply_int h x = h seed x |> finalize64 |> Int64.to_int
(** [int st n] mixes the integer [n] into the state [st] by delegating to
    [Hash_impl_.combine_int]. *)
let[@inline] int st n = Hash_impl_.combine_int st n
let[@inline] bool s b =
@ -77,6 +79,20 @@ let gen f s g =
in
aux s
(** [combine2 a b] mixes [a], then [b], into the initial [seed] state and
    finalizes the result into a single 64-bit hash. *)
let[@inline] combine2 a b =
  let open Hash_impl_ in
  let st = combine_i64 seed a in
  let st = combine_i64 st b in
  finalize64 st
(** [combine3 a b c] mixes [a], [b] and [c] (in that order) into the initial
    [seed] state and finalizes the result into a single 64-bit hash. *)
let combine3 a b c =
  let open Hash_impl_ in
  let with_a = combine_i64 seed a in
  let with_ab = combine_i64 with_a b in
  let with_abc = combine_i64 with_ab c in
  finalize64 with_abc
(** [combine4 a b c d] mixes [a], [b], [c] and [d] (in that order) into the
    initial [seed] state and finalizes the result into a single 64-bit hash. *)
let combine4 a b c d =
  let open Hash_impl_ in
  let with_ab = combine_i64 (combine_i64 seed a) b in
  let with_abcd = combine_i64 (combine_i64 with_ab c) d in
  finalize64 with_abcd
let array_comm f s a =
let hashes = Array.map (fun x -> finalize64 (f seed x)) a in
Array.sort Int64.compare hashes;

View file

@ -37,6 +37,12 @@ type 'a t = state -> 'a -> state
(** A hash combiner: takes the current state, mixes in a value, returns the
updated state. *)
val apply : 'a t -> 'a -> int64
(** [apply h x] hashes [x] with the combiner [h], starting from the initial
    seed state, and returns the finalized 64-bit hash. *)
val apply_int : 'a t -> 'a -> int
(** [apply_int h x] hashes [x] with the combiner [h], starting from the
    initial seed state, finalizes to 64 bits, and truncates the result to a
    native [int] via [Int64.to_int]. *)
val int : int t
val bool : bool t
val char : char t
@ -66,6 +72,10 @@ val map : ('a -> 'b) -> 'b t -> 'a t
val if_ : bool -> 'a t -> 'a t -> 'a t
(** [if_ b t e] uses hasher [t] when [b] is true, [e] otherwise. *)
val combine2 : int64 -> int64 -> int64
(** [combine2 a b] mixes [a] then [b] into the initial seed state and returns
    the finalized 64-bit hash. *)

val combine3 : int64 -> int64 -> int64 -> int64
(** [combine3 a b c] mixes [a], [b], [c] (in that order) into the initial seed
    state and returns the finalized 64-bit hash. *)

val combine4 : int64 -> int64 -> int64 -> int64 -> int64
(** [combine4 a b c d] mixes [a], [b], [c], [d] (in that order) into the
    initial seed state and returns the finalized 64-bit hash. *)
val poly : 'a t
(** Uses [Hashtbl.hash] internally. *)

View file

@ -19,7 +19,10 @@
#include <stdint.h>
#include <string.h>
#define HASH_MUL UINT64_C(0xd6e8feb86659fd93)
// from murmur2: https://chromium.googlesource.com/external/smhasher/+/c8e8bf81bc6041d6d836365a501a0a96830d2d81/MurmurHash2.cpp
#define HASH_MUL UINT64_C(0xc6a4a7935bd1e995)
// from murmur3: https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp#L81
#define FMIX_C1 UINT64_C(0xff51afd7ed558ccd)
#define FMIX_C2 UINT64_C(0xc4ceb9fe1a85ec53)
@ -31,6 +34,7 @@ static inline uint64_t hash_combine(uint64_t state, uint64_t chunk)
return state;
}
// fmix64 from murmur3
static inline uint64_t fmix64(uint64_t h)
{
h ^= h >> 33;

View file

@ -8,7 +8,14 @@
(name test_hash)
(modules test_hash)
(flags :standard -warn-error -a+8)
(libraries containers iter))
(libraries containers iter containers_xxhash))
(rule
(alias runtest)
(locks /ctest)
(package containers)
(action
(run ./test_hash.exe)))
(executable
(name test_random)

View file

@ -1,54 +1,59 @@
(* test hash functions a bit *)
module H = CCHash
module H64 = CCHash64
module XXH = Containers_xxhash
module Hist = struct
type t = {
tbl: (int, int) Hashtbl.t;
mutable n_samples: int;
}
let n = ref 100_000
let verbose = ref false
let create () : t = { tbl = Hashtbl.create 32; n_samples = 0 }
let check_bit_proba name hash_fn n_samples =
let rand = Random.State.make [| 42 |] in
let bits = Array.make 64 0 in
let add_n self x n =
Hashtbl.replace self.tbl x (n + try Hashtbl.find self.tbl x with _ -> 0);
self.n_samples <- n + self.n_samples
let pp out (self : t) : unit =
let max = Hashtbl.fold (fun k _ n -> max k n) self.tbl 0 in
let min = Hashtbl.fold (fun k _ n -> min k n) self.tbl max in
for i = min to max do
let n = try Hashtbl.find self.tbl i with _ -> 0 in
Format.fprintf out "[v=%-4d, n-inputs %-6d] %s@." i n
(String.make (int_of_float @@ ceil (log (float n))) '#')
let n_loops = 30 in
for _i = 1 to n_loops do
let base = Random.State.int64 rand Int64.(pred max_int) |> Int64.to_int in
for i = 1 to n_samples do
let h = hash_fn (base + i) in
for b = 0 to 63 do
if Int64.(logand h (shift_left 1L b)) <> 0L then
bits.(b) <- bits.(b) + 1
done
done
end
let reset_line = "\x1b[2K\r"
let t_int n1 n2 =
Printf.printf "test hash_int on %d--%d\n" n1 n2;
let count = Hashtbl.create 128 in
for i = n1 to n2 do
Printf.printf "%shash %d…%!" reset_line i;
let h = H.int i in
Hashtbl.replace count h (1 + CCHashtbl.get_or count h ~default:0);
if i mod 1024 * 1024 * 1024 = 0 then Gc.major ()
done;
Printf.printf "%s%!" reset_line;
(* reverse table *)
let by_count =
CCHashtbl.to_iter count
|> Iter.map (fun (_h, n) -> n)
|> Iter.count ~hash:H.int
in
let hist = Hist.create () in
by_count (fun (n, i) -> Hist.add_n hist n i);
Format.printf "histogram:@.%a@." Hist.pp hist;
(*assert (Hist.check_uniform hist);*)
()
let n_samples = n_loops * n_samples in
if !verbose then (
Format.printf "%s bit probabilities after %d samples:@." name n_samples;
for b = 0 to 63 do
let prob = float bits.(b) /. float n_samples in
Format.printf "bit %2d: %.4f@." b prob
done
);
let ok = ref true in
for b = 0 to 63 do
let prob = float bits.(b) /. float n_samples in
if prob < 0.48 || prob > 0.52 then (
Format.printf "FAIL: bit %d has proba %.4f (outside 0.48-0.52)@." b prob;
ok := false
)
done;
if !ok then
Format.printf "%s: OK@." name
else
();
!ok
let speclist =
[
"-v", Arg.Set verbose, " verbose mode";
"-n", Arg.Set_int n, " size of the range";
]
let () =
t_int 0 2_000_000;
t_int (-4_000_000) (-3_500_000);
()
Arg.parse (Arg.align speclist) (fun _ -> ()) "test_hash.exe";
let ok1 =
check_bit_proba "CCHash64" (fun i -> H64.finalize64 (H64.int H64.seed i)) !n
in
let ok2 = check_bit_proba "XXH" (fun i -> XXH.hash_int i) !n in
if (not ok1) || not ok2 then exit 1