hash mixer and combiner in C

This commit is contained in:
Simon Cruanes 2026-03-14 19:28:55 +00:00
parent a20eddfdd3
commit 7fdee4a17e
9 changed files with 351 additions and 139 deletions

View file

@ -7,130 +7,125 @@ type 'a t = 'a -> hash
type 'a iter = ('a -> unit) -> unit
type 'a gen = unit -> 'a option
(* FNV hashing
https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function
*)
let fnv_offset_basis = 0xcbf29ce484222325L
let fnv_prime = 0x100000001b3L
(** {2 Full-strength int64 API} *)
(* hash an integer *)
let hash_int_ n =
let h = ref fnv_offset_basis in
for k = 0 to 7 do
(h := Int64.(mul !h fnv_prime));
h := Int64.(logxor !h (of_int ((n lsr (k * 8)) land 0xff)))
done;
(* truncate back to int and remove sign *)
Int64.to_int !h land max_int
let seed : int64 = Hash_impl_.seed
let combine2 a b =
let h = ref fnv_offset_basis in
(* we only do one loop, where we mix bytes of [a] and [b], so as
to simplify control flow *)
for k = 0 to 7 do
(h := Int64.(mul !h fnv_prime));
(h := Int64.(logxor !h (of_int ((a lsr (k * 8)) land 0xff))));
(h := Int64.(mul !h fnv_prime));
h := Int64.(logxor !h (of_int ((b lsr (k * 8)) land 0xff)))
done;
Int64.to_int !h land max_int
let[@inline] combine64 (s : int64) (c : int64) : int64 = Hash_impl_.combine_i64 s c
let[@inline] combine f s x = combine2 s (f x)
let[@inline] finalize (s : int64) : int = Hash_impl_.finalize s
let[@inline] finalize_i64 (s : int64) : int64 = Hash_impl_.fmix64 s
(** {2 Deprecated int-state combinators} *)
let[@inline] combine2 a b =
Hash_impl_.(finalize (combine_i64 (Int64.of_int a) (Int64.of_int b)))
let[@inline] combine f s x =
Hash_impl_.(finalize (combine_i64 (Int64.of_int s) (Int64.of_int (f x))))
let combine3 a b c =
let h = ref fnv_offset_basis in
(* we only do one loop, where we mix bytes of [a] [b] and [c], so as
to simplify control flow *)
for k = 0 to 7 do
(h := Int64.(mul !h fnv_prime));
(h := Int64.(logxor !h (of_int ((a lsr (k * 8)) land 0xff))));
(h := Int64.(mul !h fnv_prime));
(h := Int64.(logxor !h (of_int ((b lsr (k * 8)) land 0xff))));
(h := Int64.(mul !h fnv_prime));
h := Int64.(logxor !h (of_int ((c lsr (k * 8)) land 0xff)))
done;
Int64.to_int !h land max_int
Hash_impl_.(
let s = combine_i64 (Int64.of_int a) (Int64.of_int b) in
finalize (combine_i64 s (Int64.of_int c)))
let combine4 a b c d =
let h = ref fnv_offset_basis in
for k = 0 to 7 do
(h := Int64.(mul !h fnv_prime));
(h := Int64.(logxor !h (of_int ((a lsr (k * 8)) land 0xff))));
(h := Int64.(mul !h fnv_prime));
(h := Int64.(logxor !h (of_int ((b lsr (k * 8)) land 0xff))));
(h := Int64.(mul !h fnv_prime));
(h := Int64.(logxor !h (of_int ((c lsr (k * 8)) land 0xff))));
(h := Int64.(mul !h fnv_prime));
h := Int64.(logxor !h (of_int ((d lsr (k * 8)) land 0xff)))
done;
Int64.to_int !h land max_int
Hash_impl_.(
let s = combine_i64 (Int64.of_int a) (Int64.of_int b) in
let s = combine_i64 s (Int64.of_int c) in
finalize (combine_i64 s (Int64.of_int d)))
let combine5 a b c d e = combine3 a b (combine3 c d e)
let combine6 a b c d e f = combine4 a b c (combine3 d e f)
let combine5 a b c d e =
Hash_impl_.(
let s = combine_i64 (Int64.of_int a) (Int64.of_int b) in
let s = combine_i64 s (Int64.of_int c) in
let s = combine_i64 s (Int64.of_int d) in
finalize (combine_i64 s (Int64.of_int e)))
(** {2 Combinators} *)
let combine6 a b c d e f =
Hash_impl_.(
let s = combine_i64 (Int64.of_int a) (Int64.of_int b) in
let s = combine_i64 s (Int64.of_int c) in
let s = combine_i64 s (Int64.of_int d) in
let s = combine_i64 s (Int64.of_int e) in
finalize (combine_i64 s (Int64.of_int f)))
(** {2 Primitive hashers} *)
let const h _ = h
let const0 _ = 0
let int = hash_int_
let int n = Hash_impl_.(finalize (combine_i64 seed (Int64.of_int n)))
let bool b =
hash_int_
int
(if b then
1
else
2)
let char x = hash_int_ (Char.code x)
let char x = Hash_impl_.(finalize (combine_char seed (Char.code x)))
(* hash an integer *)
let int64 n : int =
let h = ref fnv_offset_basis in
for k = 0 to 7 do
(h := Int64.(mul !h fnv_prime));
h := Int64.(logxor !h (logand (shift_right_logical n (k * 8)) 0xffL))
done;
(* truncate back to int and remove sign *)
Int64.to_int !h land max_int
let int64 (n : int64) : int = Hash_impl_.(finalize (combine_i64 seed n))
let int32 (x : int32) : int = Hash_impl_.(finalize (combine_i32 seed x))
let int32 (x : int32) = int64 (Int64.of_int32 x)
let nativeint (x : nativeint) = int64 (Int64.of_nativeint x)
(* do not hash more than 128 bytes in strings/bytes *)
let max_len_b_ = 128
let bytes (x : bytes) =
let h = ref fnv_offset_basis in
for i = 0 to min max_len_b_ (Bytes.length x - 1) do
(h := Int64.(mul !h fnv_prime));
let byte = Char.code (Bytes.unsafe_get x i) in
h := Int64.(logxor !h (of_int byte))
done;
Int64.to_int !h land max_int
Hash_impl_.(finalize (combine_string seed (Bytes.unsafe_to_string x)))
let string (x : string) = bytes (Bytes.unsafe_of_string x)
let string (x : string) = Hash_impl_.(finalize (combine_string seed x))
(* [slice x i len] hashes the [len] characters of [x] starting at index [i],
   i.e. [x.[i]], ..., [x.[i + len - 1]], feeding each character code into the
   hash state and finalizing once the end of the slice is reached.

   Characters are read with the bounds-checked [String.get]: an invalid
   [i]/[len] pair (out-of-range start, slice running past the end of [x], or a
   negative [len]) raises [Invalid_argument] instead of reading memory outside
   of the string, as an unchecked read would do. *)
let slice x i len =
  let j = i + len in
  let rec aux k s =
    if k = j then
      Hash_impl_.finalize s
    else
      aux (k + 1) (Hash_impl_.combine_char s (Char.code (String.get x k)))
  in
  aux i Hash_impl_.seed
let opt f = function
| None -> 42
| Some x -> combine2 43 (f x)
| Some x ->
Hash_impl_.(finalize (combine_i64 (combine_i64 seed 43L) (Int64.of_int (f x))))
let list f l = List.fold_left (combine f) 0x42 l
let array f l = Array.fold_left (combine f) 0x42 l
let pair f g (x, y) = combine2 (f x) (g y)
let triple f g h (x, y, z) = combine2 (combine2 (f x) (g y)) (h z)
let list f l =
let s =
List.fold_left
(fun s x -> Hash_impl_.combine_i64 s (Int64.of_int (f x)))
Hash_impl_.seed l
in
Hash_impl_.finalize s
let array f a =
let s =
Array.fold_left
(fun s x -> Hash_impl_.combine_i64 s (Int64.of_int (f x)))
Hash_impl_.seed a
in
Hash_impl_.finalize s
let pair f g (x, y) =
Hash_impl_.(
finalize (combine_i64 (combine_i64 seed (Int64.of_int (f x))) (Int64.of_int (g y))))
let triple f g h (x, y, z) =
Hash_impl_.(
let s = combine_i64 seed (Int64.of_int (f x)) in
let s = combine_i64 s (Int64.of_int (g y)) in
finalize (combine_i64 s (Int64.of_int (h z))))
let quad f g h i (x, y, z, w) =
combine2 (combine2 (f x) (g y)) (combine2 (h z) (i w))
Hash_impl_.(
let s = combine_i64 seed (Int64.of_int (f x)) in
let s = combine_i64 s (Int64.of_int (g y)) in
let s = combine_i64 s (Int64.of_int (h z)) in
finalize (combine_i64 s (Int64.of_int (i w))))
let map f h x = h (f x)
@ -144,8 +139,12 @@ let poly x = Hashtbl.hash x
let array_of_hashes_ arr =
Array.sort CCInt.compare arr;
(* sort the hashes, so their order does not matter *)
Array.fold_left combine2 0x42 arr
let s =
Array.fold_left
(fun s h -> Hash_impl_.combine_i64 s (Int64.of_int h))
Hash_impl_.seed arr
in
Hash_impl_.finalize s
let array_comm f a =
let arr = Array.init (Array.length a) (fun i -> f a.(i)) in
@ -157,19 +156,19 @@ let list_comm f l =
array_of_hashes_ arr
let iter f seq =
let h = ref 0x43 in
seq (fun x -> h := combine f !h x);
!h
let s = ref Hash_impl_.seed in
seq (fun x -> s := Hash_impl_.combine_i64 !s (Int64.of_int (f x)));
Hash_impl_.finalize !s
let seq f seq =
let h = ref 0x43 in
Seq.iter (fun x -> h := combine f !h x) seq;
!h
let seq f sq =
let s = ref Hash_impl_.seed in
Seq.iter (fun x -> s := Hash_impl_.combine_i64 !s (Int64.of_int (f x))) sq;
Hash_impl_.finalize !s
let gen f g =
let rec aux s =
match g () with
| None -> s
| Some x -> aux (combine2 s (f x))
| None -> Hash_impl_.finalize s
| Some x -> aux (Hash_impl_.combine_i64 s (Int64.of_int (f x)))
in
aux 0x42
aux Hash_impl_.seed

View file

@ -1,5 +1,3 @@
(* This file is free software, part of containers. See file "license" for more details. *)
(** Hash combinators
The API of this module is stable as per semantic versioning, like the
@ -7,8 +5,20 @@
can change and should not be relied on (i.e. hashing a value always
returns the same integer {b within a run of a program}, not
across versions of OCaml and Containers).
{b Implementation}: xorshift+multiply combiner with fmix64 (Murmur3) finalizer,
via C stubs. Unboxed in native code, boxed in bytecode.
*)
(* TODO: for 4.xx:
{[type state = int64
val seed : state
type 'a t = state -> 'a -> state
val finalize : state -> int64
]}
*)
(** {2 Definitions} *)
type hash = int
@ -34,8 +44,7 @@ val int64 : int64 t
val nativeint : nativeint t
val slice : string -> int -> int t
(** [slice s i len state] hashes the slice [i, …, i+len-1] of [s]
into [state]. *)
(** [slice s i len] hashes the slice [s[i .. i+len-1]]. *)
val bytes : bytes t
(** Hash a byte array.
@ -79,17 +88,47 @@ val array_comm : 'a t -> 'a array t
will have the same hash.
@since 1.0 *)
(** {2 Base hash combinators} *)
(** {2 Full-strength int64 API} *)
val seed : int64
(** Initial hash state. *)
val combine64 : int64 -> int64 -> int64
(** [combine64 state chunk] mixes [chunk] into [state] using the
xorshift+multiply combiner. Suitable for building streaming hashers
with full 64-bit state. Finalize with {!finalize} or {!finalize_i64}. *)
val finalize : int64 -> int
(** [finalize state] applies fmix64 (Murmur3 finalizer) and returns a
non-negative [int] (strips sign bit). *)
val finalize_i64 : int64 -> int64
(** [finalize_i64 state] applies fmix64 and returns the full 64-bit result.
The result may be negative as a signed [int64]. *)
(** {2 Deprecated int-state combinators}
These thread state as [int] (63 bits on 64-bit systems), which is lossy.
Prefer building a pipeline with {!seed}, {!combine64}, and {!finalize}. *)
val combine : 'a t -> hash -> 'a -> hash
[@@deprecated "lossy (63-bit state); use combine64 with int64 state"]
val combine2 : hash -> hash -> hash
[@@deprecated "lossy (63-bit state); use combine64 with int64 state"]
val combine3 : hash -> hash -> hash -> hash
[@@deprecated "lossy (63-bit state); use combine64 with int64 state"]
val combine4 : hash -> hash -> hash -> hash -> hash
[@@deprecated "lossy (63-bit state); use combine64 with int64 state"]
val combine5 : hash -> hash -> hash -> hash -> hash -> hash
[@@deprecated "lossy (63-bit state); use combine64 with int64 state"]
(** @since 2.1 *)
val combine6 : hash -> hash -> hash -> hash -> hash -> hash -> hash
[@@deprecated "lossy (63-bit state); use combine64 with int64 state"]
(** @since 2.1 *)
(** {2 Iterators} *)

View file

@ -4,20 +4,8 @@ include Int
type 'a iter = ('a -> unit) -> unit
(* use FNV:
https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function *)
let hash (n : int) : int =
let offset_basis = 0xcbf29ce484222325L in
let prime = 0x100000001b3L in
let h = ref offset_basis in
for k = 0 to 7 do
(h := Int64.(mul !h prime));
(* h := h xor (k-th byte of n) *)
h := Int64.(logxor !h (of_int ((n lsr (k * 8)) land 0xff)))
done;
(* truncate back to int and remove sign *)
Int64.to_int !h land max_int
Hash_impl_.(finalize (combine_i64 seed (Int64.of_int n)))
let range i j yield =
let rec up i j yield =

View file

@ -8,11 +8,7 @@ let min : t -> t -> t = Stdlib.min
let max : t -> t -> t = Stdlib.max
[@@@endif]
[@@@iflt 5.1]
let hash x = Stdlib.abs (to_int x)
[@@@endif]
let hash (x : t) : int = Hash_impl_.(finalize (combine_i32 seed x))
let sign i = compare i zero

View file

@ -11,21 +11,9 @@ let max : t -> t -> t = Stdlib.max
let sign i = compare i zero
(* use FNV:
https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function *)
let hash_to_int64 (n : t) =
let offset_basis = 0xcbf29ce484222325L in
let prime = 0x100000001b3L in
let hash_to_int64 (n : t) : t = Hash_impl_.(fmix64 (combine_i64 seed n))
let h = ref offset_basis in
for k = 0 to 7 do
h := mul !h prime;
(* h := h xor (k-th byte of n) *)
h := logxor !h (logand (shift_right n (k * 8)) 0xffL)
done;
logand !h max_int
let[@inline] hash (n : t) : int = to_int (hash_to_int64 n) land Stdlib.max_int
let[@inline] hash (n : t) : int = Hash_impl_.(finalize (combine_i64 seed n))
(* see {!CCInt.popcount} for more details *)
let[@inline] popcount (b : t) : int =

View file

@ -11,7 +11,7 @@ include String
let compare_int (a : int) b = Stdlib.compare a b
let compare = String.compare
let hash s = Hashtbl.hash s
let hash (s : string) : int = Hash_impl_.(finalize (combine_string seed s))
let length = String.length
let is_empty s = equal s ""

49
src/core/Hash_impl_.ml Normal file
View file

@ -0,0 +1,49 @@
(* This file is free software, part of containers. See file "license" for more details. *)
(** Internal hash implementation.
Combiner: [state ^= chunk; state ^= state >> 32; state *= 0xd6e8feb86659fd93]
Finalizer: fmix64 (Murmur3).
Multiplicative constant 0xd6e8feb86659fd93 (rrmxmx family, Pelle Evensen, 2018):
https://mostlymangling.blogspot.com/2018/07/on-mixing-functions-in-fast-hashing.html
Also evaluated in Chris Wellons' hash-prospector:
https://github.com/skeeto/hash-prospector
fmix64 constants (Murmur3, Austin Appleby):
https://github.com/aappleby/smhasher
Not part of the public API; use {!CCHash} instead. *)
(** Initial hash state (64-bit golden-ratio constant). This is the value to
    start from before threading a state through the [combine_*] functions
    below. *)
let seed : int64 = 0x9e3779b97f4a7c15L
external combine_i64 : (int64[@unboxed]) -> (int64[@unboxed]) -> (int64[@unboxed])
= "caml_cc_hash_combine_i64_byte" "caml_cc_hash_combine_i64"
[@@noalloc]
(** [combine_i64 state chunk] mixes the 64-bit [chunk] into [state] and
    returns the new state (combiner described in the header of this file).
    The first stub name is the bytecode entry point; the second is the native
    one, which receives and returns unboxed [int64] values. *)
external combine_i32 : (int64[@unboxed]) -> (int32[@unboxed]) -> (int64[@unboxed])
= "caml_cc_hash_combine_i32_byte" "caml_cc_hash_combine_i32"
[@@noalloc]
(** [combine_i32 state chunk] mixes the 32-bit [chunk] into [state] and
    returns the new state. The C stub widens [chunk] to 64 bits without sign
    extension (it goes through [uint32_t]) before mixing. *)
external combine_char : (int64[@unboxed]) -> (int[@untagged]) -> (int64[@unboxed])
= "caml_cc_hash_combine_char_byte" "caml_cc_hash_combine_char"
[@@noalloc]
(** [combine_char state c] mixes character code [c] into [state] and returns
    the new state. [c] is expected to be a [Char.code] result; the C stub
    truncates it to an [unsigned char], so only its low 8 bits are used. *)
external combine_string : (int64[@unboxed]) -> string -> (int64[@unboxed])
= "caml_cc_hash_combine_string_byte" "caml_cc_hash_combine_string"
[@@noalloc]
(** [combine_string state s] mixes all bytes of [s] into [state], consuming
    them in 8-byte chunks, and returns the new state. The string is passed as
    a regular (boxed) OCaml value; only the state is unboxed. *)
external fmix64 : (int64[@unboxed]) -> (int64[@unboxed])
= "caml_cc_hash_fmix64_byte" "caml_cc_hash_fmix64"
[@@noalloc]
(** [fmix64 state] applies the Murmur3 finalizer to [state] and returns the
    full 64-bit result. Since all 64 bits are kept, the result may be negative
    when read as a signed [int64]. *)
external finalize : (int64[@unboxed]) -> (int[@untagged])
= "caml_cc_hash_finalize_byte" "caml_cc_hash_finalize"
[@@noalloc]
(** [finalize state] applies fmix64 to [state] and returns the result as a
    non-negative [int]: the C stub masks the mixed value with the largest
    OCaml integer ([Max_long]). *)

View file

@ -6,7 +6,12 @@
(action
(run %{project_root}/src/core/cpp/cpp.exe %{input-file})))
(flags :standard -nolabels -open CCMonomorphic)
(libraries either containers.monomorphic containers.domain))
(libraries either containers.monomorphic containers.domain)
(private_modules Hash_impl_)
(foreign_stubs
(language c)
(flags :standard -O2)
(names hash_stubs)))
(ocamllex
(modules CCSexp_lex))

148
src/core/hash_stubs.c Normal file
View file

@ -0,0 +1,148 @@
/* This file is free software, part of containers. See file "license" for more details. */
/* Hash implementation: xorshift+multiply combiner with fmix64 finalizer.
Combiner: state ^= chunk; state ^= state >> 32; state *= 0xd6e8feb86659fd93
Finalizer (fmix64, Murmur3): three rounds of xorshift-multiply.
Multiplicative constant 0xd6e8feb86659fd93 (rrmxmx family, Pelle Evensen, 2018):
https://mostlymangling.blogspot.com/2018/07/on-mixing-functions-in-fast-hashing.html
Also evaluated in Chris Wellons' hash-prospector:
https://github.com/skeeto/hash-prospector
fmix64 constants 0xff51afd7ed558ccd / 0xc4ceb9fe1a85ec53 (Murmur3, Austin Appleby):
https://github.com/aappleby/smhasher
*/
#include <caml/mlvalues.h>
#include <caml/alloc.h>
#include <caml/memory.h>
#include <stdint.h>
#include <string.h>
/* Multiplier applied by hash_combine after its xorshift step. */
#define HASH_MUL UINT64_C(0xd6e8feb86659fd93)
/* Multipliers of the first and second multiply rounds of fmix64
   (Murmur3 finalizer constants). */
#define FMIX_C1 UINT64_C(0xff51afd7ed558ccd)
#define FMIX_C2 UINT64_C(0xc4ceb9fe1a85ec53)
/* Mix one 64-bit chunk into the running state: xor the chunk in, fold the
   high 32 bits onto the low 32 bits, then multiply by HASH_MUL (arithmetic is
   modulo 2^64). Returns the new state. */
static inline uint64_t hash_combine(uint64_t state, uint64_t chunk)
{
  const uint64_t merged = state ^ chunk;
  const uint64_t folded = merged ^ (merged >> 32);
  return folded * HASH_MUL;
}
/* Murmur3 64-bit finalizer: two xorshift-by-33 + multiply rounds (FMIX_C1,
   then FMIX_C2) followed by a last xorshift-by-33. */
static inline uint64_t fmix64(uint64_t h)
{
  uint64_t x = h;
  x = (x ^ (x >> 33)) * FMIX_C1;
  x = (x ^ (x >> 33)) * FMIX_C2;
  return x ^ (x >> 33);
}
/* --- combine_i64 --------------------------------------------------------- */
/* Native-code entry point: state and chunk arrive as unboxed int64 values and
   the new state is returned unboxed. */
CAMLprim int64_t caml_cc_hash_combine_i64(int64_t state, int64_t chunk)
{
  const uint64_t acc = (uint64_t)state;
  const uint64_t word = (uint64_t)chunk;
  const uint64_t mixed = hash_combine(acc, word);
  return (int64_t)mixed;
}
/* Bytecode entry point: unboxes both int64 arguments, delegates to the native
   entry point, and boxes the resulting state. */
CAMLprim value caml_cc_hash_combine_i64_byte(value v_state, value v_chunk)
{
  CAMLparam2(v_state, v_chunk);
  int64_t mixed =
    caml_cc_hash_combine_i64(Int64_val(v_state), Int64_val(v_chunk));
  CAMLreturn(caml_copy_int64(mixed));
}
/* --- combine_i32 --------------------------------------------------------- */
/* Native-code entry point: mixes a 32-bit chunk into the state. The chunk is
   first reinterpreted as uint32_t so that it is widened to 64 bits without
   sign extension. */
CAMLprim int64_t caml_cc_hash_combine_i32(int64_t state, int32_t chunk)
{
  const uint32_t low_word = (uint32_t)chunk;
  const uint64_t mixed = hash_combine((uint64_t)state, (uint64_t)low_word);
  return (int64_t)mixed;
}
/* Bytecode entry point: unboxes the int64 state and the int32 chunk,
   delegates to the native entry point, and boxes the resulting state. */
CAMLprim value caml_cc_hash_combine_i32_byte(value v_state, value v_chunk)
{
  CAMLparam2(v_state, v_chunk);
  int64_t mixed =
    caml_cc_hash_combine_i32(Int64_val(v_state), Int32_val(v_chunk));
  CAMLreturn(caml_copy_int64(mixed));
}
/* --- combine_char -------------------------------------------------------- */
/* Native-code entry point. [c] is passed as an untagged int (Char.code); it is
   truncated to an unsigned char, so only its low 8 bits are mixed in. */
CAMLprim int64_t caml_cc_hash_combine_char(int64_t state, intnat c)
{
  const unsigned char byte = (unsigned char)c;
  const uint64_t mixed = hash_combine((uint64_t)state, (uint64_t)byte);
  return (int64_t)mixed;
}
/* Bytecode entry point: unboxes the int64 state, untags the character code,
   delegates to the native entry point, and boxes the resulting state. */
CAMLprim value caml_cc_hash_combine_char_byte(value v_state, value v_c)
{
  CAMLparam2(v_state, v_c);
  int64_t mixed =
    caml_cc_hash_combine_char(Int64_val(v_state), Long_val(v_c));
  CAMLreturn(caml_copy_int64(mixed));
}
/* --- combine_string ------------------------------------------------------ */
/* Hashes all bytes of [str] into [state] and returns the new state.
   [str] is a regular OCaml value; [state] is an unboxed int64.

   - Bytes are consumed as 8-byte words loaded with memcpy (no alignment
     requirement; the word value follows the host byte order).
   - A trailing partial word is zero-padded before being mixed.
   - The string length is mixed in as a final chunk. Without it, two strings
     that differ only by trailing NUL bytes inside the last partial word
     (e.g. "a" and "a\0") would produce the exact same sequence of chunks and
     therefore always collide.

   This function does not allocate on the OCaml heap. */
CAMLprim int64_t caml_cc_hash_combine_string(int64_t state, value str)
{
  const char *data = String_val(str);
  const mlsize_t len = caml_string_length(str);
  uint64_t s = (uint64_t)state;
  mlsize_t i = 0;

  /* full 8-byte words; [len - i] cannot wrap around since i <= len */
  while (len - i >= 8) {
    uint64_t chunk;
    memcpy(&chunk, data + i, 8);
    s = hash_combine(s, chunk);
    i += 8;
  }

  /* remaining 1..7 bytes, zero-padded to a full word */
  if (i < len) {
    uint64_t chunk = 0;
    memcpy(&chunk, data + i, len - i);
    s = hash_combine(s, chunk);
  }

  /* disambiguate strings whose zero-padded chunks are identical */
  s = hash_combine(s, (uint64_t)len);

  return (int64_t)s;
}
/* Bytecode entry point: unboxes the int64 state, hashes [str] through the
   native entry point, and boxes the resulting state. */
CAMLprim value caml_cc_hash_combine_string_byte(value v_state, value str)
{
  CAMLparam2(v_state, str);
  CAMLlocal1(v_result);
  v_result =
    caml_copy_int64(caml_cc_hash_combine_string(Int64_val(v_state), str));
  CAMLreturn(v_result);
}
/* --- fmix64 -------------------------------------------------------------- */
/* Native-code entry point: applies fmix64 and returns all 64 bits of the
   result, which may therefore be "negative" when read as a signed int64. */
CAMLprim int64_t caml_cc_hash_fmix64(int64_t state)
{
  const uint64_t mixed = fmix64((uint64_t)state);
  return (int64_t)mixed;
}
/* Bytecode entry point: unboxes the int64 state, delegates to the native
   entry point, and boxes the full 64-bit result. */
CAMLprim value caml_cc_hash_fmix64_byte(value v_state)
{
  CAMLparam1(v_state);
  int64_t mixed = caml_cc_hash_fmix64(Int64_val(v_state));
  CAMLreturn(caml_copy_int64(mixed));
}
/* --- finalize ------------------------------------------------------------ */
/* Native-code entry point: applies fmix64, then keeps only the bits that fit
   in Max_long so the result is a non-negative OCaml int (returned untagged). */
CAMLprim intnat caml_cc_hash_finalize(int64_t state)
{
  const uint64_t positive_mask = (uint64_t)Max_long;
  const uint64_t mixed = fmix64((uint64_t)state);
  return (intnat)(mixed & positive_mask);
}
/* Bytecode entry point: unboxes the int64 state, delegates to the native
   entry point, and tags the non-negative result as an OCaml int. */
CAMLprim value caml_cc_hash_finalize_byte(value v_state)
{
  CAMLparam1(v_state);
  intnat hashed = caml_cc_hash_finalize(Int64_val(v_state));
  CAMLreturn(Val_long(hashed));
}