From e52574c5fb7dd063c579b3a8c16473d8b903fb3d Mon Sep 17 00:00:00 2001 From: Simon Cruanes Date: Sat, 14 Mar 2026 03:34:24 +0000 Subject: [PATCH] revamp xxhash API --- src/xxhash/containers_xxhash.ml | 41 ++++++++++----- src/xxhash/containers_xxhash.mli | 88 ++++++++++++++++++++++--------- src/xxhash/stubs.c | 89 ++++++++++++++------------------ 3 files changed, 130 insertions(+), 88 deletions(-) diff --git a/src/xxhash/containers_xxhash.ml b/src/xxhash/containers_xxhash.ml index 486bcf7b..f6e694b7 100644 --- a/src/xxhash/containers_xxhash.ml +++ b/src/xxhash/containers_xxhash.ml @@ -1,22 +1,37 @@ -external hash_string_aux : string -> (int64[@unboxed]) -> (int64[@unboxed]) - = "caml_cc_xxhash_string_byte" "caml_cc_xxhash_string" +type state = int64 + +let seed : state = 0L + +external mix_int64 : (state[@unboxed]) -> (int64[@unboxed]) -> (state[@unboxed]) + = "caml_cc_xxhash_mix_int64_byte" "caml_cc_xxhash_mix_int64" [@@noalloc] -let[@inline] hash_string ?(seed = 0L) s = hash_string_aux s seed - -external hash_int64 : - (int64[@unboxed]) -> (int64[@unboxed]) -> (int64[@unboxed]) - = "caml_cc_xxhash_int64_byte" "caml_cc_xxhash_int64" +external mix_int : (state[@unboxed]) -> (int[@untagged]) -> (state[@unboxed]) + = "caml_cc_xxhash_mix_int_byte" "caml_cc_xxhash_mix_int" [@@noalloc] -external hash_int : (int[@untagged]) -> (int[@untagged]) -> (int[@untagged]) - = "caml_cc_xxhash_int_byte" "caml_cc_xxhash_int" +external mix_int32 : (state[@unboxed]) -> (int32[@unboxed]) -> (state[@unboxed]) + = "caml_cc_xxhash_mix_int32_byte" "caml_cc_xxhash_mix_int32" [@@noalloc] -external mix64 : (int64[@unboxed]) -> (int64[@unboxed]) -> (int64[@unboxed]) - = "caml_cc_xxhash_mix64_byte" "caml_cc_xxhash_mix64" +let[@inline] mix_bool h b = mix_int h (Bool.to_int b) +let[@inline] mix_char h c = mix_int h (Char.code c) +let[@inline] mix_float h f = mix_int64 h (Int64.bits_of_float f) + +external mix_string_aux : (state[@unboxed]) -> string -> (state[@unboxed]) + = 
"caml_cc_xxhash_mix_string_byte" "caml_cc_xxhash_mix_string" [@@noalloc] -external finalize64 : (int64[@unboxed]) -> (int64[@unboxed]) - = "caml_cc_xxhash_finalize64_byte" "caml_cc_xxhash_finalize64" +let[@inline] mix_string h s = mix_string_aux h s + +external finalize : (state[@unboxed]) -> (int64[@unboxed]) + = "caml_cc_xxhash_finalize_byte" "caml_cc_xxhash_finalize" [@@noalloc] + +let[@inline] hash_string ?(seed = seed) s = finalize (mix_string seed s) +let[@inline] hash_int64 ?(seed = seed) v = finalize (mix_int64 seed v) +let[@inline] hash_int ?(seed = seed) v = finalize (mix_int seed v) +let[@inline] hash_int32 ?(seed = seed) v = finalize (mix_int32 seed v) +let[@inline] hash_bool ?(seed = seed) b = finalize (mix_bool seed b) +let[@inline] hash_char ?(seed = seed) c = finalize (mix_char seed c) +let[@inline] hash_float ?(seed = seed) f = finalize (mix_float seed f) diff --git a/src/xxhash/containers_xxhash.mli b/src/xxhash/containers_xxhash.mli index e32604bd..5a040c6b 100644 --- a/src/xxhash/containers_xxhash.mli +++ b/src/xxhash/containers_xxhash.mli @@ -3,36 +3,76 @@ Fast non-cryptographic hash functions from {{:https://github.com/Cyan4973/xxHash} xxHash}. - String hashing uses XXH3_64bits (modern, fastest). - Integer hashing delegates to the string hasher via a stack-allocated buffer. - The mixer and finalizer use the XXH64 primitive. + Hashing uses XXH64. To hash a single value use the [hash_foo] convenience + functions. To combine several values, fold with [mix_*] and call + {!finalize}: + + {[ + let h = + seed + |> fun h -> mix_string h "hello" + |> fun h -> mix_int h 42 + |> finalize + ]} *) -val hash_string : ?seed:int64 -> string -> int64 -(** [hash_string ?seed s] hashes string [s] with optional [seed] (default [0L]) - using XXH3_64bits_withSeed. *) +type state = private int64 +(** Accumulated hash state. Represented as [int64] so the compiler can unbox + it at call sites. 
The type is [private]: a state can be coerced to [int64], but this interface only produces states through {!seed} and the [mix_*] functions. 
*) -external hash_int64 : - (int64[@unboxed]) -> (int64[@unboxed]) -> (int64[@unboxed]) - = "caml_cc_xxhash_int64_byte" "caml_cc_xxhash_int64" -[@@noalloc] -(** [hash_int64 v seed] hashes [v] with [seed] using XXH3_64bits_withSeed. - Noalloc and unboxed in native code. *) +val seed : state +(** Initial state. Equal to [0L]. *) -external hash_int : (int[@untagged]) -> (int[@untagged]) -> (int[@untagged]) - = "caml_cc_xxhash_int_byte" "caml_cc_xxhash_int" +external mix_int64 : (state[@unboxed]) -> (int64[@unboxed]) -> (state[@unboxed]) + = "caml_cc_xxhash_mix_int64_byte" "caml_cc_xxhash_mix_int64" [@@noalloc] -(** [hash_int v seed] hashes [v] (an OCaml int) with [seed]. - Noalloc and untagged in native code. *) +(** Mix an [int64] value into the state: the new state is XXH64 over the 8 bytes of the value, seeded with the current state. Noalloc and unboxed in native code. *) -external mix64 : (int64[@unboxed]) -> (int64[@unboxed]) -> (int64[@unboxed]) - = "caml_cc_xxhash_mix64_byte" "caml_cc_xxhash_mix64" +external mix_int : (state[@unboxed]) -> (int[@untagged]) -> (state[@unboxed]) + = "caml_cc_xxhash_mix_int_byte" "caml_cc_xxhash_mix_int" [@@noalloc] -(** [mix64 a b] mixes two int64 values using XXH64: [XXH64(&a, 8, b)]. - Suitable for combining hash values. Noalloc and unboxed in native code. *) +(** Mix an [int] value into the state. The value is widened to 64 bits first, so [mix_int h n] equals [mix_int64 h (Int64.of_int n)]. Noalloc and untagged in native code. *) -external finalize64 : (int64[@unboxed]) -> (int64[@unboxed]) - = "caml_cc_xxhash_finalize64_byte" "caml_cc_xxhash_finalize64" +external mix_int32 : (state[@unboxed]) -> (int32[@unboxed]) -> (state[@unboxed]) + = "caml_cc_xxhash_mix_int32_byte" "caml_cc_xxhash_mix_int32" [@@noalloc] -(** [finalize64 h] finalizes/avalanches a hash value using XXH64: [XXH64(&h, 8, 0)]. - Noalloc and unboxed in native code. *) +(** Mix an [int32] value into the state. The value is sign-extended to 64 bits first, so [mix_int32 h n] equals [mix_int64 h (Int64.of_int32 n)]. Noalloc and unboxed in native code. *) + +val mix_bool : state -> bool -> state +(** Mix a [bool] into the state. [mix_bool h b] is [mix_int h (Bool.to_int b)]. *) + +val mix_char : state -> char -> state +(** Mix a [char] into the state. [mix_char h c] is [mix_int h (Char.code c)]. 
*) + +val mix_float : state -> float -> state +(** Mix a [float] into the state via [Int64.bits_of_float]. The bit pattern is what gets hashed, so [0.] and [-0.] mix to different states even though they compare equal. *) + +val mix_string : state -> string -> state +(** Mix a [string] into the state using XXH64 over the bytes of the string, seeded with the current state. *) + +external finalize : (state[@unboxed]) -> (int64[@unboxed]) + = "caml_cc_xxhash_finalize_byte" "caml_cc_xxhash_finalize" +[@@noalloc] +(** Finalise the accumulated state into a 64-bit hash, computed as XXH64 over the 8 bytes of the state with seed 0. Noalloc and unboxed in + native code. *) + +val hash_string : ?seed:state -> string -> int64 +(** [hash_string ?seed s] is [finalize (mix_string seed s)]; [seed] defaults to {!seed}. *) + +val hash_int64 : ?seed:state -> int64 -> int64 +(** [hash_int64 ?seed v] is [finalize (mix_int64 seed v)]; [seed] defaults to {!seed}. *) + +val hash_int : ?seed:state -> int -> int64 +(** [hash_int ?seed v] is [finalize (mix_int seed v)]; [seed] defaults to {!seed}. *) + +val hash_int32 : ?seed:state -> int32 -> int64 +(** [hash_int32 ?seed v] is [finalize (mix_int32 seed v)]; [seed] defaults to {!seed}. *) + +val hash_bool : ?seed:state -> bool -> int64 +(** [hash_bool ?seed b] is [finalize (mix_bool seed b)]; [seed] defaults to {!seed}. *) + +val hash_char : ?seed:state -> char -> int64 +(** [hash_char ?seed c] is [finalize (mix_char seed c)]; [seed] defaults to {!seed}. *) + +val hash_float : ?seed:state -> float -> int64 +(** [hash_float ?seed f] is [finalize (mix_float seed f)]; [seed] defaults to {!seed}. 
*) diff --git a/src/xxhash/stubs.c b/src/xxhash/stubs.c index 641bd18f..339b3806 100644 --- a/src/xxhash/stubs.c +++ b/src/xxhash/stubs.c @@ -8,68 +8,55 @@ #include #include -/* hash_string: native signature: (value, int64_t) -> int64_t - string is passed as OCaml value (can't be unboxed), seed is unboxed int64 */ -CAMLprim int64_t caml_cc_xxhash_string(value v_s, int64_t seed) { - const char *s = String_val(v_s); - size_t len = caml_string_length(v_s); - return (int64_t)XXH64(s, len, (XXH64_hash_t)seed); +/* mix_int64: (int64_t state, int64_t value) -> int64_t */ +CAMLprim int64_t caml_cc_xxhash_mix_int64(int64_t state, int64_t value) { + return (int64_t)XXH64(&value, sizeof(value), (XXH64_hash_t)state); } - -CAMLprim value caml_cc_xxhash_string_byte(value v_s, value v_seed) { - CAMLparam2(v_s, v_seed); - int64_t seed = Int64_val(v_seed); - const char *s = String_val(v_s); - size_t len = caml_string_length(v_s); - int64_t result = (int64_t)XXH64(s, len, (XXH64_hash_t)seed); +CAMLprim value caml_cc_xxhash_mix_int64_byte(value v_state, value v_value) { + CAMLparam2(v_state, v_value); + int64_t result = caml_cc_xxhash_mix_int64(Int64_val(v_state), Int64_val(v_value)); CAMLreturn(caml_copy_int64(result)); } -/* hash_int64: unboxed (int64_t, int64_t) -> int64_t */ -CAMLprim int64_t caml_cc_xxhash_int64(int64_t v, int64_t seed) { - return (int64_t)XXH64(&v, sizeof(v), (XXH64_hash_t)seed); +/* mix_int: (int64_t state, intnat value) -> int64_t */ +CAMLprim int64_t caml_cc_xxhash_mix_int(int64_t state, intnat value) { + int64_t v = (int64_t)value; + return (int64_t)XXH64(&v, sizeof(v), (XXH64_hash_t)state); } - -CAMLprim value caml_cc_xxhash_int64_byte(value v_v, value v_seed) { - CAMLparam2(v_v, v_seed); - int64_t v = Int64_val(v_v); - int64_t seed = Int64_val(v_seed); - int64_t result = caml_cc_xxhash_int64(v, seed); +CAMLprim value caml_cc_xxhash_mix_int_byte(value v_state, value v_value) { + CAMLparam2(v_state, v_value); + int64_t result = 
caml_cc_xxhash_mix_int(Int64_val(v_state), Long_val(v_value)); CAMLreturn(caml_copy_int64(result)); } -/* hash_int: untagged (intnat, intnat) -> intnat */ -CAMLprim intnat caml_cc_xxhash_int(intnat v, intnat seed) { - int64_t v64 = (int64_t)v; - int64_t seed64 = (int64_t)seed; - return (intnat)caml_cc_xxhash_int64(v64, seed64); +/* mix_int32: (int64_t state, int32_t value) -> int64_t */ +CAMLprim int64_t caml_cc_xxhash_mix_int32(int64_t state, int32_t value) { + int64_t v = (int64_t)value; + return (int64_t)XXH64(&v, sizeof(v), (XXH64_hash_t)state); +} +CAMLprim value caml_cc_xxhash_mix_int32_byte(value v_state, value v_value) { + CAMLparam2(v_state, v_value); + int64_t result = caml_cc_xxhash_mix_int32(Int64_val(v_state), Int32_val(v_value)); + CAMLreturn(caml_copy_int64(result)); } -CAMLprim value caml_cc_xxhash_int_byte(value v_v, value v_seed) { - intnat v = Long_val(v_v); - intnat seed = Long_val(v_seed); - return Val_long(caml_cc_xxhash_int(v, seed)); +/* mix_string: native signature: (int64_t state, value string) -> int64_t */ +CAMLprim int64_t caml_cc_xxhash_mix_string(int64_t state, value v_s) { + const char *s = String_val(v_s); + size_t len = caml_string_length(v_s); + return (int64_t)XXH64(s, len, (XXH64_hash_t)state); +} +CAMLprim value caml_cc_xxhash_mix_string_byte(value v_state, value v_s) { + CAMLparam2(v_state, v_s); + int64_t result = caml_cc_xxhash_mix_string(Int64_val(v_state), v_s); + CAMLreturn(caml_copy_int64(result)); } -/* mix64: unboxed (int64_t, int64_t) -> int64_t [uses XXH64] */ -CAMLprim int64_t caml_cc_xxhash_mix64(int64_t a, int64_t b) { - return (int64_t)XXH64(&a, sizeof(a), (XXH64_hash_t)b); +/* finalize: int64_t state -> int64_t */ +CAMLprim int64_t caml_cc_xxhash_finalize(int64_t state) { + return (int64_t)XXH64(&state, sizeof(state), 0); } - -CAMLprim value caml_cc_xxhash_mix64_byte(value v_a, value v_b) { - CAMLparam2(v_a, v_b); - int64_t a = Int64_val(v_a); - int64_t b = Int64_val(v_b); - 
CAMLreturn(caml_copy_int64(caml_cc_xxhash_mix64(a, b))); -} - -/* finalize64: unboxed int64_t -> int64_t [uses XXH64 with seed=0] */ -CAMLprim int64_t caml_cc_xxhash_finalize64(int64_t h) { - return (int64_t)XXH64(&h, sizeof(h), 0); -} - -CAMLprim value caml_cc_xxhash_finalize64_byte(value v_h) { - CAMLparam1(v_h); - int64_t h = Int64_val(v_h); - CAMLreturn(caml_copy_int64(caml_cc_xxhash_finalize64(h))); +CAMLprim value caml_cc_xxhash_finalize_byte(value v_state) { + CAMLparam1(v_state); + CAMLreturn(caml_copy_int64(caml_cc_xxhash_finalize(Int64_val(v_state)))); }