diff --git a/src/xxhash/containers_xxhash.ml b/src/xxhash/containers_xxhash.ml index f6e694b7..b30a8323 100644 --- a/src/xxhash/containers_xxhash.ml +++ b/src/xxhash/containers_xxhash.ml @@ -1,37 +1,20 @@ -type state = int64 - -let seed : state = 0L - -external mix_int64 : (state[@unboxed]) -> (int64[@unboxed]) -> (state[@unboxed]) - = "caml_cc_xxhash_mix_int64_byte" "caml_cc_xxhash_mix_int64" +external hash_string : string -> (int64[@unboxed]) -> (int64[@unboxed]) + = "caml_cc_xxhash_string_byte" "caml_cc_xxhash_string" [@@noalloc] -external mix_int : (state[@unboxed]) -> (int[@untagged]) -> (state[@unboxed]) - = "caml_cc_xxhash_mix_int_byte" "caml_cc_xxhash_mix_int" +external hash_int64 : + (int64[@unboxed]) -> (int64[@unboxed]) -> (int64[@unboxed]) + = "caml_cc_xxhash_int64_byte" "caml_cc_xxhash_int64" [@@noalloc] -external mix_int32 : (state[@unboxed]) -> (int32[@unboxed]) -> (state[@unboxed]) - = "caml_cc_xxhash_mix_int32_byte" "caml_cc_xxhash_mix_int32" +external hash_int32 : + (int32[@unboxed]) -> (int64[@unboxed]) -> (int64[@unboxed]) + = "caml_cc_xxhash_int32_byte" "caml_cc_xxhash_int32" [@@noalloc] -let[@inline] mix_bool h b = mix_int h (Bool.to_int b) -let[@inline] mix_char h c = mix_int h (Char.code c) -let[@inline] mix_float h f = mix_int64 h (Int64.bits_of_float f) - -external mix_string_aux : (state[@unboxed]) -> string -> (state[@unboxed]) - = "caml_cc_xxhash_mix_string_byte" "caml_cc_xxhash_mix_string" +external hash_int : (int[@untagged]) -> (int64[@unboxed]) -> (int64[@unboxed]) + = "caml_cc_xxhash_int_byte" "caml_cc_xxhash_int" [@@noalloc] -let[@inline] mix_string h s = mix_string_aux h s - -external finalize : (state[@unboxed]) -> (int64[@unboxed]) - = "caml_cc_xxhash_finalize_byte" "caml_cc_xxhash_finalize" -[@@noalloc] - -let[@inline] hash_string ?(seed = seed) s = finalize (mix_string seed s) -let[@inline] hash_int64 ?(seed = seed) v = finalize (mix_int64 seed v) -let[@inline] hash_int ?(seed = seed) v = finalize (mix_int seed v) -let[@inline] hash_int32 ?(seed = seed) v = finalize (mix_int32 seed v) -let[@inline] hash_bool ?(seed = seed) b = finalize (mix_bool seed b) -let[@inline] hash_char ?(seed = seed) c = finalize (mix_char seed c) -let[@inline] hash_float ?(seed = seed) f = finalize (mix_float seed f) +let[@inline] hash_bool b seed = hash_int (Bool.to_int b) seed +let[@inline] hash_char c seed = hash_int (Char.code c) seed diff --git a/src/xxhash/containers_xxhash.mli b/src/xxhash/containers_xxhash.mli index 5a040c6b..b9bb1658 100644 --- a/src/xxhash/containers_xxhash.mli +++ b/src/xxhash/containers_xxhash.mli @@ -3,76 +3,34 @@ Fast non-cryptographic hash functions from {{:https://github.com/Cyan4973/xxHash} xxHash}. - Hashing uses XXH64. To hash a single value use the [hash_foo] convenience - functions. To combine several values, fold with [mix_*] and call - {!finalize}: - - {[ - let h = - seed - |> fun h -> mix_string h "hello" - |> fun h -> mix_int h 42 - |> finalize - ]} + All functions use XXH64 and are noalloc in native code. *) -type state = private int64 -(** Accumulated hash state. Represented as [int64] so the compiler can unbox - it at call sites. *) - -val seed : state -(** Initial state. Equal to [0L]. *) - -external mix_int64 : (state[@unboxed]) -> (int64[@unboxed]) -> (state[@unboxed]) - = "caml_cc_xxhash_mix_int64_byte" "caml_cc_xxhash_mix_int64" +external hash_string : string -> (int64[@unboxed]) -> (int64[@unboxed]) + = "caml_cc_xxhash_string_byte" "caml_cc_xxhash_string" [@@noalloc] -(** Mix an [int64] value into the state. Noalloc and unboxed in native code. *) +(** [hash_string s seed] hashes [s] with [seed] using XXH64. *) -external mix_int : (state[@unboxed]) -> (int[@untagged]) -> (state[@unboxed]) - = "caml_cc_xxhash_mix_int_byte" "caml_cc_xxhash_mix_int" +external hash_int64 : + (int64[@unboxed]) -> (int64[@unboxed]) -> (int64[@unboxed]) + = "caml_cc_xxhash_int64_byte" "caml_cc_xxhash_int64" [@@noalloc] -(** Mix an [int] value into the state. Noalloc and untagged in native code. *) +(** [hash_int64 v seed] hashes the 8-byte representation of [v] with [seed]. *) -external mix_int32 : (state[@unboxed]) -> (int32[@unboxed]) -> (state[@unboxed]) - = "caml_cc_xxhash_mix_int32_byte" "caml_cc_xxhash_mix_int32" +external hash_int32 : + (int32[@unboxed]) -> (int64[@unboxed]) -> (int64[@unboxed]) + = "caml_cc_xxhash_int32_byte" "caml_cc_xxhash_int32" [@@noalloc] -(** Mix an [int32] value into the state. Noalloc and unboxed in native code. *) +(** [hash_int32 v seed] hashes the 4-byte representation of [v] with [seed]. *) -val mix_bool : state -> bool -> state -(** Mix a [bool] into the state. *) - -val mix_char : state -> char -> state -(** Mix a [char] into the state. *) - -val mix_float : state -> float -> state -(** Mix a [float] into the state via [Int64.bits_of_float]. *) - -val mix_string : state -> string -> state -(** Mix a [string] into the state using XXH64. *) - -external finalize : (state[@unboxed]) -> (int64[@unboxed]) - = "caml_cc_xxhash_finalize_byte" "caml_cc_xxhash_finalize" +external hash_int : (int[@untagged]) -> (int64[@unboxed]) -> (int64[@unboxed]) + = "caml_cc_xxhash_int_byte" "caml_cc_xxhash_int" [@@noalloc] -(** Finalise the accumulated state into a 64-bit hash. Noalloc and unboxed in - native code. *) +(** [hash_int v seed] hashes [v] as a 64-bit integer with [seed]. + Noalloc and untagged in native code. *) -val hash_string : ?seed:state -> string -> int64 -(** [hash_string ?seed s] is [finalize (mix_string seed s)]. *) +val hash_bool : bool -> int64 -> int64 +(** [hash_bool b seed] hashes [b] as an integer (0 or 1) with [seed]. *) -val hash_int64 : ?seed:state -> int64 -> int64 -(** [hash_int64 ?seed v] is [finalize (mix_int64 seed v)]. *) - -val hash_int : ?seed:state -> int -> int64 -(** [hash_int ?seed v] is [finalize (mix_int seed v)]. *) - -val hash_int32 : ?seed:state -> int32 -> int64 -(** [hash_int32 ?seed v] is [finalize (mix_int32 seed v)]. *) - -val hash_bool : ?seed:state -> bool -> int64 -(** [hash_bool ?seed b] is [finalize (mix_bool seed b)]. *) - -val hash_char : ?seed:state -> char -> int64 -(** [hash_char ?seed c] is [finalize (mix_char seed c)]. *) - -val hash_float : ?seed:state -> float -> int64 -(** [hash_float ?seed f] is [finalize (mix_float seed f)]. *) +val hash_char : char -> int64 -> int64 +(** [hash_char c seed] hashes [c] as its character code with [seed]. *) diff --git a/src/xxhash/stubs.c b/src/xxhash/stubs.c index 928d7305..53ce3593 100644 --- a/src/xxhash/stubs.c +++ b/src/xxhash/stubs.c @@ -8,55 +8,45 @@ #include #include -/* mix_int64: (int64_t state, int64_t value) -> int64_t */ -CAMLprim int64_t caml_cc_xxhash_mix_int64(int64_t state, int64_t value) { - return (int64_t)XXH64(&value, sizeof(value), (XXH64_hash_t)state); +/* hash_string: (value string, int64_t seed) -> int64_t */ +CAMLprim int64_t caml_cc_xxhash_string(value v_s, int64_t seed) { + return (int64_t)XXH64(String_val(v_s), caml_string_length(v_s), + (XXH64_hash_t)seed); } -CAMLprim value caml_cc_xxhash_mix_int64_byte(value v_state, value v_value) { - CAMLparam2(v_state, v_value); - int64_t result = caml_cc_xxhash_mix_int64(Int64_val(v_state), Int64_val(v_value)); +CAMLprim value caml_cc_xxhash_string_byte(value v_s, value v_seed) { + CAMLparam2(v_s, v_seed); + int64_t result = + caml_cc_xxhash_string(v_s, (int64_t)Int64_val(v_seed)); CAMLreturn(caml_copy_int64(result)); } -/* mix_int: (int64_t state, intnat value) -> int64_t */ -CAMLprim int64_t caml_cc_xxhash_mix_int(int64_t state, intnat value) { - int64_t v = (int64_t)(uintnat)value; /* zero-extend on 32-bit platforms */ - return (int64_t)XXH64(&v, sizeof(v), (XXH64_hash_t)state); +/* hash_int64: (int64_t v, int64_t seed) -> int64_t */ +CAMLprim int64_t caml_cc_xxhash_int64(int64_t v, int64_t seed) { + return (int64_t)XXH64(&v, sizeof(v), (XXH64_hash_t)seed); } -CAMLprim value caml_cc_xxhash_mix_int_byte(value v_state, value v_value) { - CAMLparam2(v_state, v_value); - int64_t result = caml_cc_xxhash_mix_int(Int64_val(v_state), Long_val(v_value)); +CAMLprim value caml_cc_xxhash_int64_byte(value v_v, value v_seed) { + CAMLparam2(v_v, v_seed); + int64_t result = caml_cc_xxhash_int64(Int64_val(v_v), Int64_val(v_seed)); CAMLreturn(caml_copy_int64(result)); } -/* mix_int32: (int64_t state, int32_t value) -> int64_t */ -CAMLprim int64_t caml_cc_xxhash_mix_int32(int64_t state, int32_t value) { - int64_t v = (int64_t)value; - return (int64_t)XXH64(&v, sizeof(v), (XXH64_hash_t)state); +/* hash_int32: (int32_t v, int64_t seed) -> int64_t */ +CAMLprim int64_t caml_cc_xxhash_int32(int32_t v, int64_t seed) { + return (int64_t)XXH64(&v, sizeof(v), (XXH64_hash_t)seed); } -CAMLprim value caml_cc_xxhash_mix_int32_byte(value v_state, value v_value) { - CAMLparam2(v_state, v_value); - int64_t result = caml_cc_xxhash_mix_int32(Int64_val(v_state), Int32_val(v_value)); +CAMLprim value caml_cc_xxhash_int32_byte(value v_v, value v_seed) { + CAMLparam2(v_v, v_seed); + int64_t result = caml_cc_xxhash_int32(Int32_val(v_v), Int64_val(v_seed)); CAMLreturn(caml_copy_int64(result)); } -/* mix_string: native signature: (int64_t state, value string) -> int64_t */ -CAMLprim int64_t caml_cc_xxhash_mix_string(int64_t state, value v_s) { - const char *s = String_val(v_s); - size_t len = caml_string_length(v_s); - return (int64_t)XXH64(s, len, (XXH64_hash_t)state); +/* hash_int: (intnat v, int64_t seed) -> int64_t */ +CAMLprim int64_t caml_cc_xxhash_int(intnat v, int64_t seed) { + int64_t v64 = (int64_t)(uintnat)v; /* zero-extend on 32-bit platforms */ + return (int64_t)XXH64(&v64, sizeof(v64), (XXH64_hash_t)seed); } -CAMLprim value caml_cc_xxhash_mix_string_byte(value v_state, value v_s) { - CAMLparam2(v_state, v_s); - int64_t result = caml_cc_xxhash_mix_string(Int64_val(v_state), v_s); +CAMLprim value caml_cc_xxhash_int_byte(value v_v, value v_seed) { + CAMLparam2(v_v, v_seed); + int64_t result = caml_cc_xxhash_int(Long_val(v_v), Int64_val(v_seed)); CAMLreturn(caml_copy_int64(result)); } - -/* finalize: int64_t state -> int64_t */ -CAMLprim int64_t caml_cc_xxhash_finalize(int64_t state) { - return (int64_t)XXH64(&state, sizeof(state), 0); -} -CAMLprim value caml_cc_xxhash_finalize_byte(value v_state) { - CAMLparam1(v_state); - CAMLreturn(caml_copy_int64(caml_cc_xxhash_finalize(Int64_val(v_state)))); -} diff --git a/tests/xxhash/t_xxhash.ml b/tests/xxhash/t_xxhash.ml index 4d317f82..81692294 100644 --- a/tests/xxhash/t_xxhash.ml +++ b/tests/xxhash/t_xxhash.ml @@ -1,141 +1,136 @@ include (val Containers_testlib.make ~__FILE__ ()) module H = Containers_xxhash -(* Gold tests: hash_string with XXH64 (mix_string + finalize) *) +(* Gold tests: hash_string with seed=0 *) ;; t @@ fun () -> -assert_equal ~printer:Int64.to_string (-8037231448521241007L) - (H.hash_string ~seed:H.seed ""); -assert_equal ~printer:Int64.to_string 7619381941762342490L - (H.hash_string ~seed:H.seed "a"); -assert_equal ~printer:Int64.to_string 8482916093137399771L - (H.hash_string ~seed:H.seed "hello"); -assert_equal ~printer:Int64.to_string (-3052030864281505429L) - (H.hash_string ~seed:H.seed "hello, world!"); -assert_equal ~printer:Int64.to_string 2707297459162763210L - (H.hash_string ~seed:H.seed "the quick brown fox"); +assert_equal ~printer:Int64.to_string (-1205034819632174695L) + (H.hash_string "" 0L); +assert_equal ~printer:Int64.to_string (-7444071767201028348L) + (H.hash_string "" 42L); +assert_equal ~printer:Int64.to_string 2794345569481354659L + (H.hash_string "hello" 0L); +assert_equal ~printer:Int64.to_string (-4367754540140381902L) + (H.hash_string "hello" 42L); +assert_equal ~printer:Int64.to_string 1513236774081638803L + (H.hash_string "the quick brown fox" 0L); +assert_equal ~printer:Int64.to_string 6882318601984224800L + (H.hash_string "the quick brown fox" 42L); true ;; -(* Gold tests: hash_string with non-default seed (seed from mix_int) *) +(* Gold tests: hash_int64 with seed=0 *) t @@ fun () -> -(* seed after mixing 1 into seed=0: hash_string uses that as XXH64 seed *) -let seed1 = H.mix_int H.seed 1 in -(* these values computed from: finalize(mix_string(mix_int(0,1), s)) *) -assert_equal ~printer:Int64.to_string - (H.hash_string ~seed:seed1 "") - (H.hash_string ~seed:seed1 ""); -(* just test determinism with custom seed *) -assert_equal ~printer:Int64.to_string - (H.hash_string ~seed:seed1 "hello") - (H.hash_string ~seed:seed1 "hello"); -(* different seeds produce different hashes for same string *) -assert ( - not - (Int64.equal - (H.hash_string ~seed:H.seed "hello") - (H.hash_string ~seed:seed1 "hello"))); +assert_equal ~printer:Int64.to_string 3803688792395291579L (H.hash_int64 0L 0L); +assert_equal ~printer:Int64.to_string (-6977822845260490347L) + (H.hash_int64 1L 0L); +assert_equal ~printer:Int64.to_string (-8804195676797548855L) + (H.hash_int64 (-1L) 0L); +assert_equal ~printer:Int64.to_string (-7296932117151183542L) + (H.hash_int64 1234567890123456789L 0L); true ;; -(* Gold tests: hash_string default seed=0 *) +(* Gold tests: hash_int32 with seed=0 *) t @@ fun () -> -assert_equal ~printer:Int64.to_string (-8037231448521241007L) (H.hash_string ""); -assert_equal ~printer:Int64.to_string 7619381941762342490L (H.hash_string "a"); -assert_equal ~printer:Int64.to_string 8482916093137399771L - (H.hash_string "hello"); +assert_equal ~printer:Int64.to_string 4246796580750024372L (H.hash_int32 0l 0L); +assert_equal ~printer:Int64.to_string (-851299076295404719L) + (H.hash_int32 1l 0L); +assert_equal ~printer:Int64.to_string 9185342943168159635L + (H.hash_int32 (-1l) 0L); +assert_equal ~printer:Int64.to_string (-2929917330072466447L) + (H.hash_int32 42l 0L); true ;; -(* Gold tests: hash_int64 *) +(* Gold tests: hash_int with seed=0 *) t @@ fun () -> -assert_equal ~printer:Int64.to_string (-5605595894618674504L) (H.hash_int64 0L); -assert_equal ~printer:Int64.to_string 7046788939542163588L (H.hash_int64 1L); -assert_equal ~printer:Int64.to_string 2627184251037003377L (H.hash_int64 42L); -assert_equal ~printer:Int64.to_string (-8629399683307595115L) - (H.hash_int64 (-1L)); -assert_equal ~printer:Int64.to_string 8147024165990365903L - (H.hash_int64 1234567890123456789L); -true -;; - -(* Gold tests: hash_int *) -t @@ fun () -> -assert_equal ~printer:Int64.to_string (-5605595894618674504L) (H.hash_int 0); -assert_equal ~printer:Int64.to_string 7046788939542163588L (H.hash_int 1); -assert_equal ~printer:Int64.to_string 2627184251037003377L (H.hash_int 42); -assert_equal ~printer:Int64.to_string (-8629399683307595115L) (H.hash_int (-1)); -assert_equal ~printer:Int64.to_string (-3317520227865190253L) - (H.hash_int 1234567890); -true -;; - -(* Gold tests: hash_int32 *) -t @@ fun () -> -assert_equal ~printer:Int64.to_string (-5605595894618674504L) (H.hash_int32 0l); -assert_equal ~printer:Int64.to_string 7046788939542163588L (H.hash_int32 1l); -assert_equal ~printer:Int64.to_string 2627184251037003377L (H.hash_int32 42l); -assert_equal ~printer:Int64.to_string (-8629399683307595115L) - (H.hash_int32 (-1l)); -assert_equal ~printer:Int64.to_string (-3317520227865190253L) - (H.hash_int32 1234567890l); +assert_equal ~printer:Int64.to_string 3803688792395291579L (H.hash_int 0 0L); +assert_equal ~printer:Int64.to_string (-6977822845260490347L) (H.hash_int 1 0L); +assert_equal ~printer:Int64.to_string (-8804195676797548855L) + (H.hash_int (-1) 0L); +assert_equal ~printer:Int64.to_string (-5379971487550586029L) (H.hash_int 42 0L); true ;; (* Gold tests: hash_bool *) t @@ fun () -> -assert_equal ~printer:Int64.to_string (-5605595894618674504L) - (H.hash_bool false); -assert_equal ~printer:Int64.to_string 7046788939542163588L (H.hash_bool true); +assert_equal ~printer:Int64.to_string 3803688792395291579L + (H.hash_bool false 0L); +assert_equal ~printer:Int64.to_string (-6977822845260490347L) + (H.hash_bool true 0L); true ;; (* Gold tests: hash_char *) t @@ fun () -> -assert_equal ~printer:Int64.to_string (-1595464024050301112L) (H.hash_char 'a'); -assert_equal ~printer:Int64.to_string (-2980224328396984668L) (H.hash_char 'z'); -assert_equal ~printer:Int64.to_string 7387411195422956975L (H.hash_char '0'); -true -;; - -(* Gold tests: finalize(seed) = finalize(0L) = XXH64(&0, 8, 0) *) -t @@ fun () -> -assert_equal ~printer:Int64.to_string 3803688792395291579L (H.finalize H.seed); -(* finalize is deterministic *) -assert_equal ~printer:Int64.to_string - (H.finalize (H.mix_int64 H.seed 42L)) - (H.finalize (H.mix_int64 H.seed 42L)); +(* 'a' = 97 *) +assert_equal ~printer:Int64.to_string (H.hash_int 97 0L) (H.hash_char 'a' 0L); +(* '0' = 48 *) +assert_equal ~printer:Int64.to_string (H.hash_int 48 0L) (H.hash_char '0' 0L); true ;; (* Property tests: determinism *) q ~count:10_000 Q.string @@ fun s -> -Int64.equal (H.hash_string s) (H.hash_string s) +Int64.equal (H.hash_string s 0L) (H.hash_string s 0L) ;; q ~count:10_000 Q.int64 @@ fun v -> -Int64.equal (H.hash_int64 v) (H.hash_int64 v) +Int64.equal (H.hash_int64 v 0L) (H.hash_int64 v 0L) ;; -q ~count:10_000 Q.int @@ fun v -> Int64.equal (H.hash_int v) (H.hash_int v);; -q ~count:10_000 Q.bool @@ fun b -> Int64.equal (H.hash_bool b) (H.hash_bool b);; -q ~count:10_000 Q.char @@ fun c -> Int64.equal (H.hash_char c) (H.hash_char c);; +q ~count:10_000 Q.int @@ fun v -> +Int64.equal (H.hash_int v 0L) (H.hash_int v 0L) +;; + +q ~count:10_000 Q.bool @@ fun b -> +Int64.equal (H.hash_bool b 0L) (H.hash_bool b 0L) +;; + +q ~count:10_000 Q.char @@ fun c -> +Int64.equal (H.hash_char c 0L) (H.hash_char c 0L) +;; + +(* Different seeds give different results for the same input (seed nonzero) *) +q ~count:10_000 (Q.pair Q.string Q.int64) @@ fun (s, seed) -> +Q.assume (not (Int64.equal seed 0L)); +not (Int64.equal (H.hash_string s 0L) (H.hash_string s seed)) +;; + +q ~count:10_000 (Q.pair Q.int64 Q.int64) @@ fun (v, seed) -> +Q.assume (not (Int64.equal seed 0L)); +not (Int64.equal (H.hash_int64 v 0L) (H.hash_int64 v seed)) +;; + +q ~count:10_000 (Q.pair Q.int Q.int64) @@ fun (v, seed) -> +Q.assume (not (Int64.equal seed 0L)); +not (Int64.equal (H.hash_int v 0L) (H.hash_int v seed)) +;; + +(* Different inputs give different results for the same seed *) +q ~count:10_000 (Q.pair Q.string Q.string) @@ fun (s1, s2) -> +Q.assume (not (String.equal s1 s2)); +not (Int64.equal (H.hash_string s1 0L) (H.hash_string s2 0L)) +;; -(* mix_int64 is not commutative for most pairs *) q ~count:10_000 (Q.pair Q.int64 Q.int64) @@ fun (a, b) -> Q.assume (not (Int64.equal a b)); -let ab = H.finalize (H.mix_int64 (H.mix_int64 H.seed a) b) in -let ba = H.finalize (H.mix_int64 (H.mix_int64 H.seed b) a) in -not (Int64.equal ab ba) +not (Int64.equal (H.hash_int64 a 0L) (H.hash_int64 b 0L)) ;; -(* Stress test: hash many strings, non-empty => non-zero *) +q ~count:10_000 (Q.pair Q.int Q.int) @@ fun (a, b) -> +Q.assume (a <> b); +not (Int64.equal (H.hash_int a 0L) (H.hash_int b 0L)) +;; + +(* Stress test: hash 100k strings of varying lengths, non-empty => non-zero *) t @@ fun () -> for len = 0 to 99 do for _ = 1 to 1000 do let s = String.make len 'x' in - let h = H.mix_string H.seed s |> H.finalize in + let h = H.hash_string s 0L in if len > 0 then if Int64.equal h 0L then failwith