Compare commits

..

9 commits

Author SHA1 Message Date
Simon Cruanes
aeae7c1039
Merge pull request #488 from c-cube/simon/xxhash
Some checks failed
format / format (push) Has been cancelled
Build and Test / build (push) Has been cancelled
add containers.xxhash
2026-03-14 20:29:20 -04:00
Simon Cruanes
92f58d84a0 tests 2026-03-14 04:04:36 +00:00
Simon Cruanes
9865a26c91 nicer API with and without seed 2026-03-14 04:04:32 +00:00
Simon Cruanes
a3c061a388 simplify xxhash, no streaming mode after all 2026-03-14 03:53:08 +00:00
Simon Cruanes
9fa87cd9aa fix 32bit platform bug 2026-03-14 03:48:06 +00:00
Simon Cruanes
d12a809658 update tests 2026-03-14 03:34:48 +00:00
Simon Cruanes
e52574c5fb revamp xxhash API 2026-03-14 03:34:35 +00:00
Simon Cruanes
418e0fbf7f xxhash tests 2026-03-14 03:14:19 +00:00
Simon Cruanes
a999f564d6 xxhash bindings 2026-03-14 03:09:31 +00:00
9 changed files with 7833 additions and 0 deletions

2
src/xxhash/README.md Normal file
View file

@ -0,0 +1,2 @@
The xxHash code is vendored directly from https://github.com/Cyan4973/xxHash/ and remains under the BSD 2-Clause license; author: Yann Collet.

View file

@ -0,0 +1,32 @@
(* Raw bindings to the C stubs. Every primitive takes the seed as an explicit
   second argument and returns the 64-bit digest. Each [external] names two C
   symbols: the [*_byte] one is the bytecode stub, the other is the native-code
   entry point that works on unboxed/untagged values. *)
module Raw = struct
(* [hash_string s seed]: hash of the bytes of [s]. *)
external hash_string : string -> (int64[@unboxed]) -> (int64[@unboxed])
= "caml_cc_xxhash_string_byte" "caml_cc_xxhash_string"
[@@noalloc]
(* [hash_int64 v seed]: hash of the 8 bytes of [v]. *)
external hash_int64 :
(int64[@unboxed]) -> (int64[@unboxed]) -> (int64[@unboxed])
= "caml_cc_xxhash_int64_byte" "caml_cc_xxhash_int64"
[@@noalloc]
(* [hash_int32 v seed]: hash of the 4 bytes of [v]. *)
external hash_int32 :
(int32[@unboxed]) -> (int64[@unboxed]) -> (int64[@unboxed])
= "caml_cc_xxhash_int32_byte" "caml_cc_xxhash_int32"
[@@noalloc]
(* [hash_int v seed]: [v] is passed untagged and hashed as a 64-bit integer. *)
external hash_int : (int[@untagged]) -> (int64[@unboxed]) -> (int64[@unboxed])
= "caml_cc_xxhash_int_byte" "caml_cc_xxhash_int"
[@@noalloc]
end
(* Seeded entry points: thin aliases over the raw bindings. Booleans and
   characters are first converted to [int] (0/1, resp. the character code)
   and then hashed like any other [int]. *)
let[@inline] hash_string_seed s seed = Raw.hash_string s seed
let[@inline] hash_int64_seed v seed = Raw.hash_int64 v seed
let[@inline] hash_int32_seed v seed = Raw.hash_int32 v seed
let[@inline] hash_int_seed v seed = Raw.hash_int v seed
let[@inline] hash_bool_seed b seed = hash_int_seed (Bool.to_int b) seed
let[@inline] hash_char_seed c seed = hash_int_seed (Char.code c) seed

(* Unseeded entry points: the same hashes with the seed fixed to [0L]. *)
let[@inline] hash_string s = hash_string_seed s 0L
let[@inline] hash_int64 v = hash_int64_seed v 0L
let[@inline] hash_int32 v = hash_int32_seed v 0L
let[@inline] hash_int v = hash_int_seed v 0L
let[@inline] hash_bool b = hash_bool_seed b 0L
let[@inline] hash_char c = hash_char_seed c 0L

View file

@ -0,0 +1,69 @@
(** XXHash bindings.
Fast non-cryptographic hash functions from
{{:https://github.com/Cyan4973/xxHash} xxHash}.
All functions use XXH64 and are noalloc in native code.
*)
(** Raw bindings with explicit seed argument.

Each primitive names two C symbols: the [*_byte] one is the bytecode stub,
the other is the native-code entry point, which takes and returns
unboxed/untagged values. *)
module Raw : sig
external hash_string : string -> (int64[@unboxed]) -> (int64[@unboxed])
= "caml_cc_xxhash_string_byte" "caml_cc_xxhash_string"
[@@noalloc]
(** [hash_string s seed] hashes the bytes of [s] with [seed] using XXH64.
The empty string is a valid input. *)
external hash_int64 :
(int64[@unboxed]) -> (int64[@unboxed]) -> (int64[@unboxed])
= "caml_cc_xxhash_int64_byte" "caml_cc_xxhash_int64"
[@@noalloc]
(** [hash_int64 v seed] hashes the 8-byte representation of [v] with [seed]. *)
external hash_int32 :
(int32[@unboxed]) -> (int64[@unboxed]) -> (int64[@unboxed])
= "caml_cc_xxhash_int32_byte" "caml_cc_xxhash_int32"
[@@noalloc]
(** [hash_int32 v seed] hashes the 4-byte representation of [v] with [seed].
Only 4 bytes are hashed, so the result differs from
[hash_int64 (Int64.of_int32 v) seed]. *)
external hash_int : (int[@untagged]) -> (int64[@unboxed]) -> (int64[@unboxed])
= "caml_cc_xxhash_int_byte" "caml_cc_xxhash_int"
[@@noalloc]
(** [hash_int v seed] hashes [v] as a 64-bit integer with [seed]: the value
is widened to 64 bits before hashing, so 8 bytes are hashed whatever the
native [int] width. Noalloc and untagged in native code. *)
end
val hash_string : string -> int64
(** [hash_string s] hashes [s] using XXH64 with seed [0L].
Same as [hash_string_seed s 0L]. *)
val hash_string_seed : string -> int64 -> int64
(** [hash_string_seed s seed] hashes [s] with an explicit seed.
Same as [Raw.hash_string s seed]. *)
val hash_int64 : int64 -> int64
(** [hash_int64 v] hashes the 8-byte representation of [v] with seed [0L]. *)
val hash_int64_seed : int64 -> int64 -> int64
(** [hash_int64_seed v seed] hashes [v] with an explicit seed.
Same as [Raw.hash_int64 v seed]. *)
val hash_int32 : int32 -> int64
(** [hash_int32 v] hashes the 4-byte representation of [v] with seed [0L]. *)
val hash_int32_seed : int32 -> int64 -> int64
(** [hash_int32_seed v seed] hashes [v] with an explicit seed.
Same as [Raw.hash_int32 v seed]. *)
val hash_int : int -> int64
(** [hash_int v] hashes [v] as a 64-bit integer with seed [0L]. On 64-bit
platforms the result equals [hash_int64 (Int64.of_int v)]. *)
val hash_int_seed : int -> int64 -> int64
(** [hash_int_seed v seed] hashes [v] with an explicit seed.
Same as [Raw.hash_int v seed]. *)
val hash_bool : bool -> int64
(** [hash_bool b] hashes [b] as an integer (0 or 1) with seed [0L], i.e.
[hash_bool b = hash_int (Bool.to_int b)]. *)
val hash_bool_seed : bool -> int64 -> int64
(** [hash_bool_seed b seed] hashes [b] with an explicit seed, i.e.
[hash_int_seed (Bool.to_int b) seed]. *)
val hash_char : char -> int64
(** [hash_char c] hashes [c] as its character code with seed [0L], i.e.
[hash_char c = hash_int (Char.code c)]. *)
val hash_char_seed : char -> int64 -> int64
(** [hash_char_seed c seed] hashes [c] with an explicit seed, i.e.
[hash_int_seed (Char.code c) seed]. *)

10
src/xxhash/dune Normal file
View file

@ -0,0 +1,10 @@
; xxHash bindings, published as the [containers.xxhash] sub-library.
(library
(name containers_xxhash)
(public_name containers.xxhash)
(synopsis "xxHash bindings for containers")
(libraries containers)
; Only stubs.c is listed as a foreign stub. It defines XXH_INLINE_ALL before
; including xxhash.h, so the vendored hash implementation is pulled into that
; single translation unit; the vendored xxhash.c is not built by this stanza.
(foreign_stubs
(language c)
(flags :standard -O2)
(names stubs))
; Raise ocamlopt's inlining level; the OCaml wrappers are one-line [@inline]
; functions over the C primitives.
(ocamlopt_flags :standard -inline 100))

52
src/xxhash/stubs.c Normal file
View file

@ -0,0 +1,52 @@
#define XXH_NO_XXH3
#define XXH_NO_STREAM
#define XXH_INLINE_ALL
#include "xxhash.h"
#include <caml/alloc.h>
#include <caml/memory.h>
#include <caml/mlvalues.h>
#include <stdint.h>
/* Native-code entry point for Raw.hash_string.
 * (value string, int64_t seed) -> int64_t
 * Runs XXH64 over the caml_string_length(v_s) bytes of the OCaml string v_s.
 * The seed and the digest are exchanged as unboxed 64-bit integers. */
CAMLprim int64_t caml_cc_xxhash_string(value v_s, int64_t seed) {
  const char *bytes = String_val(v_s);
  const mlsize_t len = caml_string_length(v_s);
  const XXH64_hash_t digest = XXH64(bytes, len, (XXH64_hash_t)seed);
  return (int64_t)digest;
}
/* Bytecode stub for Raw.hash_string: unboxes the seed, delegates to the
 * native entry point and boxes the digest into a fresh OCaml int64. */
CAMLprim value caml_cc_xxhash_string_byte(value v_s, value v_seed) {
  CAMLparam2(v_s, v_seed);
  const int64_t seed = (int64_t)Int64_val(v_seed);
  const int64_t digest = caml_cc_xxhash_string(v_s, seed);
  CAMLreturn(caml_copy_int64(digest));
}
/* Native-code entry point for Raw.hash_int64.
 * (int64_t v, int64_t seed) -> int64_t
 * Runs XXH64 over the 8 in-memory bytes of v. */
CAMLprim int64_t caml_cc_xxhash_int64(int64_t v, int64_t seed) {
  const XXH64_hash_t digest = XXH64(&v, sizeof v, (XXH64_hash_t)seed);
  return (int64_t)digest;
}
/* Bytecode stub for Raw.hash_int64: unboxes both arguments, delegates to the
 * native entry point and boxes the digest into a fresh OCaml int64. */
CAMLprim value caml_cc_xxhash_int64_byte(value v_v, value v_seed) {
  CAMLparam2(v_v, v_seed);
  const int64_t input = Int64_val(v_v);
  const int64_t seed = Int64_val(v_seed);
  const int64_t digest = caml_cc_xxhash_int64(input, seed);
  CAMLreturn(caml_copy_int64(digest));
}
/* Native-code entry point for Raw.hash_int32.
 * (int32_t v, int64_t seed) -> int64_t
 * Runs XXH64 over the 4 in-memory bytes of v (not widened to 64 bits). */
CAMLprim int64_t caml_cc_xxhash_int32(int32_t v, int64_t seed) {
  const XXH64_hash_t digest = XXH64(&v, sizeof v, (XXH64_hash_t)seed);
  return (int64_t)digest;
}
/* Bytecode stub for Raw.hash_int32: unboxes both arguments, delegates to the
 * native entry point and boxes the digest into a fresh OCaml int64. */
CAMLprim value caml_cc_xxhash_int32_byte(value v_v, value v_seed) {
  CAMLparam2(v_v, v_seed);
  const int32_t input = Int32_val(v_v);
  const int64_t seed = Int64_val(v_seed);
  const int64_t digest = caml_cc_xxhash_int32(input, seed);
  CAMLreturn(caml_copy_int64(digest));
}
/* Native-code entry point for Raw.hash_int.
 * (intnat v, int64_t seed) -> int64_t
 * The untagged OCaml int is widened to 64 bits and those 8 bytes are hashed
 * with XXH64, so the input size does not depend on the native word size. */
CAMLprim int64_t caml_cc_xxhash_int(intnat v, int64_t seed) {
  /* Sign-extend rather than zero-extend: on a 32-bit platform a negative int
   * must widen to the same 64-bit value it has on a 64-bit platform, so that
   * hash_int n agrees with hash_int64 (Int64.of_int n) everywhere. On 64-bit
   * platforms this conversion is the identity. */
  int64_t v64 = (int64_t)v;
  return (int64_t)XXH64(&v64, sizeof(v64), (XXH64_hash_t)seed);
}
/* Bytecode stub for Raw.hash_int: untags the int, unboxes the seed, delegates
 * to the native entry point and boxes the digest into a fresh OCaml int64. */
CAMLprim value caml_cc_xxhash_int_byte(value v_v, value v_seed) {
  CAMLparam2(v_v, v_seed);
  const intnat input = Long_val(v_v);
  const int64_t seed = Int64_val(v_seed);
  const int64_t digest = caml_cc_xxhash_int(input, seed);
  CAMLreturn(caml_copy_int64(digest));
}

42
src/xxhash/xxhash.c Normal file
View file

@ -0,0 +1,42 @@
/*
* xxHash - Extremely Fast Hash algorithm
* Copyright (C) 2012-2023 Yann Collet
*
* BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following disclaimer
* in the documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* You can contact the author at:
* - xxHash homepage: https://www.xxhash.com
* - xxHash source repository: https://github.com/Cyan4973/xxHash
*/
/*
* xxhash.c instantiates functions defined in xxhash.h
*/
#define XXH_STATIC_LINKING_ONLY /* access advanced declarations */
#define XXH_IMPLEMENTATION /* access definitions */
#include "xxhash.h"

7490
src/xxhash/xxhash.h Normal file

File diff suppressed because it is too large Load diff

11
tests/xxhash/dune Normal file
View file

@ -0,0 +1,11 @@
; Test binary for the xxhash bindings, built from t_xxhash.ml only.
(executable
(name t_xxhash)
(modules t_xxhash)
(libraries containers containers.xxhash containers_testlib qcheck-core))
; Attach the binary to the [runtest] alias of the [containers] package, so
; that running that alias builds and executes it.
(rule
(alias runtest)
(deps t_xxhash.exe)
(package containers)
(action
(run ./t_xxhash.exe)))

125
tests/xxhash/t_xxhash.ml Normal file
View file

@ -0,0 +1,125 @@
include (val Containers_testlib.make ~__FILE__ ())
module H = Containers_xxhash
(* Gold tests: hash_string *)
;;
t @@ fun () ->
  let check expected actual =
    assert_equal ~printer:Int64.to_string expected actual
  in
  (* Each row: input, expected hash with the default seed, expected hash with
     seed 42. *)
  List.iter
    (fun (input, unseeded, seeded) ->
      check unseeded (H.hash_string input);
      check seeded (H.hash_string_seed input 42L))
    [
      ("", (-1205034819632174695L), (-7444071767201028348L));
      ("hello", 2794345569481354659L, (-4367754540140381902L));
      ("the quick brown fox", 1513236774081638803L, 6882318601984224800L);
    ];
  true
;;
(* Gold tests: hash_int64 *)
t @@ fun () ->
  (* Each row: input, expected hash with the default seed. *)
  List.iter
    (fun (input, expected) ->
      assert_equal ~printer:Int64.to_string expected (H.hash_int64 input))
    [
      (0L, 3803688792395291579L);
      (1L, (-6977822845260490347L));
      ((-1L), (-8804195676797548855L));
      (1234567890123456789L, (-7296932117151183542L));
    ];
  true
;;
(* Gold tests: hash_int32 *)
t @@ fun () ->
  (* Each row: input, expected hash with the default seed. *)
  List.iter
    (fun (input, expected) ->
      assert_equal ~printer:Int64.to_string expected (H.hash_int32 input))
    [
      (0l, 4246796580750024372L);
      (1l, (-851299076295404719L));
      ((-1l), 9185342943168159635L);
      (42l, (-2929917330072466447L));
    ];
  true
;;
(* Gold tests: hash_int *)
t @@ fun () ->
  (* Each row: input, expected hash with the default seed. The values for 0, 1
     and -1 are the same constants as in the hash_int64 gold test. *)
  List.iter
    (fun (input, expected) ->
      assert_equal ~printer:Int64.to_string expected (H.hash_int input))
    [
      (0, 3803688792395291579L);
      (1, (-6977822845260490347L));
      ((-1), (-8804195676797548855L));
      (42, (-5379971487550586029L));
    ];
  true
;;
(* Gold tests: hash_bool *)
t @@ fun () ->
  (* [false] and [true] must hash like the integers 0 and 1. *)
  List.iter
    (fun (input, expected) ->
      assert_equal ~printer:Int64.to_string expected (H.hash_bool input))
    [ (false, 3803688792395291579L); (true, (-6977822845260490347L)) ];
  true
;;
(* Gold tests: hash_char *)
t @@ fun () ->
  (* A character must hash like its code: 'a' = 97, '0' = 48. *)
  List.iter
    (fun (code, ch) ->
      assert_equal ~printer:Int64.to_string (H.hash_int code) (H.hash_char ch))
    [ (97, 'a'); (48, '0') ];
  true
;;
(* Property tests: determinism *)
q ~count:10_000 Q.string @@ fun input ->
  let first = H.hash_string input in
  let second = H.hash_string input in
  Int64.equal first second
;;
q ~count:10_000 Q.int64 @@ fun input ->
  let first = H.hash_int64 input in
  let second = H.hash_int64 input in
  Int64.equal first second
;;
q ~count:10_000 Q.int @@ fun input ->
  let first = H.hash_int input in
  let second = H.hash_int input in
  Int64.equal first second
;;
q ~count:10_000 Q.bool @@ fun input ->
  let first = H.hash_bool input in
  let second = H.hash_bool input in
  Int64.equal first second
;;
q ~count:10_000 Q.char @@ fun input ->
  let first = H.hash_char input in
  let second = H.hash_char input in
  Int64.equal first second
;;
(* Different seeds give different results for the same input *)
q ~count:10_000 (Q.pair Q.string Q.int64) @@ fun (input, seed) ->
  (* Seed 0 is the default seed, so it is excluded from the comparison. *)
  Q.assume (Int64.compare seed 0L <> 0);
  let unseeded = H.hash_string input in
  let seeded = H.hash_string_seed input seed in
  not (Int64.equal unseeded seeded)
;;
q ~count:10_000 (Q.pair Q.int64 Q.int64) @@ fun (input, seed) ->
  Q.assume (Int64.compare seed 0L <> 0);
  let unseeded = H.hash_int64 input in
  let seeded = H.hash_int64_seed input seed in
  not (Int64.equal unseeded seeded)
;;
q ~count:10_000 (Q.pair Q.int Q.int64) @@ fun (input, seed) ->
  Q.assume (Int64.compare seed 0L <> 0);
  let unseeded = H.hash_int input in
  let seeded = H.hash_int_seed input seed in
  not (Int64.equal unseeded seeded)
;;
(* Different inputs give different results for the same seed *)
q ~count:10_000 (Q.pair Q.string Q.string) @@ fun (left, right) ->
  (* Only distinct inputs are compared; equal pairs are discarded. *)
  Q.assume (String.compare left right <> 0);
  let h_left = H.hash_string left in
  let h_right = H.hash_string right in
  not (Int64.equal h_left h_right)
;;
q ~count:10_000 (Q.pair Q.int64 Q.int64) @@ fun (left, right) ->
  Q.assume (Int64.compare left right <> 0);
  let h_left = H.hash_int64 left in
  let h_right = H.hash_int64 right in
  not (Int64.equal h_left h_right)
;;
q ~count:10_000 (Q.pair Q.int Q.int) @@ fun (left, right) ->
  Q.assume (not (left = right));
  let h_left = H.hash_int left in
  let h_right = H.hash_int right in
  not (Int64.equal h_left h_right)
;;
(* Stress test: hash 100k strings of varying lengths, non-empty => non-zero *)
t @@ fun () ->
  (* 100 lengths x 1000 rounds = 100k hash calls. Every round of a given
     length hashes the same string (len copies of 'x'); a zero hash is only
     treated as an error for non-empty strings. *)
  for len = 0 to 99 do
    for _ = 1 to 1000 do
      let digest = H.hash_string (String.make len 'x') in
      if len > 0 && Int64.equal digest 0L then
        Printf.ksprintf failwith "unexpected zero hash for string of len %d" len
    done
  done;
  true
(* Entry point: run every test registered above as a single suite. *)
let () =
  let suites = [ get () ] in
  Containers_testlib.run_all ~descr:"test xxhash" suites