mirror of
https://github.com/c-cube/ocaml-containers.git
synced 2025-12-07 11:45:31 -05:00
new module CCBloom in containers.data, a bloom filter
This commit is contained in:
parent
369a13cea8
commit
ecbddc132b
6 changed files with 249 additions and 2 deletions
|
|
@ -118,6 +118,7 @@ Documentation [here](http://cedeela.fr/~simon/software/containers).
|
||||||
### Containers.data
|
### Containers.data
|
||||||
|
|
||||||
- `CCBitField`, bitfields embedded in integers
|
- `CCBitField`, bitfields embedded in integers
|
||||||
|
- `CCBloom`, a bloom filter
|
||||||
- `CCCache`, memoization caches, LRU, etc.
|
- `CCCache`, memoization caches, LRU, etc.
|
||||||
- `CCFlatHashtbl`, a flat (open-addressing) hashtable functorial implementation
|
- `CCFlatHashtbl`, a flat (open-addressing) hashtable functorial implementation
|
||||||
- `CCTrie`, a prefix tree
|
- `CCTrie`, a prefix tree
|
||||||
|
|
|
||||||
2
_oasis
2
_oasis
|
|
@ -85,7 +85,7 @@ Library "containers_data"
|
||||||
CCPersistentHashtbl, CCDeque, CCFQueue, CCBV, CCMixtbl,
|
CCPersistentHashtbl, CCDeque, CCFQueue, CCBV, CCMixtbl,
|
||||||
CCMixmap, CCRingBuffer, CCIntMap, CCPersistentArray,
|
CCMixmap, CCRingBuffer, CCIntMap, CCPersistentArray,
|
||||||
CCMixset, CCHashconsedSet, CCGraph, CCHashSet, CCBitField,
|
CCMixset, CCHashconsedSet, CCGraph, CCHashSet, CCBitField,
|
||||||
CCHashTrie
|
CCHashTrie, CCBloom
|
||||||
BuildDepends: bytes
|
BuildDepends: bytes
|
||||||
FindlibParent: containers
|
FindlibParent: containers
|
||||||
FindlibName: data
|
FindlibName: data
|
||||||
|
|
|
||||||
|
|
@ -64,6 +64,7 @@ Various data structures.
|
||||||
|
|
||||||
{!modules:
|
{!modules:
|
||||||
CCBitField
|
CCBitField
|
||||||
|
CCBloom
|
||||||
CCBV
|
CCBV
|
||||||
CCCache
|
CCCache
|
||||||
CCFQueue
|
CCFQueue
|
||||||
|
|
|
||||||
166
src/data/CCBloom.ml
Normal file
166
src/data/CCBloom.ml
Normal file
|
|
@ -0,0 +1,166 @@
|
||||||
|
|
||||||
|
(* This file is free software, part of containers. See file "license" for more details. *)
|
||||||
|
|
||||||
|
(** {1 Bloom Filter} *)
|
||||||
|
|
||||||
|
type 'a sequence = ('a -> unit) -> unit
|
||||||
|
type 'a gen = unit -> 'a option
|
||||||
|
|
||||||
|
type 'a hash_funs = ('a -> int) array
|
||||||
|
|
||||||
|
let primes_ = [|
|
||||||
|
2; 3; 5; 7; 11; 13; 17; 19; 23; 29; 31; 37; 41; 43; 47; 53; 59; 61; 67; 71;
|
||||||
|
73; 79; 83; 89; 97; 101; 103; 107; 109; 113; 127; 131; 137; 139;
|
||||||
|
149; 151; 157; 163; 167; 173
|
||||||
|
|]
|
||||||
|
|
||||||
|
let default_hash_funs k =
|
||||||
|
Array.init k
|
||||||
|
(fun i ->
|
||||||
|
let seed = if i<Array.length primes_ then primes_.(i) else i in
|
||||||
|
fun x -> Hashtbl.seeded_hash seed x
|
||||||
|
)
|
||||||
|
|
||||||
|
(** {2 Bloom Filter} *)
|
||||||
|
|
||||||
|
type 'a t = {
|
||||||
|
hash_funs : 'a hash_funs;
|
||||||
|
arr : Bytes.t;
|
||||||
|
}
|
||||||
|
|
||||||
|
let mk_default_ size =
|
||||||
|
default_hash_funs (max 2 (size / 20))
|
||||||
|
|
||||||
|
let create ?hash size =
|
||||||
|
if size < 2 then invalid_arg "CCBloom.create";
|
||||||
|
let hash_funs = match hash with
|
||||||
|
| None -> mk_default_ size
|
||||||
|
| Some h -> h
|
||||||
|
in
|
||||||
|
let arr = Bytes.make size '\000' in
|
||||||
|
{ hash_funs; arr }
|
||||||
|
|
||||||
|
let create_default ?hash_len size =
|
||||||
|
let hash = match hash_len with
|
||||||
|
| None -> mk_default_ size
|
||||||
|
| Some n -> default_hash_funs n
|
||||||
|
in
|
||||||
|
create ~hash size
|
||||||
|
|
||||||
|
let copy f =
|
||||||
|
{f with arr= Bytes.copy f.arr }
|
||||||
|
|
||||||
|
let size f = 8 * Bytes.length f.arr
|
||||||
|
|
||||||
|
(* number of 1 bits in [c] *)
|
||||||
|
let rec popcount_byte_ c =
|
||||||
|
if c=0 then 0
|
||||||
|
else
|
||||||
|
(c land 1) + popcount_byte_ (c lsr 1)
|
||||||
|
|
||||||
|
let () = assert (
|
||||||
|
popcount_byte_ 0 = 0 &&
|
||||||
|
popcount_byte_ 3 = 2 &&
|
||||||
|
popcount_byte_ 255 = 8
|
||||||
|
)
|
||||||
|
|
||||||
|
(* count the number of 1 bits *)
|
||||||
|
let rec count_ones_ arr i acc =
|
||||||
|
if i=Bytes.length arr then acc
|
||||||
|
else
|
||||||
|
let c = Char.code (Bytes.get arr i) in
|
||||||
|
count_ones_ arr (i+1) (acc + popcount_byte_ c)
|
||||||
|
|
||||||
|
let load f =
|
||||||
|
let ones = count_ones_ f.arr 0 0 in
|
||||||
|
float_of_int ones /. (float_of_int (Bytes.length f.arr * 8))
|
||||||
|
|
||||||
|
exception LocalExit
|
||||||
|
|
||||||
|
(* get i-th bit *)
|
||||||
|
let get_ arr i =
|
||||||
|
let j = i / 8 in
|
||||||
|
let c = Char.code (Bytes.get arr j) in
|
||||||
|
c land (1 lsl (i mod 8)) <> 0
|
||||||
|
|
||||||
|
(* set i-th bit *)
|
||||||
|
let set_ arr i =
|
||||||
|
let j = i / 8 in
|
||||||
|
let c = Char.code (Bytes.get arr j) in
|
||||||
|
let c = c lor (1 lsl (i mod 8)) in
|
||||||
|
Bytes.set arr j (Char.chr c)
|
||||||
|
|
||||||
|
let mem f x =
|
||||||
|
let n = size f in
|
||||||
|
try
|
||||||
|
Array.iter
|
||||||
|
(fun hash -> if not (get_ f.arr (hash x mod n)) then raise LocalExit)
|
||||||
|
f.hash_funs;
|
||||||
|
true
|
||||||
|
with LocalExit -> false
|
||||||
|
|
||||||
|
let add f x =
|
||||||
|
let n = size f in
|
||||||
|
Array.iter
|
||||||
|
(fun hash -> set_ f.arr (hash x mod n))
|
||||||
|
f.hash_funs
|
||||||
|
|
||||||
|
(*$Q
|
||||||
|
Q.(list int) (fun l -> \
|
||||||
|
let f = create 30 in add_list f l ; \
|
||||||
|
List.for_all (mem f) l)
|
||||||
|
*)
|
||||||
|
|
||||||
|
let union_mut ~into f =
|
||||||
|
if size into <> size f then invalid_arg "CCBloom.union_mut";
|
||||||
|
Bytes.iteri
|
||||||
|
(fun i c ->
|
||||||
|
Bytes.set into.arr i
|
||||||
|
(Char.chr (Char.code (Bytes.get into.arr i) lor (Char.code c)))
|
||||||
|
) f.arr
|
||||||
|
|
||||||
|
let union a b =
|
||||||
|
if size a <> size b then invalid_arg "CCBloom.union";
|
||||||
|
let into = copy a in
|
||||||
|
union_mut ~into b;
|
||||||
|
into
|
||||||
|
|
||||||
|
(*$Q
|
||||||
|
Q.(pair (list int)(list int)) (fun (l1,l2) -> \
|
||||||
|
let f1=create 100 and f2 = create 100 in \
|
||||||
|
add_list f1 l1; add_list f2 l2; \
|
||||||
|
let f = union f1 f2 in \
|
||||||
|
List.for_all (fun i -> not (mem f1 i) || mem f i) l1 && \
|
||||||
|
List.for_all (fun i -> not (mem f2 i) || mem f i) l2)
|
||||||
|
*)
|
||||||
|
|
||||||
|
let inter_mut ~into f =
|
||||||
|
if size into <> size f then invalid_arg "CCBloom.inter_mut";
|
||||||
|
Bytes.iteri
|
||||||
|
(fun i c ->
|
||||||
|
Bytes.set into.arr i
|
||||||
|
(Char.chr (Char.code (Bytes.get into.arr i) land (Char.code c)))
|
||||||
|
) f.arr
|
||||||
|
|
||||||
|
let inter a b =
|
||||||
|
if size a <> size b then invalid_arg "CCBloom.inter";
|
||||||
|
let into = copy a in
|
||||||
|
inter_mut ~into b;
|
||||||
|
into
|
||||||
|
|
||||||
|
(*$Q
|
||||||
|
Q.(pair (list int)(list int)) (fun (l1,l2) -> \
|
||||||
|
let f1=create 100 and f2 = create 100 in \
|
||||||
|
add_list f1 l1; add_list f2 l2; \
|
||||||
|
let f = inter f1 f2 in \
|
||||||
|
List.for_all (fun i -> not (mem f1 i) || not (mem f2 i) || mem f i) (l1@l2))
|
||||||
|
*)
|
||||||
|
|
||||||
|
let add_list f l = List.iter (add f) l
|
||||||
|
|
||||||
|
let add_seq f seq = seq (add f)
|
||||||
|
|
||||||
|
let rec add_gen f g = match g() with
|
||||||
|
| None -> ()
|
||||||
|
| Some x -> add f x; add_gen f g
|
||||||
|
|
||||||
79
src/data/CCBloom.mli
Normal file
79
src/data/CCBloom.mli
Normal file
|
|
@ -0,0 +1,79 @@
|
||||||
|
|
||||||
|
(* This file is free software, part of containers. See file "license" for more details. *)
|
||||||
|
|
||||||
|
(** {1 Bloom Filter}
|
||||||
|
|
||||||
|
{b status: experimental}
|
||||||
|
|
||||||
|
@since NEXT_RELEASE *)
|
||||||
|
|
||||||
|
type 'a sequence = ('a -> unit) -> unit
|
||||||
|
type 'a gen = unit -> 'a option
|
||||||
|
|
||||||
|
type 'a hash_funs = ('a -> int) array
|
||||||
|
(** An array of [k] hash functions on values of type ['a].
|
||||||
|
Never ever modify such an array after use! *)
|
||||||
|
|
||||||
|
val default_hash_funs : int -> 'a hash_funs
|
||||||
|
(** Use {!Hashtbl.seeded_hash} on [k] seeds
|
||||||
|
@param k the number of hash functions required *)
|
||||||
|
|
||||||
|
(** {2 Bloom Filter} *)
|
||||||
|
|
||||||
|
type 'a t
|
||||||
|
(** Bloom filter containing values of type ['a] *)
|
||||||
|
|
||||||
|
val create : ?hash:('a hash_funs) -> int -> 'a t
|
||||||
|
(** [create ?hash size] creates a filter with given size, and functions.
|
||||||
|
By default it uses {!default_hash_funs}
|
||||||
|
@param size a hint for size *)
|
||||||
|
|
||||||
|
val create_default : ?hash_len:int -> int -> 'a t
|
||||||
|
(** [create_default ?hash_len size] is the same as
|
||||||
|
[create ~hash:(default_hash_funs hash_len) size].
|
||||||
|
It uses the given number of default hash functions.
|
||||||
|
@param size a hint for size *)
|
||||||
|
|
||||||
|
val copy : 'a t -> 'a t
|
||||||
|
(** Copy of the filter *)
|
||||||
|
|
||||||
|
val size : _ t -> int
|
||||||
|
(** Length of the underlying array. Do not confuse with a cardinal function,
|
||||||
|
which is impossible to write for bloom filters *)
|
||||||
|
|
||||||
|
val load : _ t -> float
|
||||||
|
(** Ratio of 1 bits in the underlying array. The closer to [1.], the less
|
||||||
|
accurate {!mem} is *)
|
||||||
|
|
||||||
|
val mem : 'a t -> 'a -> bool
|
||||||
|
(** [mem f x] tests whether [x] (probably) belongs in [f] *)
|
||||||
|
|
||||||
|
val add : 'a t -> 'a -> unit
|
||||||
|
(** [add f x] adds [x] into [f] *)
|
||||||
|
|
||||||
|
val union_mut : into:'a t -> 'a t -> unit
|
||||||
|
(** [union_mut ~into f] changes [into] into the union of [into] and [f].
|
||||||
|
[into] and [f] MUST have the same set of hash functions
|
||||||
|
@raise Invalid_argument if the two sets do not have the same size *)
|
||||||
|
|
||||||
|
val union : 'a t -> 'a t -> 'a t
|
||||||
|
(** the sets MUST have the same set of hash functions
|
||||||
|
@raise Invalid_argument if the two sets do not have the same size *)
|
||||||
|
|
||||||
|
val inter_mut : into:'a t -> 'a t -> unit
|
||||||
|
(** [inter_mut ~into f] changes [into] into the intersection of [into] and [f]
|
||||||
|
[into] and [f] MUST have the same set of hash functions
|
||||||
|
@raise Invalid_argument if the two sets do not have the same size *)
|
||||||
|
|
||||||
|
val inter : 'a t -> 'a t -> 'a t
|
||||||
|
(** the sets MUST have the same set of hash functions
|
||||||
|
@raise Invalid_argument if the two sets do not have the same size *)
|
||||||
|
|
||||||
|
(** {2 Conversions} *)
|
||||||
|
|
||||||
|
val add_list : 'a t -> 'a list -> unit
|
||||||
|
|
||||||
|
val add_seq : 'a t -> 'a sequence -> unit
|
||||||
|
|
||||||
|
val add_gen : 'a t -> 'a gen -> unit
|
||||||
|
|
||||||
|
|
@ -11,7 +11,7 @@
|
||||||
Preliminary benchmarks (see the "tbl" section of benchmarks) tend to show
|
Preliminary benchmarks (see the "tbl" section of benchmarks) tend to show
|
||||||
that this type is quite efficient for small data sets.
|
that this type is quite efficient for small data sets.
|
||||||
|
|
||||||
{b status: experimental}
|
{b status: unstable}
|
||||||
|
|
||||||
@since NEXT_RELEASE
|
@since NEXT_RELEASE
|
||||||
*)
|
*)
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue