new module CCBloom in containers.data, a bloom filter

2026-01-29 04:14:51 -05:00 · 2015-09-06 21:44:55 +02:00 · 2015-09-06 21:44:55 +02:00 · ecbddc132b
commit ecbddc132b
parent 369a13cea8
6 changed files with 249 additions and 2 deletions
--- a/README.md
+++ b/README.md
@ -118,6 +118,7 @@ Documentation [here](http://cedeela.fr/~simon/software/containers).
 ### Containers.data
 - `CCBitField`, bitfields embedded in integers
 - `CCBloom`, a bloom filter
 - `CCCache`, memoization caches, LRU, etc.
 - `CCFlatHashtbl`, a flat (open-addressing) hashtable functorial implementation
 - `CCTrie`, a prefix tree
--- a/2
+++ b/2
@ -85,7 +85,7 @@ Library "containers_data"
                    CCPersistentHashtbl, CCDeque, CCFQueue, CCBV, CCMixtbl,
                    CCMixmap, CCRingBuffer, CCIntMap, CCPersistentArray,
                    CCMixset, CCHashconsedSet, CCGraph, CCHashSet, CCBitField,
-                    CCHashTrie
+                    CCHashTrie, CCBloom
  BuildDepends:     bytes
  FindlibParent:    containers
  FindlibName:      data
--- a/doc/intro.txt
+++ b/doc/intro.txt
@ -64,6 +64,7 @@ Various data structures.
 {!modules:
 CCBitField
 CCBloom
 CCBV
 CCCache
 CCFQueue
--- a/src/data/CCBloom.ml
+++ b/src/data/CCBloom.ml
@ -0,0 +1,166 @@
 (* This file is free software, part of containers. See file "license" for more details. *)
 (** {1 Bloom Filter} *)
 type 'a sequence = ('a -> unit) -> unit
 type 'a gen = unit -> 'a option
 type 'a hash_funs = ('a -> int) array
 let primes_ = [|
  2; 3; 5; 7; 11; 13; 17; 19; 23; 29; 31; 37; 41; 43; 47; 53; 59; 61; 67; 71;
  73; 79; 83; 89; 97; 101; 103; 107; 109; 113; 127; 131; 137; 139;
  149; 151; 157; 163; 167; 173
 |]
 let default_hash_funs k =
  Array.init k
    (fun i ->
      let seed = if i<Array.length primes_ then primes_.(i) else i in
      fun x -> Hashtbl.seeded_hash seed x
    )
 (** {2 Bloom Filter} *)
 type 'a t = {
  hash_funs : 'a hash_funs;
  arr : Bytes.t;
 }
 let mk_default_ size =
  default_hash_funs (max 2 (size / 20))
 let create ?hash size =
  if size < 2 then invalid_arg "CCBloom.create";
  let hash_funs = match hash with
    | None -> mk_default_ size
    | Some h -> h
  in
  let arr = Bytes.make size '\000' in
  { hash_funs; arr }
 let create_default ?hash_len size =
  let hash = match hash_len with
    | None -> mk_default_ size
    | Some n -> default_hash_funs n
  in
  create ~hash size
 let copy f =
  {f with arr= Bytes.copy f.arr }
 let size f = 8 * Bytes.length f.arr
 (* number of 1 bits in [c] *)
 let rec popcount_byte_ c =
  if c=0 then 0
  else
    (c land 1) + popcount_byte_ (c lsr 1)
 let () = assert (
  popcount_byte_ 0 = 0 &&
  popcount_byte_ 3 = 2 &&
  popcount_byte_ 255 = 8
 )
 (* count the number of 1 bits *)
 let rec count_ones_ arr i acc =
  if i=Bytes.length arr then acc
  else
    let c = Char.code (Bytes.get arr i) in
    count_ones_ arr (i+1) (acc + popcount_byte_ c)
 let load f =
  let ones = count_ones_ f.arr 0 0 in
  float_of_int ones /. (float_of_int (Bytes.length f.arr * 8))
 exception LocalExit
 (* get i-th bit *)
 let get_ arr i =
  let j = i / 8 in
  let c = Char.code (Bytes.get arr j) in
  c land (1 lsl (i mod 8)) <> 0
 (* set i-th bit *)
 let set_ arr i =
  let j = i / 8 in
  let c = Char.code (Bytes.get arr j) in
  let c = c lor (1 lsl (i mod 8)) in
  Bytes.set arr j (Char.chr c)
 let mem f x =
  let n = size f in
  try
    Array.iter
      (fun hash -> if not (get_ f.arr (hash x mod n)) then raise LocalExit)
      f.hash_funs;
    true
  with LocalExit -> false
 let add f x =
  let n = size f in
  Array.iter
    (fun hash -> set_ f.arr (hash x mod n))
    f.hash_funs
 (*$Q
  Q.(list int) (fun l -> \
    let f = create 30 in add_list f l ; \
    List.for_all (mem f) l)
 *)
 let union_mut ~into f =
  if size into <> size f then invalid_arg "CCBloom.union_mut";
  Bytes.iteri
    (fun i c ->
      Bytes.set into.arr i
        (Char.chr (Char.code (Bytes.get into.arr i) lor (Char.code c)))
    ) f.arr
 let union a b =
  if size a <> size b then invalid_arg "CCBloom.union";
  let into = copy a in
  union_mut ~into b;
  into
 (*$Q
  Q.(pair (list int)(list int)) (fun (l1,l2) -> \
    let f1=create 100 and f2 = create 100 in \
    add_list f1 l1; add_list f2 l2; \
    let f = union f1 f2 in \
    List.for_all (fun i -> not (mem f1 i) || mem f i) l1 && \
    List.for_all (fun i -> not (mem f2 i) || mem f i) l2)
 *)
 let inter_mut ~into f =
  if size into <> size f then invalid_arg "CCBloom.inter_mut";
  Bytes.iteri
    (fun i c ->
      Bytes.set into.arr i
        (Char.chr (Char.code (Bytes.get into.arr i) land (Char.code c)))
    ) f.arr
 let inter a b =
  if size a <> size b then invalid_arg "CCBloom.inter";
  let into = copy a in
  inter_mut ~into b;
  into
 (*$Q
  Q.(pair (list int)(list int)) (fun (l1,l2) -> \
    let f1=create 100 and f2 = create 100 in \
    add_list f1 l1; add_list f2 l2; \
    let f = inter f1 f2 in \
    List.for_all (fun i -> not (mem f1 i) || not (mem f2 i) || mem f i) (l1@l2))
 *)
 let add_list f l = List.iter (add f) l
 let add_seq f seq = seq (add f)
 let rec add_gen f g = match g() with
  | None -> ()
  | Some x -> add f x; add_gen f g
--- a/src/data/CCBloom.mli
+++ b/src/data/CCBloom.mli
@ -0,0 +1,79 @@
 (* This file is free software, part of containers. See file "license" for more details. *)
 (** {1 Bloom Filter}
    {b status: experimental}
    @since NEXT_RELEASE *)
 type 'a sequence = ('a -> unit) -> unit
 type 'a gen = unit -> 'a option
 type 'a hash_funs = ('a -> int) array
 (** An array of [k] hash functions on values of type ['a].
    Never ever modify such an array after use! *)
 val default_hash_funs : int -> 'a hash_funs
 (** Use {!Hashtbl.seeded_hash} on [k] seeds
    @param k the number of hash functions required *)
 (** {2 Bloom Filter} *)
 type 'a t
 (** Bloom filter containing values of type ['a] *)
 val create : ?hash:('a hash_funs) -> int -> 'a t
 (** [create ?hash size] creates a filter with given size, and functions.
    By default it uses {!default_hash_funs}
    @param size a hint for size *)
 val create_default : ?hash_len:int -> int -> 'a t
 (** [create_default ?hash_len size] is the same as
    [create ~hash:(default_hash_funs hash_len) size].
    It uses the given number of default hash functions.
    @param size a hint for size *)
 val copy : 'a t -> 'a t
 (** Copy of the filter *)
 val size : _ t -> int
 (** Length of the underlying array. Do not confuse with a cardinal function,
    which is impossible to write for bloom filters *)
 val load : _ t -> float
 (** Ratio of 1 bits in the underlying array. The closer to [1.], the less
    accurate {!mem} is *)
 val mem : 'a t -> 'a -> bool
 (** [mem f x] tests whether [x] (probably) belongs in [f] *)
 val add : 'a t -> 'a -> unit
 (** [add f x] adds [x] into [f] *)
 val union_mut : into:'a t -> 'a t -> unit
 (** [union_mut ~into f] changes [into] into the union of [into] and [f].
    [into] and [f] MUST have the same set of hash functions
    @raise Invalid_argument if the two sets do not have the same size *)
 val union : 'a t -> 'a t -> 'a t
 (** the sets MUST have the same set of hash functions
    @raise Invalid_argument if the two sets do not have the same size *)
 val inter_mut : into:'a t -> 'a t -> unit
 (** [inter_mut ~into f] changes [into] into the intersection of [into] and [f]
    [into] and [f] MUST have the same set of hash functions
    @raise Invalid_argument if the two sets do not have the same size *)
 val inter : 'a t -> 'a t -> 'a t
 (** the sets MUST have the same set of hash functions
    @raise Invalid_argument if the two sets do not have the same size *)
 (** {2 Conversions} *)
 val add_list : 'a t -> 'a list -> unit
 val add_seq : 'a t -> 'a sequence -> unit
 val add_gen : 'a t -> 'a gen -> unit
--- a/src/data/CCHashTrie.mli
+++ b/src/data/CCHashTrie.mli
@ -11,7 +11,7 @@
  Preliminary benchmarks (see the "tbl" section of benchmarks) tend to show
  that this type is quite efficient for small data sets.
-  {b status: experimental}
+  {b status: unstable}
  @since NEXT_RELEASE
 *)