mirror of
https://github.com/c-cube/ocaml-containers.git
synced 2025-12-06 03:05:28 -05:00
more tests, bugfixes, and benchs for KMP in CCString
This commit is contained in:
parent
fb8661d1ba
commit
ab0b198f97
3 changed files with 57 additions and 36 deletions
|
|
@ -1156,13 +1156,12 @@ end
|
|||
|
||||
module Str = struct
|
||||
(* random string, but always returns the same for a given size *)
|
||||
let rand_str_ n =
|
||||
let rand_str_ ?(among="abcdefgh") n =
|
||||
let module Q = Quickcheck in
|
||||
let st = Random.State.make [| n |] in
|
||||
let gen_c = Q.Gen.oneofl (CCString.to_list "abcdefghijkl") in
|
||||
let gen_c = Q.Gen.oneofl (CCString.to_list among) in
|
||||
Q.Gen.string_size ~gen:gen_c (Q.Gen.return n) st
|
||||
|
||||
(* note: inefficient *)
|
||||
let find ?(start=0) ~sub s =
|
||||
let n = String.length sub in
|
||||
let i = ref start in
|
||||
|
|
@ -1175,7 +1174,6 @@ module Str = struct
|
|||
with Exit ->
|
||||
!i
|
||||
|
||||
(* note: inefficient *)
|
||||
let rfind ~sub s =
|
||||
let n = String.length sub in
|
||||
let i = ref (String.length s - n) in
|
||||
|
|
@ -1235,6 +1233,19 @@ module Str = struct
|
|||
; "current", mk_current, ()
|
||||
]
|
||||
|
||||
(* benchmark String.find_all on constant strings *)
|
||||
let bench_find_all_special ~size n =
|
||||
let needle = CCString.repeat "a" (size-1) ^ "b" in
|
||||
let haystack = rand_str_ ~among:"ab" n in
|
||||
pp_pb needle haystack;
|
||||
let mk_naive () = find_all_l ~sub:needle haystack
|
||||
and mk_current () = CCString.find_all_l ~sub:needle haystack in
|
||||
assert (mk_naive () = mk_current ());
|
||||
B.throughputN 3 ~repeat
|
||||
[ "naive", mk_naive, ()
|
||||
; "current", mk_current, ()
|
||||
]
|
||||
|
||||
let bench_find = bench_find_ ~dir:`Direct
|
||||
let bench_rfind = bench_find_ ~dir:`Reverse
|
||||
|
||||
|
|
@ -1242,6 +1253,7 @@ module Str = struct
|
|||
"string" @>>>
|
||||
[ "find" @>>>
|
||||
[ "1" @>> app_ints (bench_find ~size:1) [100; 100_000; 500_000]
|
||||
; "3" @>> app_ints (bench_find ~size:3) [100; 100_000; 500_000]
|
||||
; "5" @>> app_ints (bench_find ~size:5) [100; 100_000; 500_000]
|
||||
; "15" @>> app_ints (bench_find ~size:15) [100; 100_000; 500_000]
|
||||
; "50" @>> app_ints (bench_find ~size:50) [100; 100_000; 500_000]
|
||||
|
|
@ -1249,13 +1261,16 @@ module Str = struct
|
|||
];
|
||||
"find_all" @>>>
|
||||
[ "1" @>> app_ints (bench_find_all ~size:1) [100; 100_000; 500_000]
|
||||
; "3" @>> app_ints (bench_find_all ~size:3) [100; 100_000; 500_000]
|
||||
; "5" @>> app_ints (bench_find_all ~size:5) [100; 100_000; 500_000]
|
||||
; "15" @>> app_ints (bench_find_all ~size:15) [100; 100_000; 500_000]
|
||||
; "50" @>> app_ints (bench_find_all ~size:50) [100; 100_000; 500_000]
|
||||
; "500" @>> app_ints (bench_find_all ~size:500) [100_000; 500_000]
|
||||
; "special" @>> app_ints (bench_find_all_special ~size:6) [100_000; 500_000]
|
||||
];
|
||||
"rfind" @>>>
|
||||
[ "15" @>> app_ints (bench_rfind ~size:15) [100; 100_000; 500_000]
|
||||
[ "3" @>> app_ints (bench_rfind ~size:3) [100; 100_000; 500_000]
|
||||
; "15" @>> app_ints (bench_rfind ~size:15) [100; 100_000; 500_000]
|
||||
; "50" @>> app_ints (bench_rfind ~size:50) [100; 100_000; 500_000]
|
||||
; "500" @>> app_ints (bench_rfind ~size:500) [100_000; 500_000]
|
||||
];
|
||||
|
|
|
|||
|
|
@ -74,6 +74,7 @@ type _ direction =
|
|||
| Direct : [`Direct] direction
|
||||
| Reverse : [`Reverse] direction
|
||||
|
||||
(* we follow https://en.wikipedia.org/wiki/Knuth–Morris–Pratt_algorithm *)
|
||||
module KMP = struct
|
||||
type 'a pattern = {
|
||||
failure : int array;
|
||||
|
|
@ -88,7 +89,7 @@ module KMP = struct
|
|||
let get_
|
||||
: type a. dir:a direction -> string -> int -> char
|
||||
= fun ~dir -> match dir with
|
||||
| Direct -> (fun s i -> s.[i])
|
||||
| Direct -> String.get
|
||||
| Reverse -> (fun s i -> s.[String.length s - i - 1])
|
||||
|
||||
let compile_
|
||||
|
|
@ -102,26 +103,28 @@ module KMP = struct
|
|||
| _ ->
|
||||
(* at least 2 elements, the algorithm can work *)
|
||||
let failure = Array.make len 0 in
|
||||
failure.(0) <- -1;
|
||||
(* i: current index in str *)
|
||||
let i = ref 1 in
|
||||
let i = ref 2 in
|
||||
(* j: index of candidate substring *)
|
||||
let j = ref 0 in
|
||||
while !i < len-1 do
|
||||
while !i < len do
|
||||
match !j with
|
||||
| _ when get str !i = get str !j ->
|
||||
| _ when get str (!i-1) = get str !j ->
|
||||
(* substring starting at !j continues matching current char *)
|
||||
i := !i+1;
|
||||
j := !j+1;
|
||||
incr j;
|
||||
failure.(!i) <- !j;
|
||||
incr i;
|
||||
| 0 ->
|
||||
(* back to the beginning *)
|
||||
i := !i+1;
|
||||
failure.(!i) <- 0;
|
||||
incr i;
|
||||
| _ ->
|
||||
(* fallback for the prefix string *)
|
||||
assert (!j > 0);
|
||||
j := failure.(!j)
|
||||
done;
|
||||
(* Format.printf "{@[failure:%a, str:%s@]}@." CCFormat.(array int) failure str; *)
|
||||
{ failure; str; }
|
||||
|
||||
let compile s = compile_ ~dir:Direct s
|
||||
|
|
@ -139,23 +142,30 @@ module KMP = struct
|
|||
let i = ref idx in
|
||||
let j = ref 0 in
|
||||
let pat_len = pattern_length pattern in
|
||||
while !i < len && !j < pat_len do
|
||||
let c = get s !i in
|
||||
while !j < pat_len && !i + !j < len do
|
||||
let c = get s (!i + !j) in
|
||||
let expected = get pattern.str !j in
|
||||
if c = expected
|
||||
then (
|
||||
(* char matches *)
|
||||
i := !i + 1; j := !j + 1
|
||||
incr j;
|
||||
) else (
|
||||
if !j=0
|
||||
then (* beginning of the pattern *)
|
||||
i := !i + 1
|
||||
else (* follow the failure link *)
|
||||
j := pattern.failure.(!j)
|
||||
let fail_offset = pattern.failure.(!j) in
|
||||
if fail_offset >= 0
|
||||
then (
|
||||
assert (fail_offset < !j);
|
||||
(* follow the failure link *)
|
||||
i := !i + !j - fail_offset;
|
||||
j := fail_offset
|
||||
) else (
|
||||
(* beginning of pattern *)
|
||||
j := 0;
|
||||
incr i
|
||||
)
|
||||
)
|
||||
done;
|
||||
if !j = pat_len
|
||||
then !i - pat_len
|
||||
then !i
|
||||
else -1
|
||||
|
||||
let find ~pattern s i = find_ ~dir:Direct ~pattern s i
|
||||
|
|
@ -242,29 +252,20 @@ module Split = struct
|
|||
| SplitStop
|
||||
| SplitAt of int (* previous *)
|
||||
|
||||
(* [by_j... prefix of s_i...] ? *)
|
||||
let rec _is_prefix ~by s i j =
|
||||
j = String.length by
|
||||
||
|
||||
( i < String.length s &&
|
||||
s.[i] = by.[j] &&
|
||||
_is_prefix ~by s (i+1) (j+1)
|
||||
)
|
||||
|
||||
let rec _split ~by s state = match state with
|
||||
| SplitStop -> None
|
||||
| SplitAt prev -> _split_search ~by s prev prev
|
||||
and _split_search ~by s prev i =
|
||||
if i >= String.length s
|
||||
| SplitAt prev -> _split_search ~by s prev
|
||||
and _split_search ~by s prev =
|
||||
let j = KMP.find ~pattern:by s prev in
|
||||
if j < 0
|
||||
then Some (SplitStop, prev, String.length s - prev)
|
||||
else if _is_prefix ~by s i 0
|
||||
then Some (SplitAt (i+String.length by), prev, i-prev)
|
||||
else _split_search ~by s prev (i+1)
|
||||
else Some (SplitAt (j+KMP.pattern_length by), prev, j-prev)
|
||||
|
||||
let _tuple3 x y z = x,y,z
|
||||
|
||||
let _mkgen ~by s k =
|
||||
let state = ref (SplitAt 0) in
|
||||
let by = KMP.compile by in
|
||||
fun () ->
|
||||
match _split ~by s !state with
|
||||
| None -> None
|
||||
|
|
@ -277,6 +278,7 @@ module Split = struct
|
|||
let gen_cpy ~by s = _mkgen ~by s String.sub
|
||||
|
||||
let _mklist ~by s k =
|
||||
let by = KMP.compile by in
|
||||
let rec build acc state = match _split ~by s state with
|
||||
| None -> List.rev acc
|
||||
| Some (state', i, len) ->
|
||||
|
|
@ -289,6 +291,7 @@ module Split = struct
|
|||
let list_cpy ~by s = _mklist ~by s String.sub
|
||||
|
||||
let _mkklist ~by s k =
|
||||
let by = KMP.compile by in
|
||||
let rec make state () = match _split ~by s state with
|
||||
| None -> `Nil
|
||||
| Some (state', i, len) ->
|
||||
|
|
@ -300,6 +303,7 @@ module Split = struct
|
|||
let klist_cpy ~by s = _mkklist ~by s String.sub
|
||||
|
||||
let _mkseq ~by s f k =
|
||||
let by = KMP.compile by in
|
||||
let rec aux state = match _split ~by s state with
|
||||
| None -> ()
|
||||
| Some (state', i, len) -> k (f s i len); aux state'
|
||||
|
|
|
|||
|
|
@ -122,6 +122,8 @@ val find_all_l : ?start:int -> sub:string -> string -> int list
|
|||
(*$= & ~printer:Q.Print.(list int)
|
||||
[1; 6] (find_all_l ~sub:"bc" "abc aabc aab")
|
||||
[] (find_all_l ~sub:"bc" "abd")
|
||||
[76] (find_all_l ~sub:"aaaaaa" \
|
||||
"aabbaabbaaaaabbbbabababababbbbabbbabbaaababbbaaabaabbaabbaaaabbababaaaabbaabaaaaaabbbaaaabababaabaaabbaabaaaabbababbaabbaaabaabbabababbbaabababaaabaaababbbaaaabbbaabaaababbabaababbaabbaaaaabababbabaababbbaaabbabbabababaaaabaaababaaaaabbabbaabbabbbbbbbbbbbbbbaabbabbbbbabbaaabbabbbbabaaaaabbababbbaaaa")
|
||||
*)
|
||||
|
||||
val mem : ?start:int -> sub:string -> string -> bool
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue