more tests, bugfixes, and benchs for KMP in CCString

This commit is contained in:
Simon Cruanes 2016-03-09 22:24:57 +01:00
parent fb8661d1ba
commit ab0b198f97
3 changed files with 57 additions and 36 deletions

View file

@ -1156,13 +1156,12 @@ end
module Str = struct module Str = struct
(* random string, but always returns the same for a given size *) (* random string, but always returns the same for a given size *)
let rand_str_ n = let rand_str_ ?(among="abcdefgh") n =
let module Q = Quickcheck in let module Q = Quickcheck in
let st = Random.State.make [| n |] in let st = Random.State.make [| n |] in
let gen_c = Q.Gen.oneofl (CCString.to_list "abcdefghijkl") in let gen_c = Q.Gen.oneofl (CCString.to_list among) in
Q.Gen.string_size ~gen:gen_c (Q.Gen.return n) st Q.Gen.string_size ~gen:gen_c (Q.Gen.return n) st
(* note: inefficient *)
let find ?(start=0) ~sub s = let find ?(start=0) ~sub s =
let n = String.length sub in let n = String.length sub in
let i = ref start in let i = ref start in
@ -1175,7 +1174,6 @@ module Str = struct
with Exit -> with Exit ->
!i !i
(* note: inefficient *)
let rfind ~sub s = let rfind ~sub s =
let n = String.length sub in let n = String.length sub in
let i = ref (String.length s - n) in let i = ref (String.length s - n) in
@ -1235,6 +1233,19 @@ module Str = struct
; "current", mk_current, () ; "current", mk_current, ()
] ]
(* benchmark String.find_all on constant strings *)
let bench_find_all_special ~size n =
let needle = CCString.repeat "a" (size-1) ^ "b" in
let haystack = rand_str_ ~among:"ab" n in
pp_pb needle haystack;
let mk_naive () = find_all_l ~sub:needle haystack
and mk_current () = CCString.find_all_l ~sub:needle haystack in
assert (mk_naive () = mk_current ());
B.throughputN 3 ~repeat
[ "naive", mk_naive, ()
; "current", mk_current, ()
]
let bench_find = bench_find_ ~dir:`Direct let bench_find = bench_find_ ~dir:`Direct
let bench_rfind = bench_find_ ~dir:`Reverse let bench_rfind = bench_find_ ~dir:`Reverse
@ -1242,6 +1253,7 @@ module Str = struct
"string" @>>> "string" @>>>
[ "find" @>>> [ "find" @>>>
[ "1" @>> app_ints (bench_find ~size:1) [100; 100_000; 500_000] [ "1" @>> app_ints (bench_find ~size:1) [100; 100_000; 500_000]
; "3" @>> app_ints (bench_find ~size:3) [100; 100_000; 500_000]
; "5" @>> app_ints (bench_find ~size:5) [100; 100_000; 500_000] ; "5" @>> app_ints (bench_find ~size:5) [100; 100_000; 500_000]
; "15" @>> app_ints (bench_find ~size:15) [100; 100_000; 500_000] ; "15" @>> app_ints (bench_find ~size:15) [100; 100_000; 500_000]
; "50" @>> app_ints (bench_find ~size:50) [100; 100_000; 500_000] ; "50" @>> app_ints (bench_find ~size:50) [100; 100_000; 500_000]
@ -1249,13 +1261,16 @@ module Str = struct
]; ];
"find_all" @>>> "find_all" @>>>
[ "1" @>> app_ints (bench_find_all ~size:1) [100; 100_000; 500_000] [ "1" @>> app_ints (bench_find_all ~size:1) [100; 100_000; 500_000]
; "3" @>> app_ints (bench_find_all ~size:3) [100; 100_000; 500_000]
; "5" @>> app_ints (bench_find_all ~size:5) [100; 100_000; 500_000] ; "5" @>> app_ints (bench_find_all ~size:5) [100; 100_000; 500_000]
; "15" @>> app_ints (bench_find_all ~size:15) [100; 100_000; 500_000] ; "15" @>> app_ints (bench_find_all ~size:15) [100; 100_000; 500_000]
; "50" @>> app_ints (bench_find_all ~size:50) [100; 100_000; 500_000] ; "50" @>> app_ints (bench_find_all ~size:50) [100; 100_000; 500_000]
; "500" @>> app_ints (bench_find_all ~size:500) [100_000; 500_000] ; "500" @>> app_ints (bench_find_all ~size:500) [100_000; 500_000]
; "special" @>> app_ints (bench_find_all_special ~size:6) [100_000; 500_000]
]; ];
"rfind" @>>> "rfind" @>>>
[ "15" @>> app_ints (bench_rfind ~size:15) [100; 100_000; 500_000] [ "3" @>> app_ints (bench_rfind ~size:3) [100; 100_000; 500_000]
; "15" @>> app_ints (bench_rfind ~size:15) [100; 100_000; 500_000]
; "50" @>> app_ints (bench_rfind ~size:50) [100; 100_000; 500_000] ; "50" @>> app_ints (bench_rfind ~size:50) [100; 100_000; 500_000]
; "500" @>> app_ints (bench_rfind ~size:500) [100_000; 500_000] ; "500" @>> app_ints (bench_rfind ~size:500) [100_000; 500_000]
]; ];

View file

@ -74,6 +74,7 @@ type _ direction =
| Direct : [`Direct] direction | Direct : [`Direct] direction
| Reverse : [`Reverse] direction | Reverse : [`Reverse] direction
(* we follow https://en.wikipedia.org/wiki/KnuthMorrisPratt_algorithm *)
module KMP = struct module KMP = struct
type 'a pattern = { type 'a pattern = {
failure : int array; failure : int array;
@ -88,7 +89,7 @@ module KMP = struct
let get_ let get_
: type a. dir:a direction -> string -> int -> char : type a. dir:a direction -> string -> int -> char
= fun ~dir -> match dir with = fun ~dir -> match dir with
| Direct -> (fun s i -> s.[i]) | Direct -> String.get
| Reverse -> (fun s i -> s.[String.length s - i - 1]) | Reverse -> (fun s i -> s.[String.length s - i - 1])
let compile_ let compile_
@ -102,26 +103,28 @@ module KMP = struct
| _ -> | _ ->
(* at least 2 elements, the algorithm can work *) (* at least 2 elements, the algorithm can work *)
let failure = Array.make len 0 in let failure = Array.make len 0 in
failure.(0) <- -1;
(* i: current index in str *) (* i: current index in str *)
let i = ref 1 in let i = ref 2 in
(* j: index of candidate substring *) (* j: index of candidate substring *)
let j = ref 0 in let j = ref 0 in
while !i < len-1 do while !i < len do
match !j with match !j with
| _ when get str !i = get str !j -> | _ when get str (!i-1) = get str !j ->
(* substring starting at !j continues matching current char *) (* substring starting at !j continues matching current char *)
i := !i+1; incr j;
j := !j+1;
failure.(!i) <- !j; failure.(!i) <- !j;
incr i;
| 0 -> | 0 ->
(* back to the beginning *) (* back to the beginning *)
i := !i+1;
failure.(!i) <- 0; failure.(!i) <- 0;
incr i;
| _ -> | _ ->
(* fallback for the prefix string *) (* fallback for the prefix string *)
assert (!j > 0); assert (!j > 0);
j := failure.(!j) j := failure.(!j)
done; done;
(* Format.printf "{@[failure:%a, str:%s@]}@." CCFormat.(array int) failure str; *)
{ failure; str; } { failure; str; }
let compile s = compile_ ~dir:Direct s let compile s = compile_ ~dir:Direct s
@ -139,23 +142,30 @@ module KMP = struct
let i = ref idx in let i = ref idx in
let j = ref 0 in let j = ref 0 in
let pat_len = pattern_length pattern in let pat_len = pattern_length pattern in
while !i < len && !j < pat_len do while !j < pat_len && !i + !j < len do
let c = get s !i in let c = get s (!i + !j) in
let expected = get pattern.str !j in let expected = get pattern.str !j in
if c = expected if c = expected
then ( then (
(* char matches *) (* char matches *)
i := !i + 1; j := !j + 1 incr j;
) else ( ) else (
if !j=0 let fail_offset = pattern.failure.(!j) in
then (* beginning of the pattern *) if fail_offset >= 0
i := !i + 1 then (
else (* follow the failure link *) assert (fail_offset < !j);
j := pattern.failure.(!j) (* follow the failure link *)
i := !i + !j - fail_offset;
j := fail_offset
) else (
(* beginning of pattern *)
j := 0;
incr i
)
) )
done; done;
if !j = pat_len if !j = pat_len
then !i - pat_len then !i
else -1 else -1
let find ~pattern s i = find_ ~dir:Direct ~pattern s i let find ~pattern s i = find_ ~dir:Direct ~pattern s i
@ -242,29 +252,20 @@ module Split = struct
| SplitStop | SplitStop
| SplitAt of int (* previous *) | SplitAt of int (* previous *)
(* [by_j... prefix of s_i...] ? *)
let rec _is_prefix ~by s i j =
j = String.length by
||
( i < String.length s &&
s.[i] = by.[j] &&
_is_prefix ~by s (i+1) (j+1)
)
let rec _split ~by s state = match state with let rec _split ~by s state = match state with
| SplitStop -> None | SplitStop -> None
| SplitAt prev -> _split_search ~by s prev prev | SplitAt prev -> _split_search ~by s prev
and _split_search ~by s prev i = and _split_search ~by s prev =
if i >= String.length s let j = KMP.find ~pattern:by s prev in
if j < 0
then Some (SplitStop, prev, String.length s - prev) then Some (SplitStop, prev, String.length s - prev)
else if _is_prefix ~by s i 0 else Some (SplitAt (j+KMP.pattern_length by), prev, j-prev)
then Some (SplitAt (i+String.length by), prev, i-prev)
else _split_search ~by s prev (i+1)
let _tuple3 x y z = x,y,z let _tuple3 x y z = x,y,z
let _mkgen ~by s k = let _mkgen ~by s k =
let state = ref (SplitAt 0) in let state = ref (SplitAt 0) in
let by = KMP.compile by in
fun () -> fun () ->
match _split ~by s !state with match _split ~by s !state with
| None -> None | None -> None
@ -277,6 +278,7 @@ module Split = struct
let gen_cpy ~by s = _mkgen ~by s String.sub let gen_cpy ~by s = _mkgen ~by s String.sub
let _mklist ~by s k = let _mklist ~by s k =
let by = KMP.compile by in
let rec build acc state = match _split ~by s state with let rec build acc state = match _split ~by s state with
| None -> List.rev acc | None -> List.rev acc
| Some (state', i, len) -> | Some (state', i, len) ->
@ -289,6 +291,7 @@ module Split = struct
let list_cpy ~by s = _mklist ~by s String.sub let list_cpy ~by s = _mklist ~by s String.sub
let _mkklist ~by s k = let _mkklist ~by s k =
let by = KMP.compile by in
let rec make state () = match _split ~by s state with let rec make state () = match _split ~by s state with
| None -> `Nil | None -> `Nil
| Some (state', i, len) -> | Some (state', i, len) ->
@ -300,6 +303,7 @@ module Split = struct
let klist_cpy ~by s = _mkklist ~by s String.sub let klist_cpy ~by s = _mkklist ~by s String.sub
let _mkseq ~by s f k = let _mkseq ~by s f k =
let by = KMP.compile by in
let rec aux state = match _split ~by s state with let rec aux state = match _split ~by s state with
| None -> () | None -> ()
| Some (state', i, len) -> k (f s i len); aux state' | Some (state', i, len) -> k (f s i len); aux state'

View file

@ -122,6 +122,8 @@ val find_all_l : ?start:int -> sub:string -> string -> int list
(*$= & ~printer:Q.Print.(list int) (*$= & ~printer:Q.Print.(list int)
[1; 6] (find_all_l ~sub:"bc" "abc aabc aab") [1; 6] (find_all_l ~sub:"bc" "abc aabc aab")
[] (find_all_l ~sub:"bc" "abd") [] (find_all_l ~sub:"bc" "abd")
[76] (find_all_l ~sub:"aaaaaa" \
"aabbaabbaaaaabbbbabababababbbbabbbabbaaababbbaaabaabbaabbaaaabbababaaaabbaabaaaaaabbbaaaabababaabaaabbaabaaaabbababbaabbaaabaabbabababbbaabababaaabaaababbbaaaabbbaabaaababbabaababbaabbaaaaabababbabaababbbaaabbabbabababaaaabaaababaaaaabbabbaabbabbbbbbbbbbbbbbaabbabbbbbabbaaabbabbbbabaaaaabbababbbaaaa")
*) *)
val mem : ?start:int -> sub:string -> string -> bool val mem : ?start:int -> sub:string -> string -> bool