diff --git a/benchs/run_benchs.ml b/benchs/run_benchs.ml index 3daf52ae..593c8da7 100644 --- a/benchs/run_benchs.ml +++ b/benchs/run_benchs.ml @@ -1156,13 +1156,12 @@ end module Str = struct (* random string, but always returns the same for a given size *) - let rand_str_ n = + let rand_str_ ?(among="abcdefgh") n = let module Q = Quickcheck in let st = Random.State.make [| n |] in - let gen_c = Q.Gen.oneofl (CCString.to_list "abcdefghijkl") in + let gen_c = Q.Gen.oneofl (CCString.to_list among) in Q.Gen.string_size ~gen:gen_c (Q.Gen.return n) st - (* note: inefficient *) let find ?(start=0) ~sub s = let n = String.length sub in let i = ref start in @@ -1175,7 +1174,6 @@ module Str = struct with Exit -> !i - (* note: inefficient *) let rfind ~sub s = let n = String.length sub in let i = ref (String.length s - n) in @@ -1235,6 +1233,19 @@ module Str = struct ; "current", mk_current, () ] + (* benchmark String.find_all on constant strings *) + let bench_find_all_special ~size n = + let needle = CCString.repeat "a" (size-1) ^ "b" in + let haystack = rand_str_ ~among:"ab" n in + pp_pb needle haystack; + let mk_naive () = find_all_l ~sub:needle haystack + and mk_current () = CCString.find_all_l ~sub:needle haystack in + assert (mk_naive () = mk_current ()); + B.throughputN 3 ~repeat + [ "naive", mk_naive, () + ; "current", mk_current, () + ] + let bench_find = bench_find_ ~dir:`Direct let bench_rfind = bench_find_ ~dir:`Reverse @@ -1242,6 +1253,7 @@ module Str = struct "string" @>>> [ "find" @>>> [ "1" @>> app_ints (bench_find ~size:1) [100; 100_000; 500_000] + ; "3" @>> app_ints (bench_find ~size:3) [100; 100_000; 500_000] ; "5" @>> app_ints (bench_find ~size:5) [100; 100_000; 500_000] ; "15" @>> app_ints (bench_find ~size:15) [100; 100_000; 500_000] ; "50" @>> app_ints (bench_find ~size:50) [100; 100_000; 500_000] @@ -1249,13 +1261,16 @@ module Str = struct ]; "find_all" @>>> [ "1" @>> app_ints (bench_find_all ~size:1) [100; 100_000; 500_000] + ; "3" @>> app_ints (bench_find_all ~size:3) [100; 100_000; 500_000] ; "5" @>> app_ints (bench_find_all ~size:5) [100; 100_000; 500_000] ; "15" @>> app_ints (bench_find_all ~size:15) [100; 100_000; 500_000] ; "50" @>> app_ints (bench_find_all ~size:50) [100; 100_000; 500_000] ; "500" @>> app_ints (bench_find_all ~size:500) [100_000; 500_000] + ; "special" @>> app_ints (bench_find_all_special ~size:6) [100_000; 500_000] ]; "rfind" @>>> - [ "15" @>> app_ints (bench_rfind ~size:15) [100; 100_000; 500_000] + [ "3" @>> app_ints (bench_rfind ~size:3) [100; 100_000; 500_000] + ; "15" @>> app_ints (bench_rfind ~size:15) [100; 100_000; 500_000] ; "50" @>> app_ints (bench_rfind ~size:50) [100; 100_000; 500_000] ; "500" @>> app_ints (bench_rfind ~size:500) [100_000; 500_000] ]; diff --git a/src/core/CCString.cppo.ml b/src/core/CCString.cppo.ml index 5bca58d8..ee2ab50b 100644 --- a/src/core/CCString.cppo.ml +++ b/src/core/CCString.cppo.ml @@ -74,6 +74,7 @@ type _ direction = | Direct : [`Direct] direction | Reverse : [`Reverse] direction +(* we follow https://en.wikipedia.org/wiki/Knuth–Morris–Pratt_algorithm *) module KMP = struct type 'a pattern = { failure : int array; @@ -88,7 +89,7 @@ module KMP = struct let get_ : type a. dir:a direction -> string -> int -> char = fun ~dir -> match dir with - | Direct -> (fun s i -> s.[i]) + | Direct -> String.get | Reverse -> (fun s i -> s.[String.length s - i - 1]) let compile_ @@ -102,26 +103,28 @@ module KMP = struct | _ -> (* at least 2 elements, the algorithm can work *) let failure = Array.make len 0 in + failure.(0) <- -1; (* i: current index in str *) - let i = ref 1 in + let i = ref 2 in (* j: index of candidate substring *) let j = ref 0 in - while !i < len-1 do + while !i < len do match !j with - | _ when get str !i = get str !j -> + | _ when get str (!i-1) = get str !j -> (* substring starting at !j continues matching current char *) - i := !i+1; - j := !j+1; + incr j; failure.(!i) <- !j; + incr i; | 0 -> (* back to the beginning *) - i := !i+1; failure.(!i) <- 0; + incr i; | _ -> (* fallback for the prefix string *) assert (!j > 0); j := failure.(!j) done; + (* Format.printf "{@[failure:%a, str:%s@]}@." CCFormat.(array int) failure str; *) { failure; str; } let compile s = compile_ ~dir:Direct s @@ -139,23 +142,30 @@ module KMP = struct let i = ref idx in let j = ref 0 in let pat_len = pattern_length pattern in - while !i < len && !j < pat_len do - let c = get s !i in + while !j < pat_len && !i + !j < len do + let c = get s (!i + !j) in let expected = get pattern.str !j in if c = expected then ( (* char matches *) - i := !i + 1; j := !j + 1 + incr j; ) else ( - if !j=0 - then (* beginning of the pattern *) - i := !i + 1 - else (* follow the failure link *) - j := pattern.failure.(!j) + let fail_offset = pattern.failure.(!j) in + if fail_offset >= 0 + then ( + assert (fail_offset < !j); + (* follow the failure link *) + i := !i + !j - fail_offset; + j := fail_offset + ) else ( + (* beginning of pattern *) + j := 0; + incr i + ) ) done; if !j = pat_len - then !i - pat_len + then !i else -1 let find ~pattern s i = find_ ~dir:Direct ~pattern s i @@ -242,29 +252,20 @@ module Split = struct | SplitStop | SplitAt of int (* previous *) - (* [by_j... prefix of s_i...] ? *) - let rec _is_prefix ~by s i j = - j = String.length by - || - ( i < String.length s && - s.[i] = by.[j] && - _is_prefix ~by s (i+1) (j+1) - ) - let rec _split ~by s state = match state with | SplitStop -> None - | SplitAt prev -> _split_search ~by s prev prev - and _split_search ~by s prev i = - if i >= String.length s + | SplitAt prev -> _split_search ~by s prev + and _split_search ~by s prev = + let j = KMP.find ~pattern:by s prev in + if j < 0 then Some (SplitStop, prev, String.length s - prev) - else if _is_prefix ~by s i 0 - then Some (SplitAt (i+String.length by), prev, i-prev) - else _split_search ~by s prev (i+1) + else Some (SplitAt (j+KMP.pattern_length by), prev, j-prev) let _tuple3 x y z = x,y,z let _mkgen ~by s k = let state = ref (SplitAt 0) in + let by = KMP.compile by in fun () -> match _split ~by s !state with | None -> None @@ -277,6 +278,7 @@ module Split = struct let gen_cpy ~by s = _mkgen ~by s String.sub let _mklist ~by s k = + let by = KMP.compile by in let rec build acc state = match _split ~by s state with | None -> List.rev acc | Some (state', i, len) -> @@ -289,6 +291,7 @@ module Split = struct let list_cpy ~by s = _mklist ~by s String.sub let _mkklist ~by s k = + let by = KMP.compile by in let rec make state () = match _split ~by s state with | None -> `Nil | Some (state', i, len) -> @@ -300,6 +303,7 @@ module Split = struct let klist_cpy ~by s = _mkklist ~by s String.sub let _mkseq ~by s f k = + let by = KMP.compile by in let rec aux state = match _split ~by s state with | None -> () | Some (state', i, len) -> k (f s i len); aux state' diff --git a/src/core/CCString.mli b/src/core/CCString.mli index cfdfe959..930eadc3 100644 --- a/src/core/CCString.mli +++ b/src/core/CCString.mli @@ -122,6 +122,8 @@ val find_all_l : ?start:int -> sub:string -> string -> int list (*$= & ~printer:Q.Print.(list int) [1; 6] (find_all_l ~sub:"bc" "abc aabc aab") [] (find_all_l ~sub:"bc" "abd") + [76] (find_all_l ~sub:"aaaaaa" \ + "aabbaabbaaaaabbbbabababababbbbabbbabbaaababbbaaabaabbaabbaaaabbababaaaabbaabaaaaaabbbaaaabababaabaaabbaabaaaabbababbaabbaaabaabbabababbbaabababaaabaaababbbaaaabbbaabaaababbabaababbaabbaaaaabababbabaababbbaaabbabbabababaaaabaaababaaaaabbabbaabbabbbbbbbbbbbbbbaabbabbbbbabbaaabbabbbbabaaaaabbababbbaaaa") *) val mem : ?start:int -> sub:string -> string -> bool