diff --git a/benchs/run_benchs.ml b/benchs/run_benchs.ml index 593c8da7..bb7fd23e 100644 --- a/benchs/run_benchs.ml +++ b/benchs/run_benchs.ml @@ -1158,7 +1158,7 @@ module Str = struct (* random string, but always returns the same for a given size *) let rand_str_ ?(among="abcdefgh") n = let module Q = Quickcheck in - let st = Random.State.make [| n |] in + let st = Random.State.make [| n + 17 |] in let gen_c = Q.Gen.oneofl (CCString.to_list among) in Q.Gen.string_size ~gen:gen_c (Q.Gen.return n) st @@ -1213,11 +1213,15 @@ module Str = struct and mk_current = match dir with | `Direct -> fun () -> CCString.find ~sub:needle haystack | `Reverse -> fun () -> CCString.rfind ~sub:needle haystack + and mk_current_compiled = match dir with + | `Direct -> let f = CCString.find ~start:0 ~sub:needle in fun () -> f haystack + | `Reverse -> let f = CCString.rfind ~sub:needle in fun () -> f haystack in assert (mk_naive () = mk_current ()); B.throughputN 3 ~repeat [ "naive", mk_naive, () ; "current", mk_current, () + ; "current_compiled", mk_current_compiled, () ] (* benchmark String.find_all *) @@ -1226,11 +1230,14 @@ module Str = struct let haystack = rand_str_ n in pp_pb needle haystack; let mk_naive () = find_all_l ~sub:needle haystack - and mk_current () = CCString.find_all_l ~sub:needle haystack in + and mk_current () = CCString.find_all_l ~sub:needle haystack + and mk_current_compiled = + let f = CCString.find_all_l ~start:0 ~sub:needle in fun () -> f haystack in assert (mk_naive () = mk_current ()); B.throughputN 3 ~repeat [ "naive", mk_naive, () ; "current", mk_current, () + ; "current_compiled", mk_current_compiled, () ] (* benchmark String.find_all on constant strings *) @@ -1252,8 +1259,7 @@ module Str = struct let () = B.Tree.register ( "string" @>>> [ "find" @>>> - [ "1" @>> app_ints (bench_find ~size:1) [100; 100_000; 500_000] - ; "3" @>> app_ints (bench_find ~size:3) [100; 100_000; 500_000] + [ "3" @>> app_ints (bench_find ~size:3) [100; 100_000; 500_000] ; "5" @>> app_ints (bench_find ~size:5) [100; 100_000; 500_000] ; "15" @>> app_ints (bench_find ~size:15) [100; 100_000; 500_000] ; "50" @>> app_ints (bench_find ~size:50) [100; 100_000; 500_000] diff --git a/src/core/CCString.cppo.ml b/src/core/CCString.cppo.ml index ee2ab50b..5c14f724 100644 --- a/src/core/CCString.cppo.ml +++ b/src/core/CCString.cppo.ml @@ -75,15 +75,15 @@ type _ direction = | Reverse : [`Reverse] direction (* we follow https://en.wikipedia.org/wiki/Knuth–Morris–Pratt_algorithm *) -module KMP = struct - type 'a pattern = { +module Find = struct + type 'a kmp_pattern = { failure : int array; str : string; } (* invariant: [length failure = length str]. We use a phantom type to avoid mixing the directions. *) - let pattern_length p = String.length p.str + let kmp_pattern_length p = String.length p.str (* access the [i]-th element of [s] according to direction [dir] *) let get_ @@ -92,8 +92,8 @@ module KMP = struct | Direct -> String.get | Reverse -> (fun s i -> s.[String.length s - i - 1]) - let compile_ - : type a. dir:a direction -> string -> a pattern + let kmp_compile_ + : type a. dir:a direction -> string -> a kmp_pattern = fun ~dir str -> let len = length str in let get = get_ ~dir in (* how to read elements of the string *) @@ -127,21 +127,21 @@ module KMP = struct (* Format.printf "{@[failure:%a, str:%s@]}@." CCFormat.(array int) failure str; *) { failure; str; } - let compile s = compile_ ~dir:Direct s - let rcompile s = compile_ ~dir:Reverse s + let kmp_compile s = kmp_compile_ ~dir:Direct s + let kmp_rcompile s = kmp_compile_ ~dir:Reverse s (* proper search function. [i] index in [s] [j] index in [pattern] [len] length of [s] *) - let find_ - : type a. dir:a direction -> pattern:a pattern -> string -> int -> int + let kmp_find_ + : type a. dir:a direction -> pattern:a kmp_pattern -> string -> int -> int = fun ~dir ~pattern s idx -> let len = length s in let get = get_ ~dir in let i = ref idx in let j = ref 0 in - let pat_len = pattern_length pattern in + let pat_len = kmp_pattern_length pattern in while !j < pat_len && !i + !j < len do let c = get s (!i + !j) in let expected = get pattern.str !j in @@ -168,34 +168,63 @@ module KMP = struct then !i else -1 - let find ~pattern s i = find_ ~dir:Direct ~pattern s i + let kmp_find ~pattern s i = kmp_find_ ~dir:Direct ~pattern s i - let rfind ~pattern s i = + let kmp_rfind ~pattern s i = let i = String.length s - i - 1 in - let res = find_ ~dir:Reverse ~pattern s i in + let res = kmp_find_ ~dir:Reverse ~pattern s i in (* adjust result: first, [res = string.length s - res -1] to convert back to real indices; then, what we got is actually the position of the end of the pattern, so we subtract the [length of the pattern -1] to obtain the real result. *) if res = ~-1 then res - else (String.length s - res) - pattern_length pattern + else (String.length s - res) - kmp_pattern_length pattern + + type 'a pattern = + | P_char of char + | P_KMP of 'a kmp_pattern + + let pattern_length = function + | P_char _ -> 1 + | P_KMP p -> kmp_pattern_length p + + let compile ~sub : [`Direct] pattern = + if length sub=1 + then P_char sub.[0] + else P_KMP (kmp_compile sub) + + let rcompile ~sub : [`Reverse] pattern = + if length sub=1 + then P_char sub.[0] + else P_KMP (kmp_rcompile sub) + + let find ~pattern s start = match pattern with + | P_char c -> + (try String.index_from s start c with Not_found -> -1) + | P_KMP pattern -> kmp_find ~pattern s start + + let rfind ~pattern s start = match pattern with + | P_char c -> + (try String.rindex_from s start c with Not_found -> -1) + | P_KMP pattern -> kmp_rfind ~pattern s start end -let find ?(start=0) ~sub s = - let pattern = KMP.compile sub in - KMP.find ~pattern s start +let find ?(start=0) ~sub = + let pattern = Find.compile ~sub in + fun s -> Find.find ~pattern s start -let find_all ?(start=0) ~sub s = - let pattern = KMP.compile sub in - let i = ref start in - fun () -> - let res = KMP.find ~pattern s !i in - if res = ~-1 then None - else ( - i := res + KMP.pattern_length pattern; - Some res - ) +let find_all ?(start=0) ~sub = + let pattern = Find.compile ~sub in + fun s -> + let i = ref start in + fun () -> + let res = Find.find ~pattern s !i in + if res = ~-1 then None + else ( + i := res + Find.pattern_length pattern; + Some res + ) let find_all_l ?start ~sub s = let rec aux acc g = match g () with @@ -206,9 +235,9 @@ let find_all_l ?start ~sub s = let mem ?start ~sub s = find ?start ~sub s >= 0 -let rfind ~sub s = - let pattern = KMP.rcompile sub in - KMP.rfind ~pattern s (String.length s-1) +let rfind ~sub = + let pattern = Find.rcompile ~sub in + fun s -> Find.rfind ~pattern s (String.length s-1) (* Replace substring [s.[pos]....s.[pos+len-1]] by [by] in [s] *) let replace_at_ ~pos ~len ~by s = @@ -229,11 +258,11 @@ let replace ?(which=`All) ~sub ~by s = if i>=0 then replace_at_ ~pos:i ~len:(String.length sub) ~by s else s | `All -> (* compile search pattern only once *) - let pattern = KMP.compile sub in + let pattern = Find.compile ~sub in let b = Buffer.create (String.length s) in let start = ref 0 in while !start < String.length s do - let i = KMP.find ~pattern s !start in + let i = Find.find ~pattern s !start in if i>=0 then ( (* between last and cur occurrences *) Buffer.add_substring b s !start (i- !start); @@ -256,16 +285,16 @@ module Split = struct | SplitStop -> None | SplitAt prev -> _split_search ~by s prev and _split_search ~by s prev = - let j = KMP.find ~pattern:by s prev in + let j = Find.find ~pattern:by s prev in if j < 0 then Some (SplitStop, prev, String.length s - prev) - else Some (SplitAt (j+KMP.pattern_length by), prev, j-prev) + else Some (SplitAt (j+Find.pattern_length by), prev, j-prev) let _tuple3 x y z = x,y,z let _mkgen ~by s k = let state = ref (SplitAt 0) in - let by = KMP.compile by in + let by = Find.compile ~sub:by in fun () -> match _split ~by s !state with | None -> None @@ -278,7 +307,7 @@ module Split = struct let gen_cpy ~by s = _mkgen ~by s String.sub let _mklist ~by s k = - let by = KMP.compile by in + let by = Find.compile ~sub:by in let rec build acc state = match _split ~by s state with | None -> List.rev acc | Some (state', i, len) -> @@ -291,7 +320,7 @@ module Split = struct let list_cpy ~by s = _mklist ~by s String.sub let _mkklist ~by s k = - let by = KMP.compile by in + let by = Find.compile ~sub:by in let rec make state () = match _split ~by s state with | None -> `Nil | Some (state', i, len) -> @@ -303,7 +332,7 @@ module Split = struct let klist_cpy ~by s = _mkklist ~by s String.sub let _mkseq ~by s f k = - let by = KMP.compile by in + let by = Find.compile ~sub:by in let rec aux state = match _split ~by s state with | None -> () | Some (state', i, len) -> k (f s i len); aux state'