expose CCString.Find for efficient sub-string searching

This commit is contained in:
Simon Cruanes 2016-11-03 18:48:25 +01:00
parent 83b0744a1b
commit 46cee7096c
2 changed files with 46 additions and 19 deletions

View file

@ -212,37 +212,42 @@ module Find = struct
| P_char _ -> 1 | P_char _ -> 1
| P_KMP p -> kmp_pattern_length p | P_KMP p -> kmp_pattern_length p
let compile ~sub : [`Direct] pattern = let compile sub : [`Direct] pattern =
if length sub=1 if length sub=1
then P_char sub.[0] then P_char sub.[0]
else P_KMP (kmp_compile sub) else P_KMP (kmp_compile sub)
let rcompile ~sub : [`Reverse] pattern = let rcompile sub : [`Reverse] pattern =
if length sub=1 if length sub=1
then P_char sub.[0] then P_char sub.[0]
else P_KMP (kmp_rcompile sub) else P_KMP (kmp_rcompile sub)
let find ~(pattern:[`Direct] pattern) s start = match pattern with let find ?(start=0) ~(pattern:[`Direct] pattern) s = match pattern with
| P_char c -> | P_char c ->
(try String.index_from s start c with Not_found -> -1) (try String.index_from s start c with Not_found -> -1)
| P_KMP pattern -> kmp_find ~pattern s start | P_KMP pattern -> kmp_find ~pattern s start
let rfind ~(pattern:[`Reverse] pattern) s start = match pattern with let rfind ?start ~(pattern:[`Reverse] pattern) s =
| P_char c -> let start = match start with
| Some n -> n
| None -> String.length s - 1
in
match pattern with
| P_char c ->
(try String.rindex_from s start c with Not_found -> -1) (try String.rindex_from s start c with Not_found -> -1)
| P_KMP pattern -> kmp_rfind ~pattern s start | P_KMP pattern -> kmp_rfind ~pattern s start
end end
let find ?(start=0) ~sub = let find ?(start=0) ~sub =
let pattern = Find.compile ~sub in let pattern = Find.compile sub in
fun s -> Find.find ~pattern s start fun s -> Find.find ~pattern s ~start
let find_all ?(start=0) ~sub = let find_all ?(start=0) ~sub =
let pattern = Find.compile ~sub in let pattern = Find.compile sub in
fun s -> fun s ->
let i = ref start in let i = ref start in
fun () -> fun () ->
let res = Find.find ~pattern s !i in let res = Find.find ~pattern s ~start:!i in
if res = ~-1 then None if res = ~-1 then None
else ( else (
i := res + 1; (* possible overlap *) i := res + 1; (* possible overlap *)
@ -259,8 +264,8 @@ let find_all_l ?start ~sub s =
let mem ?start ~sub s = find ?start ~sub s >= 0 let mem ?start ~sub s = find ?start ~sub s >= 0
let rfind ~sub = let rfind ~sub =
let pattern = Find.rcompile ~sub in let pattern = Find.rcompile sub in
fun s -> Find.rfind ~pattern s (String.length s-1) fun s -> Find.rfind ~pattern s ~start:(String.length s-1)
(* Replace substring [s.[pos]....s.[pos+len-1]] by [by] in [s] *) (* Replace substring [s.[pos]....s.[pos+len-1]] by [by] in [s] *)
let replace_at_ ~pos ~len ~by s = let replace_at_ ~pos ~len ~by s =
@ -281,11 +286,11 @@ let replace ?(which=`All) ~sub ~by s =
if i>=0 then replace_at_ ~pos:i ~len:(String.length sub) ~by s else s if i>=0 then replace_at_ ~pos:i ~len:(String.length sub) ~by s else s
| `All -> | `All ->
(* compile search pattern only once *) (* compile search pattern only once *)
let pattern = Find.compile ~sub in let pattern = Find.compile sub in
let b = Buffer.create (String.length s) in let b = Buffer.create (String.length s) in
let start = ref 0 in let start = ref 0 in
while !start < String.length s do while !start < String.length s do
let i = Find.find ~pattern s !start in let i = Find.find ~pattern s ~start:!start in
if i>=0 then ( if i>=0 then (
(* between last and cur occurrences *) (* between last and cur occurrences *)
Buffer.add_substring b s !start (i- !start); Buffer.add_substring b s !start (i- !start);
@ -308,7 +313,7 @@ module Split = struct
| SplitStop -> None | SplitStop -> None
| SplitAt prev -> _split_search ~by s prev | SplitAt prev -> _split_search ~by s prev
and _split_search ~by s prev = and _split_search ~by s prev =
let j = Find.find ~pattern:by s prev in let j = Find.find ~pattern:by s ~start:prev in
if j < 0 if j < 0
then Some (SplitStop, prev, String.length s - prev) then Some (SplitStop, prev, String.length s - prev)
else Some (SplitAt (j+Find.pattern_length by), prev, j-prev) else Some (SplitAt (j+Find.pattern_length by), prev, j-prev)
@ -317,7 +322,7 @@ module Split = struct
let _mkgen ~by s k = let _mkgen ~by s k =
let state = ref (SplitAt 0) in let state = ref (SplitAt 0) in
let by = Find.compile ~sub:by in let by = Find.compile by in
fun () -> fun () ->
match _split ~by s !state with match _split ~by s !state with
| None -> None | None -> None
@ -330,7 +335,7 @@ module Split = struct
let gen_cpy ~by s = _mkgen ~by s String.sub let gen_cpy ~by s = _mkgen ~by s String.sub
let _mklist ~by s k = let _mklist ~by s k =
let by = Find.compile ~sub:by in let by = Find.compile by in
let rec build acc state = match _split ~by s state with let rec build acc state = match _split ~by s state with
| None -> List.rev acc | None -> List.rev acc
| Some (state', i, len) -> | Some (state', i, len) ->
@ -343,7 +348,7 @@ module Split = struct
let list_cpy ~by s = _mklist ~by s String.sub let list_cpy ~by s = _mklist ~by s String.sub
let _mkklist ~by s k = let _mkklist ~by s k =
let by = Find.compile ~sub:by in let by = Find.compile by in
let rec make state () = match _split ~by s state with let rec make state () = match _split ~by s state with
| None -> `Nil | None -> `Nil
| Some (state', i, len) -> | Some (state', i, len) ->
@ -355,7 +360,7 @@ module Split = struct
let klist_cpy ~by s = _mkklist ~by s String.sub let klist_cpy ~by s = _mkklist ~by s String.sub
let _mkseq ~by s f k = let _mkseq ~by s f k =
let by = Find.compile ~sub:by in let by = Find.compile by in
let rec aux state = match _split ~by s state with let rec aux state = match _split ~by s state with
| None -> () | None -> ()
| Some (state', i, len) -> k (f s i len); aux state' | Some (state', i, len) -> k (f s i len); aux state'

View file

@ -399,6 +399,28 @@ val uppercase_ascii : string -> string
val lowercase_ascii : string -> string val lowercase_ascii : string -> string
(** See {!String}. @since 0.18 *) (** See {!String}. @since 0.18 *)
(** {2 Finding}
A relatively efficient algorithm for finding sub-strings
@since 1.0 *)
module Find : sig
type _ pattern
(** A compiled search pattern. The type parameter records the direction
the pattern was compiled for: [`Direct] patterns are accepted by
{!find}, [`Reverse] patterns by {!rfind}. *)
val compile : string -> [ `Direct ] pattern
(** [compile sub] prepares the sub-string [sub] for left-to-right
search with {!find}. The resulting pattern can be reused for
several searches. *)
val rcompile : string -> [ `Reverse ] pattern
(** [rcompile sub] prepares the sub-string [sub] for right-to-left
search with {!rfind}. The resulting pattern can be reused for
several searches. *)
val find : ?start:int -> pattern:[`Direct] pattern -> string -> int
(** Search for [pattern] in the string, left-to-right
@return the offset of the first match, -1 otherwise
@param start offset in string at which we start (default [0]) *)
val rfind : ?start:int -> pattern:[`Reverse] pattern -> string -> int
(** Search for [pattern] in the string, right-to-left
@return the offset of the start of the first match from the right, -1 otherwise
@param start right-offset in string at which we start
(default: index of the last character, i.e. [String.length s - 1]) *)
end
(** {2 Splitting} *) (** {2 Splitting} *)