mirror of
https://github.com/c-cube/ocaml-containers.git
synced 2025-12-06 11:15:31 -05:00
feat(ccparse): expose pos to get current pos; improve perf
perf of obtaining many positions is now better because we cache line offsets, which means computing a line,col pair is just a O(ln n) bisect away.
This commit is contained in:
parent
c33477c397
commit
9123f7907f
2 changed files with 97 additions and 46 deletions
|
|
@ -106,10 +106,29 @@ open CCShims_
|
||||||
()
|
()
|
||||||
*)
|
*)
|
||||||
|
|
||||||
(* TODO: [type position = {state: state; i: int}] and recompute line, col
|
module Memo_tbl = Hashtbl.Make(struct
|
||||||
on demand *)
|
type t = int * int (* id of parser, position *)
|
||||||
|
let equal ((a,b):t)(c,d) = a=c && b=d
|
||||||
|
let hash = Hashtbl.hash
|
||||||
|
end)
|
||||||
|
|
||||||
|
module Memo_state = struct
|
||||||
|
(* table of closures, used to implement universal type *)
|
||||||
|
type t = (unit -> unit) Memo_tbl.t
|
||||||
|
|
||||||
|
(* unique ID for each parser *)
|
||||||
|
let id_ = ref 0
|
||||||
|
end
|
||||||
|
|
||||||
|
(* state common to all parser instances *)
|
||||||
|
type common_state = {
|
||||||
|
str: string;
|
||||||
|
mutable line_offsets: int array option;
|
||||||
|
mutable memo: Memo_state.t option;
|
||||||
|
}
|
||||||
|
|
||||||
type position = {
|
type position = {
|
||||||
pos_buffer: string;
|
pos_cs: common_state;
|
||||||
pos_offset: int;
|
pos_offset: int;
|
||||||
mutable pos_lc: (int * int) option;
|
mutable pos_lc: (int * int) option;
|
||||||
}
|
}
|
||||||
|
|
@ -117,27 +136,50 @@ type position = {
|
||||||
module Position = struct
|
module Position = struct
|
||||||
type t = position
|
type t = position
|
||||||
|
|
||||||
(* actually re-compute line and column from the buffer *)
|
let compute_line_offsets_ (s:string): int array =
|
||||||
let compute_line_and_col_ (s:string) (off:int) : int * int =
|
let lines = CCVector.create() in
|
||||||
let i = ref 0 in
|
let i = ref 0 in
|
||||||
let continue = ref true in
|
CCVector.push lines 0;
|
||||||
let line = ref 1 in
|
while !i < String.length s do
|
||||||
let col = ref 1 in
|
|
||||||
while !continue && !i < off && !i < String.length s do
|
|
||||||
match String.index_from s !i '\n' with
|
match String.index_from s !i '\n' with
|
||||||
| exception Not_found ->
|
| exception Not_found -> i := String.length s
|
||||||
col := off - !i; continue := false;
|
| j ->
|
||||||
| j when j > off ->
|
CCVector.push lines j;
|
||||||
col := off - !i; continue := false;
|
i := j + 1
|
||||||
| j -> incr line; i := j+1;
|
|
||||||
done;
|
done;
|
||||||
!line, !col
|
CCVector.to_array lines
|
||||||
|
;;
|
||||||
|
|
||||||
|
let line_offsets_ cs = match cs.line_offsets with
|
||||||
|
| Some lines -> lines
|
||||||
|
| None ->
|
||||||
|
let lines = compute_line_offsets_ cs.str in
|
||||||
|
cs.line_offsets <- Some lines;
|
||||||
|
lines
|
||||||
|
|
||||||
|
let int_cmp_ : int -> int -> int = compare
|
||||||
|
|
||||||
|
(* TODO: use pos_cs.line_offsets *)
|
||||||
|
(* actually re-compute line and column from the buffer *)
|
||||||
|
let compute_line_and_col_ (cs:common_state) (off:int) : int * int =
|
||||||
|
let offsets = line_offsets_ cs in
|
||||||
|
assert (offsets.(0) = 0);
|
||||||
|
begin match CCArray.bsearch ~cmp:int_cmp_ off offsets with
|
||||||
|
| `At 0 -> 0, 0
|
||||||
|
| `At n -> (n-1), off - offsets.(n-1) - 1
|
||||||
|
| `Just_after n -> n, off - offsets.(n)
|
||||||
|
| `Empty -> assert false
|
||||||
|
| `All_bigger -> assert false (* off >= 0, and offsets[0] == 0 *)
|
||||||
|
| `All_lower ->
|
||||||
|
let n = Array.length offsets - 1 in
|
||||||
|
n, off - offsets.(n)
|
||||||
|
end
|
||||||
|
|
||||||
let line_and_column self =
|
let line_and_column self =
|
||||||
match self.pos_lc with
|
match self.pos_lc with
|
||||||
| Some tup -> tup
|
| Some tup -> tup
|
||||||
| None ->
|
| None ->
|
||||||
let tup = compute_line_and_col_ self.pos_buffer self.pos_offset in
|
let tup = compute_line_and_col_ self.pos_cs self.pos_offset in
|
||||||
self.pos_lc <- Some tup; (* save *)
|
self.pos_lc <- Some tup; (* save *)
|
||||||
tup
|
tup
|
||||||
|
|
||||||
|
|
@ -169,26 +211,11 @@ end
|
||||||
|
|
||||||
type +'a or_error = ('a, Error.t) result
|
type +'a or_error = ('a, Error.t) result
|
||||||
|
|
||||||
module Memo_tbl = Hashtbl.Make(struct
|
|
||||||
type t = int * int (* id of parser, position *)
|
|
||||||
let equal ((a,b):t)(c,d) = a=c && b=d
|
|
||||||
let hash = Hashtbl.hash
|
|
||||||
end)
|
|
||||||
|
|
||||||
module Memo_state = struct
|
|
||||||
(* table of closures, used to implement universal type *)
|
|
||||||
type t = (unit -> unit) Memo_tbl.t
|
|
||||||
|
|
||||||
(* unique ID for each parser *)
|
|
||||||
let id_ = ref 0
|
|
||||||
end
|
|
||||||
|
|
||||||
(** Purely functional state passed around *)
|
(** Purely functional state passed around *)
|
||||||
type state = {
|
type state = {
|
||||||
str: string; (* the input *)
|
cs: common_state;
|
||||||
i: int; (* offset in [str] *)
|
i: int; (* offset in [str] *)
|
||||||
j: int; (* end pointer in [str], excluded. [len = j-i] *)
|
j: int; (* end pointer in [str], excluded. [len = j-i] *)
|
||||||
memo : Memo_state.t option ref; (* Memoization table, if any *)
|
|
||||||
}
|
}
|
||||||
(* FIXME: replace memo with:
|
(* FIXME: replace memo with:
|
||||||
[global : global_st ref]
|
[global : global_st ref]
|
||||||
|
|
@ -219,17 +246,16 @@ let[@inline] const_str_ x () : string = x
|
||||||
|
|
||||||
let state_of_string str =
|
let state_of_string str =
|
||||||
let s = {
|
let s = {
|
||||||
str;
|
cs={str; memo=None; line_offsets=None};
|
||||||
i=0;
|
i=0;
|
||||||
j=String.length str;
|
j=String.length str;
|
||||||
memo=ref None;
|
|
||||||
} in
|
} in
|
||||||
s
|
s
|
||||||
|
|
||||||
let[@inline] is_done st = st.i >= st.j
|
let[@inline] is_done st = st.i >= st.j
|
||||||
let[@inline] cur st = st.str.[st.i]
|
let[@inline] cur st = st.cs.str.[st.i]
|
||||||
|
|
||||||
let pos_of_st_ st : position = {pos_buffer=st.str; pos_offset=st.i; pos_lc=None}
|
let pos_of_st_ st : position = {pos_cs=st.cs; pos_offset=st.i; pos_lc=None}
|
||||||
let mk_error_ st msg : Error.t = {Error.msg; pos=pos_of_st_ st}
|
let mk_error_ st msg : Error.t = {Error.msg; pos=pos_of_st_ st}
|
||||||
|
|
||||||
(* consume one char, passing it to [ok]. *)
|
(* consume one char, passing it to [ok]. *)
|
||||||
|
|
@ -238,7 +264,7 @@ let consume_ st ~ok ~err =
|
||||||
let msg = const_str_ "unexpected end of input" in
|
let msg = const_str_ "unexpected end of input" in
|
||||||
err (mk_error_ st msg)
|
err (mk_error_ st msg)
|
||||||
) else (
|
) else (
|
||||||
let c = st.str.[st.i] in
|
let c = st.cs.str.[st.i] in
|
||||||
ok {st with i=st.i + 1} c
|
ok {st with i=st.i + 1} c
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -367,6 +393,25 @@ let with_pos p : _ t = {
|
||||||
~err
|
~err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let pos : _ t = {
|
||||||
|
run=fun st ~ok ~err:_ -> ok st (pos_of_st_ st)
|
||||||
|
}
|
||||||
|
|
||||||
|
(*$= & ~printer:Q.Print.(pair int int)
|
||||||
|
(0,5) (let p = any_char_n 5 *> pos in \
|
||||||
|
match parse_string p "abcde " with \
|
||||||
|
| Ok p -> Position.line_and_column p \
|
||||||
|
| Error _ -> assert false)
|
||||||
|
*)
|
||||||
|
|
||||||
|
(*$= & ~printer:Q.Print.(list @@ pair int int)
|
||||||
|
[(0,2); (1,3); (2,1); (3,0); (4,0); (5,2)] \
|
||||||
|
(let p = each_line (skip_space *> pos) in \
|
||||||
|
match parse_string p " a\n b\nc\n\n\n a" with \
|
||||||
|
| Ok ps -> List.map Position.line_and_column ps \
|
||||||
|
| Error _ -> assert false)
|
||||||
|
*)
|
||||||
|
|
||||||
(* a slice is just a state, which makes {!recurse} quite easy. *)
|
(* a slice is just a state, which makes {!recurse} quite easy. *)
|
||||||
type slice = state
|
type slice = state
|
||||||
|
|
||||||
|
|
@ -374,14 +419,14 @@ module Slice = struct
|
||||||
type t = slice
|
type t = slice
|
||||||
let length sl = sl.j - sl.i
|
let length sl = sl.j - sl.i
|
||||||
let is_empty sl = sl.i = sl.j
|
let is_empty sl = sl.i = sl.j
|
||||||
let to_string sl = String.sub sl.str sl.i (length sl)
|
let to_string sl = String.sub sl.cs.str sl.i (length sl)
|
||||||
end
|
end
|
||||||
|
|
||||||
let recurse slice p : _ t = {
|
let recurse slice p : _ t = {
|
||||||
run=fun _st ~ok ~err ->
|
run=fun _st ~ok ~err ->
|
||||||
(* make sure these states are related. all slices share the
|
(* make sure these states are related. all slices share the
|
||||||
same reference as the initial state they derive from. *)
|
same reference as the initial state they derive from. *)
|
||||||
assert CCShims_.Stdlib.(_st.memo == slice.memo);
|
assert CCShims_.Stdlib.(_st.cs == slice.cs);
|
||||||
p.run slice ~ok ~err
|
p.run slice ~ok ~err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -807,7 +852,7 @@ let sep1 ~by p =
|
||||||
let lookahead p : _ t = {
|
let lookahead p : _ t = {
|
||||||
run=fun st ~ok ~err ->
|
run=fun st ~ok ~err ->
|
||||||
p.run st
|
p.run st
|
||||||
~ok:(fun _st x -> ok st x) (* discard old state *)
|
~ok:(fun _st x -> ok st x) (* discard p's new state *)
|
||||||
~err
|
~err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -820,7 +865,7 @@ let lookahead_ignore p : _ t = {
|
||||||
|
|
||||||
let set_current_slice sl : _ t = {
|
let set_current_slice sl : _ t = {
|
||||||
run=fun _st ~ok ~err:_ ->
|
run=fun _st ~ok ~err:_ ->
|
||||||
assert CCShims_.Stdlib.(_st.memo == sl.memo);
|
assert CCShims_.Stdlib.(_st.cs == sl.cs);
|
||||||
ok sl () (* jump to slice *)
|
ok sl () (* jump to slice *)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -841,7 +886,7 @@ let split_1 ~on_char : _ t = {
|
||||||
if st.i >= st.j then (
|
if st.i >= st.j then (
|
||||||
ok st (st, None)
|
ok st (st, None)
|
||||||
) else (
|
) else (
|
||||||
match String.index_from st.str st.i on_char with
|
match String.index_from st.cs.str st.i on_char with
|
||||||
| j ->
|
| j ->
|
||||||
let x = {st with j} in
|
let x = {st with j} in
|
||||||
let y = {st with i=min st.j (j+1)} in
|
let y = {st with i=min st.j (j+1)} in
|
||||||
|
|
@ -950,11 +995,11 @@ let memo (type a) (p:a t) : a t =
|
||||||
let r = ref None in (* used for universal encoding *)
|
let r = ref None in (* used for universal encoding *)
|
||||||
|
|
||||||
{run=fun st ~ok ~err ->
|
{run=fun st ~ok ~err ->
|
||||||
let tbl = match !(st.memo) with
|
let tbl = match st.cs.memo with
|
||||||
| Some t -> t
|
| Some t -> t
|
||||||
| None ->
|
| None ->
|
||||||
let tbl = Memo_tbl.create 32 in
|
let tbl = Memo_tbl.create 32 in
|
||||||
st.memo := Some tbl;
|
st.cs.memo <- Some tbl;
|
||||||
tbl
|
tbl
|
||||||
in
|
in
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -72,10 +72,10 @@ module Position : sig
|
||||||
type t = position
|
type t = position
|
||||||
|
|
||||||
val line : t -> int
|
val line : t -> int
|
||||||
(** Line number *)
|
(** Line number, 0 based *)
|
||||||
|
|
||||||
val column : t -> int
|
val column : t -> int
|
||||||
(** Column number *)
|
(** Column number, 0 based *)
|
||||||
|
|
||||||
val line_and_column : t -> int * int
|
val line_and_column : t -> int * int
|
||||||
(** Line and column number *)
|
(** Line and column number *)
|
||||||
|
|
@ -178,6 +178,12 @@ val set_error_message : string -> 'a t -> 'a t
|
||||||
position. The internal error message of [p] is just discarded.
|
position. The internal error message of [p] is just discarded.
|
||||||
@since 3.6 *)
|
@since 3.6 *)
|
||||||
|
|
||||||
|
val pos : position t
|
||||||
|
(** [pos] returns the current position in the buffer.
|
||||||
|
|
||||||
|
{b EXPERIMENTAL}
|
||||||
|
@since NEXT_RELEASE *)
|
||||||
|
|
||||||
val with_pos : 'a t -> ('a * position) t
|
val with_pos : 'a t -> ('a * position) t
|
||||||
(** [with_pos p] behaves like [p], but returns the (starting) position
|
(** [with_pos p] behaves like [p], but returns the (starting) position
|
||||||
along with [p]'s result.
|
along with [p]'s result.
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue