mirror of
https://github.com/c-cube/linol.git
synced 2025-12-10 13:14:15 -05:00
401 lines
15 KiB
OCaml
401 lines
15 KiB
OCaml
(*---------------------------------------------------------------------------
|
|
Copyright (c) 2021 The cmarkit programmers. All rights reserved.
|
|
Distributed under the ISC license, see terms at the end of the file.
|
|
---------------------------------------------------------------------------*)
|
|
|
|
(** Low-level internal tools. *)
|
|
|
|
(** Heterogeneous dictionaries.
|
|
|
|
Used by {!Cmarkit.Meta}. *)
|
|
module Dict : sig
|
|
type 'a key
|
|
val key : unit -> 'a key
|
|
type t
|
|
val empty : t
|
|
val mem : 'a key -> t -> bool
|
|
val add : 'a key -> 'a -> t -> t
|
|
val tag : unit key -> t -> t
|
|
val remove : 'a key -> t -> t
|
|
val find : 'a key -> t -> 'a option
|
|
end
|
|
|
|
(** Text locations.
|
|
|
|
See {!Cmarkit.Textloc} for documentation. *)
|
|
module Textloc : sig
|
|
type fpath = string
|
|
val file_none : fpath
|
|
|
|
type byte_pos = int
|
|
val byte_pos_none : byte_pos
|
|
|
|
type line_num = int
|
|
val line_num_none : line_num
|
|
|
|
type line_pos = line_num * byte_pos
|
|
val line_pos_first : line_pos
|
|
val line_pos_none : line_pos
|
|
|
|
type t
|
|
val none : t
|
|
val v :
|
|
file:fpath -> first_byte:byte_pos -> last_byte:byte_pos ->
|
|
first_line:line_pos -> last_line:line_pos -> t
|
|
|
|
val file : t -> fpath
|
|
val first_byte : t -> byte_pos
|
|
val last_byte : t -> byte_pos
|
|
val first_line : t -> line_pos
|
|
val last_line : t -> line_pos
|
|
val is_none : t -> bool
|
|
val is_empty : t -> bool
|
|
val equal : t -> t -> bool
|
|
val compare : t -> t -> int
|
|
val set_first : t -> first_byte:byte_pos -> first_line:line_pos -> t
|
|
val set_last : t -> last_byte:byte_pos -> last_line:line_pos -> t
|
|
val to_first : t -> t
|
|
val to_last : t -> t
|
|
val before : t -> t
|
|
val after : t -> t
|
|
val span : t -> t -> t
|
|
val reloc : first:t -> last:t -> t
|
|
val pp_ocaml : Format.formatter -> t -> unit
|
|
val pp_gnu : Format.formatter -> t -> unit
|
|
val pp : Format.formatter -> t -> unit
|
|
val pp_dump : Format.formatter -> t -> unit
|
|
end
|
|
|
|
(** Node metadata.
|
|
|
|
See {!Cmarkit.Meta} for documentation. *)
|
|
module Meta : sig
|
|
type id = int
|
|
type t
|
|
|
|
val none : t
|
|
val make : ?textloc:Textloc.t -> unit -> t
|
|
val id : t -> id
|
|
|
|
val textloc : t -> Textloc.t
|
|
val with_textloc : keep_id:bool -> t -> Textloc.t -> t
|
|
|
|
val equal : t -> t -> bool
|
|
val compare : t -> t -> int
|
|
val is_none : t -> bool
|
|
|
|
type 'a key
|
|
val key : unit -> 'a key
|
|
val mem : 'a key -> t -> bool
|
|
val add : 'a key -> 'a -> t -> t
|
|
val tag : unit key -> t -> t
|
|
val remove : 'a key -> t -> t
|
|
val find : 'a key -> t -> 'a option
|
|
end
|
|
|
|
type line_span =
|
|
{ line_pos : Textloc.line_pos;
|
|
first : Textloc.byte_pos;
|
|
last : Textloc.byte_pos }
|
|
(** The type for line spans. A line position, the first and last
|
|
bytes of the span. If the former is greater than the latter,
|
|
the span is empty. *)
|
|
|
|
type line_start = Textloc.byte_pos
|
|
(** The type for denoting a line start inside
|
|
a CommonMark container (i.e. may not match the text line's first
|
|
character). *)
|
|
|
|
type rev_spans = (line_start * line_span) list
|
|
(** A reversed list of line spans, tupled with the byte position on
|
|
where the line starts (inside a CommonMark container). The
|
|
[line_start] is the start of line in the container, the
|
|
[line_span] has the actual data. The characters in the
|
|
\[[line_start];[line_span.first - 1]\] are blanks. *)
|
|
|
|
type 'a next_line = 'a -> ('a * line_span) option
|
|
(** The type for getting a new line of input. This is used by certain
|
|
multi-line matchers (e.g. raw HTML). *)
|
|
|
|
(** {1:ascii US-ASCII matching} *)
|
|
|
|
(** US-ASCII matching. *)
|
|
module Ascii : sig
|
|
val is_control : char -> bool
|
|
val is_letter : char -> bool
|
|
val is_upper : char -> bool
|
|
val is_lower : char -> bool
|
|
val is_digit : char -> bool
|
|
val is_hex_digit : char -> bool
|
|
val hex_digit_to_int : char -> int
|
|
val is_alphanum : char -> bool
|
|
val is_white : char -> bool
|
|
val is_punct : char -> bool
|
|
val is_blank : char -> bool
|
|
val caseless_starts_with : prefix:string -> string -> bool
|
|
end
|
|
|
|
(** {1:uchar Unicode matching} *)
|
|
|
|
val prev_uchar : string -> first:int -> before:int -> Uchar.t
|
|
(** [prev_uchar s ~first ~before] is the first Unicode character before
|
|
byte position [before] in the range \[[first];[before-1]\]. If
|
|
[before <= first], U+0020 is returned (a Unicode whitespace character). *)
|
|
|
|
val next_uchar : string -> last:int -> after:int -> Uchar.t
|
|
(** [next_uchar s ~last ~after] is the next Unicode character after
|
|
byte position [after] in the range \[[after+1];[last]\]. If [after
|
|
>= last], U+0020 is returned (a Unicode whitespace character). *)
|
|
|
|
(** {1:content Textual content} *)
|
|
|
|
(** Textual content.
|
|
|
|
Ensures UTF-8 validity, unescapes, resolves numeric and named character
|
|
references. *)
|
|
module Text : sig
|
|
val utf_8_clean_unesc_unref :
|
|
Buffer.t -> string -> first:int -> last:int -> string
|
|
(** [utf_8_clean_unesc_unref b s ~first ~last] unescapes CommonMark
|
|
escapes, resolves HTML entity and character references in the
|
|
given span and replaces U+0000 and UTF-8 decoding errors by
|
|
{!Uchar.rep}. [b] is used as scratch space. If [last > first]
|
|
or [first] and [last] are not valid indices of [s] is [""]. *)
|
|
|
|
val utf_8_clean_unref :
|
|
Buffer.t -> string -> first:int -> last:int -> string
|
|
(** [utf_8_clean_unref b s ~first ~last] is like
|
|
{!utf_8_clean_unesc_unref} but does not unsescape. *)
|
|
|
|
val utf_8_clean_raw :
|
|
?pad:int -> Buffer.t -> string -> first:int -> last:int -> string
|
|
(** [utf_8_clean_raw b s ~first ~last] replaces U+0000 and UTF-8
|
|
decoding errors by {!Uchar.rep}. [b] is used as scratch space.
|
|
[pad] (defaults to [0]) specifies a number of U+0020 spaces to prepend.
|
|
If [last > first] or [first] and [last] are not valid indices of
|
|
[s] is either [""] or the padded string. *)
|
|
end
|
|
|
|
(** {1:result Result types} *)
|
|
|
|
type indent = int
|
|
(** The type for indentation magnitude. *)
|
|
|
|
type byte_pos = Textloc.byte_pos
|
|
(** The type for positions. *)
|
|
|
|
type first = Textloc.byte_pos
|
|
(** The type for the first first byte position of a parsed construct. *)
|
|
|
|
type last = Textloc.byte_pos
|
|
(** The type for the last byte position of a parsed construct. *)
|
|
|
|
type next = Textloc.byte_pos
|
|
(** The type for a byte position after a parsed construct. The byte position
|
|
may be invalid (end of input range) or on the newline. *)
|
|
|
|
type heading_level = int
|
|
(** The type for heading levels. *)
|
|
|
|
(** {1:blanks Newlines, runs, blanks and indents} *)
|
|
|
|
val run_of : char:char -> string -> last:byte_pos -> start:byte_pos -> last
|
|
(** [run_of ~char s ~last ~start] is the last byte position of a
|
|
consecutive run of [char] in the range \[[start];[last]\] or
|
|
[start - 1] if [start] is not [char]. *)
|
|
|
|
val first_non_blank : string -> last:byte_pos -> start:byte_pos -> byte_pos
|
|
(** [first_non_blank s ~last ~start] is the first byte position in the
|
|
range \[[start];[last]\] that is not blank and [last + 1] if there
|
|
is none. *)
|
|
|
|
val first_non_blank_in_span : string -> line_span -> byte_pos
|
|
(** [first_non_blank_in_span s span] is
|
|
[first_non_blank s ~last:span.last ~start:span.first]. *)
|
|
|
|
val last_non_blank : string -> first:byte_pos -> start:byte_pos -> byte_pos
|
|
(** [last_non_blank s ~first ~start] is the last position in the
|
|
range \[[first];[start]\] that is non blank and [first - 1] if
|
|
there is none. *)
|
|
|
|
val rev_drop_spaces : string -> first:byte_pos -> start:byte_pos -> byte_pos
|
|
(** [rev_drop_spaces s ~first ~start] is the last position in the
|
|
range \[[first];[start]\] that is not U+0020 and [first - 1] if
|
|
there is none. *)
|
|
|
|
val first_non_blank_over_nl :
|
|
next_line: 'a next_line -> string -> 'a -> line:line_span -> start:int ->
|
|
[ `None
|
|
| `This_line of byte_pos
|
|
| `Next_line of 'a * line_span * byte_pos ]
|
|
(** [first_non_blank_over_nl ~next_line s ~line ~start] is the first
|
|
byte position starting with [start] that is not blank in [line] or
|
|
on the next line as determined by [next_line]. Returns [`None] if
|
|
there is no such position. *)
|
|
|
|
val first_non_escaped_char :
|
|
char -> string -> last:byte_pos -> start:byte_pos -> byte_pos
|
|
(** [first_non_escaped_char c s ~last ~start] is the first byte position
|
|
in the range \[[start];[last]\] that has [c] unescaped and [last + 1]
|
|
if there is none. *)
|
|
|
|
(** {1:autolinks Autolinks} *)
|
|
|
|
val autolink_email : string -> last:byte_pos -> start:byte_pos -> last option
|
|
(** [autolink_email s ~last ~start] matches an email autolink starting at
|
|
[start] in the range \[[start];[last]\] (assumed on the same line). *)
|
|
|
|
val autolink_uri : string -> last:byte_pos -> start:byte_pos -> last option
|
|
(** [autolink_uri s ~last ~start] matches an URI autolink starting at
|
|
[start] in the range \[[start];[last]\] (assumed on the same line). *)
|
|
|
|
(** {1:raw_html Raw HTML} *)
|
|
|
|
val raw_html :
|
|
next_line:'a next_line -> string -> 'a -> line:line_span -> start:byte_pos ->
|
|
('a * line_span * rev_spans * last) option
|
|
(** [raw_html ~next_line s lines ~line ~start] matches raw HTML on
|
|
line [line] starting at [start]. [next_line] is used to get new
|
|
lines on [lines]. Returns [Some (lines, last_line, spans,
|
|
last_byte)] with [lines] the lines after consuming the raw HTML,
|
|
[last_line] the line where it stops [spans] the byte ranges of [s]
|
|
that make up the raw HTML in reverse order and [last_byte] the
|
|
last byte included in it (guaranteed to be on [last_line]). *)
|
|
|
|
(** {1:link Links} *)
|
|
|
|
val link_destination :
|
|
string -> last:byte_pos -> start:byte_pos -> (bool * first * last) option
|
|
(** [link_destination s ~last ~start] matches a link destination
|
|
starting at [start] in the range \[[start];[last]\] (assumed on
|
|
the same line). This is [Some (delimited, first, last)] with the
|
|
data in \[[first];[last]\] the destination data. [delimited] is
|
|
[true] if [first-1] is '<' and [last + 1] is '>'. *)
|
|
|
|
val link_title :
|
|
next_line:'a next_line -> string -> 'a -> line:line_span -> start:byte_pos ->
|
|
('a * line_span * rev_spans * last) option
|
|
(** [link_title ~next_line s lines ~line ~last] is a link title on line [line]
|
|
starting at [start]. Returns [Some (lines, last_line, spans, last)] with
|
|
[lines] the lines after consuming the title, [last_line] the line where
|
|
it stops, [spans] the byte ranges of [s] that make up the title in reverse
|
|
order, [last] is on the closing delimiter and guaranteed to be on
|
|
[last_line]. *)
|
|
|
|
val link_label :
|
|
Buffer.t -> next_line:'a next_line -> string -> 'a -> line:line_span ->
|
|
start:byte_pos -> ('a * line_span * rev_spans * last * string) option
|
|
(** [link_label buf ~next_line s lines ~line ~start] matches a link label
|
|
on line [line] starting at [start]. The byte ranges have the label's
|
|
content, the string is the normalized label. [buf] is used as scratch
|
|
space. *)
|
|
|
|
(** {1:leaf_block Leaf blocks}
|
|
|
|
Unless otherwise noted [start] is always after leading blanks. *)
|
|
|
|
type html_block_end_cond =
|
|
[ `End_str of string | `End_cond_1 | `End_blank | `End_blank_7 ]
|
|
(** The type for HTML block end conditions. *)
|
|
|
|
type line_type =
|
|
| Atx_heading_line of heading_level * byte_pos (* after # *) * first * last
|
|
| Blank_line
|
|
| Block_quote_line
|
|
| Fenced_code_block_line of first * last * (first * last) option
|
|
| Html_block_line of html_block_end_cond
|
|
| Indented_code_block_line
|
|
| List_marker_line of ([ `Ordered of int * char | `Unordered of char ] * last)
|
|
| Paragraph_line
|
|
| Setext_underline_line of heading_level * last
|
|
| Thematic_break_line of last
|
|
| Ext_table_row of last
|
|
| Ext_footnote_label of rev_spans * last * string
|
|
| Nomatch (* built-in [None] to avoid option allocs *)
|
|
|
|
val thematic_break : string -> last:byte_pos -> start:byte_pos -> line_type
|
|
(** [thematic_break s ~last ~start] matches a thematic break in the range
|
|
in the range \[[start];[last]\]. The returned position is the last
|
|
non-blank. *)
|
|
|
|
val atx_heading :
|
|
string -> last:byte_pos -> start:byte_pos -> line_type
|
|
(** [atx_heading s ~first ~last] is an ATX heading in the range
|
|
\[[start];[last]\]. *)
|
|
|
|
val setext_heading_underline :
|
|
string -> last:byte_pos -> start:byte_pos -> line_type
|
|
(** [setext_heading_underline s ~last ~start] is a setext heading
|
|
underline in the range \[[start];[last]\]. The returned position
|
|
is the last underline char. *)
|
|
|
|
val fenced_code_block_start :
|
|
string -> last:byte_pos -> start:byte_pos -> line_type
|
|
(** [fenced_code_block_start s ~last ~start] is the start of a fenced
|
|
code block line in the range \[[start];[last]\]. The first span is
|
|
the fence and the second one is the info string (if any). *)
|
|
|
|
val fenced_code_block_continue :
|
|
fence:char * int -> string -> last:byte_pos -> start:byte_pos ->
|
|
[ `Close of first * last | `Code ]
|
|
(** [fenced_code_block_continue ~fence s ~last ~start] indicates
|
|
whether the fence code continues or closes in the the range
|
|
\[[start];[last]\] given the opening [open] which indicates the
|
|
indent, fence char and number of fence chars. *)
|
|
|
|
val html_block_start :
|
|
string -> last:byte_pos -> start:byte_pos -> line_type
|
|
(** [html_block_start s ~last ~start] matches the start of an HTML
|
|
block starting at [start] in the range \[[start];[last]\] and on
|
|
success returns the condition to end it. *)
|
|
|
|
val html_block_end :
|
|
end_cond:html_block_end_cond -> string -> last:byte_pos -> start:byte_pos ->
|
|
bool
|
|
(** [html_block ~end_code s ~last ~start] is [true] if the HTML block
|
|
end with [end_code] in the the range \[[start];[last]\] *)
|
|
|
|
val ext_table_row : string -> last:byte_pos -> start:byte_pos -> line_type
|
|
(** [ext_table s ~last ~start] matches a table row in the range
|
|
\[[start];[last]\]. The returned position is the rightmost [|]. *)
|
|
|
|
val ext_footnote_label :
|
|
Buffer.t -> string -> line_pos:Textloc.line_pos -> last:byte_pos ->
|
|
start:byte_pos -> line_type
|
|
(** [ext_footnote_label s ~last ~start] matches a footnote label the range
|
|
\[[start];[last]\]. The returned position is the rightmost [:].
|
|
This remains on the same line. *)
|
|
|
|
val could_be_link_reference_definition :
|
|
string -> last:byte_pos -> start:byte_pos -> bool
|
|
(** [could_be_link_reference_definition s ~last ~start] is [true] if
|
|
in the range \[[start];[last]\] could hold a link reference definition. *)
|
|
|
|
(** {1:container Container blocks} *)
|
|
|
|
val list_marker :
|
|
string -> last:byte_pos -> start:byte_pos -> line_type
|
|
(** [list_marker s ~last ~start] is a list marker in the range
|
|
\[[start];[last]\]. This checks there's at least one space
|
|
following unless the item is empty. *)
|
|
|
|
val ext_task_marker :
|
|
string -> last:byte_pos -> start:byte_pos -> (Uchar.t * last) option
|
|
(** [ext_task_marker s ~last ~start] is a list task item marker in the
|
|
range \[[start];[last]\]. *)
|
|
|
|
(*---------------------------------------------------------------------------
|
|
Copyright (c) 2021 The cmarkit programmers
|
|
|
|
Permission to use, copy, modify, and/or distribute this software for any
|
|
purpose with or without fee is hereby granted, provided that the above
|
|
copyright notice and this permission notice appear in all copies.
|
|
|
|
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
|
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
|
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
|
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
|
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
|
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
|
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
|
---------------------------------------------------------------------------*)
|