ocaml-containers/src/core/CCUtf8_string.ml

(** {1 UTF8 strings} *)

(** Ref {{: https://en.wikipedia.org/wiki/UTF-8} Wikipedia}

    We only deal with UTF8 strings as they naturally map to OCaml bytestrings *)

type uchar = Uchar.t
type 'a gen = unit -> 'a option
type 'a iter = ('a -> unit) -> unit

(* compat shim *)

[@@@ocaml.warning "-32"]

let equal (a : string) b = Stdlib.( = ) a b
let hash : string -> int = Hashtbl.hash

[@@@ocaml.warning "+32"]

(* end compat shim *)

let pp = Format.pp_print_string

include String

let empty = ""
let to_string x = x

(** State for decoding *)
module Dec = struct
  type t = {
    s: string;
    len: int;
    (* max offset *) mutable i: int; (* offset *)
  }

  let make ?(idx = 0) (s : string) : t = { s; i = idx; len = String.length s }
end

let n_bytes = length

exception Malformed of string * int
(** Malformed string at given offset *)

(* decode next char. Mutate state, calls [yield c] if a char [c] is
   read, [stop ()] otherwise.
   @raise Malformed if an invalid substring is met *)
let next_ (type a) (st : Dec.t) ~(yield : uchar -> a) ~(stop : unit -> a) () : a
    =
  let open Dec in
  let malformed st = raise (Malformed (st.s, st.i)) in
  (* read a multi-byte character.
     @param acc the accumulator (containing the first byte of the char)
     @param n_bytes number of bytes to read (i.e. [width char - 1])
     @param overlong minimal bound on second byte (to detect overlong encoding)
  *)
  let read_multi ?(overlong = 0) n_bytes acc =
    (* inner loop j = 1..jmax *)
    let rec aux j acc =
      let c = Char.code st.s.[st.i + j] in
      (* check that c is in 0b10xxxxxx *)
      if c lsr 6 <> 0b10 then malformed st;
      (* overlong encoding? *)
      if j = 1 && overlong <> 0 && c land 0b111111 < overlong then malformed st;
      (* except for first, each char gives 6 bits *)
      let next = (acc lsl 6) lor (c land 0b111111) in
      if j = n_bytes then
        if (* done reading the codepoint *)
           Uchar.is_valid next then (
          st.i <- st.i + j + 1;
          (* +1 for first char *)
          yield (Uchar.unsafe_of_int next)
        ) else
          malformed st
      else
        aux (j + 1) next
    in
    assert (n_bytes >= 1);
    (* is the string long enough to contain the whole codepoint? *)
    if st.i + n_bytes < st.len then
      aux 1 acc
    (* start with j=1, first char is already processed! *)
    else
      (* char is truncated *)
      malformed st
  in
  if st.i >= st.len then
    stop ()
  else (
    let c = st.s.[st.i] in
    (* find leading byte, and detect some impossible cases
       according to https://en.wikipedia.org/wiki/Utf8#Codepage_layout *)
    match c with
    | '\000' .. '\127' ->
      st.i <- 1 + st.i;
      yield (Uchar.of_int @@ Char.code c) (* 0xxxxxxx *)
    | '\194' .. '\223' -> read_multi 1 (Char.code c land 0b11111) (* 110yyyyy *)
    | '\225' .. '\239' -> read_multi 2 (Char.code c land 0b1111) (* 1110zzzz *)
    | '\241' .. '\244' -> read_multi 3 (Char.code c land 0b111) (* 11110uuu *)
    | '\224' ->
      (* overlong: if next byte is < than [0b001000000] then the char
         would fit in 1 byte *)
      read_multi ~overlong:0b00100000 2 (Char.code c land 0b1111)
      (* 1110zzzz *)
    | '\240' ->
      (* overlong: if next byte is < than [0b000100000] then the char
         would fit in 2 bytes *)
      read_multi ~overlong:0b00010000 3 (Char.code c land 0b111)
      (* 11110uuu *)
    | '\128' .. '\193' (* 192,193 are forbidden *) | '\245' .. '\255' ->
      malformed st
  )

let to_gen ?(idx = 0) str : uchar gen =
  let st = Dec.make ~idx str in
  fun () -> next_ st ~yield:(fun c -> Some c) ~stop:(fun () -> None) ()

exception Stop

let to_iter ?(idx = 0) s : uchar iter =
 fun yield ->
  let st = Dec.make ~idx s in
  try
    while true do
      next_ st ~yield ~stop:(fun () -> raise Stop) ()
    done
  with Stop -> ()

let to_seq ?(idx = 0) s : uchar Seq.t =
  let rec loop st =
    let r = ref None in
    fun () ->
      match !r with
      | Some c -> c
      | None ->
        let c =
          next_ st
            ~yield:(fun x -> Seq.Cons (x, loop st))
            ~stop:(fun () -> Seq.Nil)
            ()
        in
        r := Some c;
        c
  in
  let st = Dec.make ~idx s in
  loop st

let iter ?idx f s = to_iter ?idx s f

let fold ?idx f acc s =
  let st = Dec.make ?idx s in
  let rec aux acc =
    next_ st
      ~yield:(fun x ->
        let acc = f acc x in
        aux acc)
      ~stop:(fun () -> acc)
      ()
  in
  aux acc

let n_chars = fold (fun x _ -> x + 1) 0

let to_list ?(idx = 0) s : uchar list =
  fold ~idx (fun acc x -> x :: acc) [] s |> List.rev

(* Convert a code point (int) into a string;
   There are various equally trivial versions of this around.
*)

let[@inline] uchar_to_bytes (c : uchar) (f : char -> unit) : unit =
  let c = Uchar.to_int c in
  let mask = 0b111111 in
  assert (Uchar.is_valid c);
  if c <= 0x7f then
    f (Char.unsafe_chr c)
  else if c <= 0x7ff then (
    f (Char.unsafe_chr (0xc0 lor (c lsr 6)));
    f (Char.unsafe_chr (0x80 lor (c land mask)))
  ) else if c <= 0xffff then (
    f (Char.unsafe_chr (0xe0 lor (c lsr 12)));
    f (Char.unsafe_chr (0x80 lor ((c lsr 6) land mask)));
    f (Char.unsafe_chr (0x80 lor (c land mask)))
  ) else if c <= 0x1fffff then (
    f (Char.unsafe_chr (0xf0 lor (c lsr 18)));
    f (Char.unsafe_chr (0x80 lor ((c lsr 12) land mask)));
    f (Char.unsafe_chr (0x80 lor ((c lsr 6) land mask)));
    f (Char.unsafe_chr (0x80 lor (c land mask)))
  ) else (
    f (Char.unsafe_chr (0xf8 lor (c lsr 24)));
    f (Char.unsafe_chr (0x80 lor ((c lsr 18) land mask)));
    f (Char.unsafe_chr (0x80 lor ((c lsr 12) land mask)));
    f (Char.unsafe_chr (0x80 lor ((c lsr 6) land mask)));
    f (Char.unsafe_chr (0x80 lor (c land mask)))
  )

(* number of bytes required to encode this codepoint. A skeleton version
   of {!uchar_to_bytes}. *)
let[@inline] uchar_num_bytes (c : uchar) : int =
  let c = Uchar.to_int c in
  if c <= 0x7f then
    1
  else if c <= 0x7ff then
    2
  else if c <= 0xffff then
    3
  else if c <= 0x1fffff then
    4
  else
    5

let of_gen g : t =
  let buf = Buffer.create 32 in
  let rec aux () =
    match g () with
    | None -> Buffer.contents buf
    | Some c ->
      uchar_to_bytes c (Buffer.add_char buf);
      aux ()
  in
  aux ()

let of_seq seq : t =
  let buf = Buffer.create 32 in
  Seq.iter (fun c -> uchar_to_bytes c (Buffer.add_char buf)) seq;
  Buffer.contents buf

let of_iter i : t =
  let buf = Buffer.create 32 in
  i (fun c -> uchar_to_bytes c (Buffer.add_char buf));
  Buffer.contents buf

let make n c =
  if n = 0 then
    empty
  else (
    let n_bytes = uchar_num_bytes c in
    let buf = Bytes.create (n * n_bytes) in
    (* copy [c] at the beginning of the buffer *)
    let i = ref 0 in
    uchar_to_bytes c (fun b ->
        Bytes.set buf !i b;
        incr i);
    (* now repeat the prefix n-1 times *)
    for j = 1 to n - 1 do
      Bytes.blit buf 0 buf (n_bytes * j) n_bytes
    done;
    Bytes.unsafe_to_string buf
  )

let[@inline] of_uchar c : t = make 1 c

let of_list l : t =
  let len = List.fold_left (fun n c -> n + uchar_num_bytes c) 0 l in
  if len > Sys.max_string_length then
    invalid_arg "CCUtf8_string.of_list: string size limit exceeded";
  let buf = Bytes.make len '\000' in
  let i = ref 0 in
  List.iter
    (fun c ->
      uchar_to_bytes c (fun byte ->
          Bytes.unsafe_set buf !i byte;
          incr i))
    l;
  assert (!i = len);
  Bytes.unsafe_to_string buf

let map f s : t =
  let buf = Buffer.create (n_bytes s) in
  iter (fun c -> uchar_to_bytes (f c) (Buffer.add_char buf)) s;
  Buffer.contents buf

let filter_map f s : t =
  let buf = Buffer.create (n_bytes s) in
  iter
    (fun c ->
      match f c with
      | None -> ()
      | Some c -> uchar_to_bytes c (Buffer.add_char buf))
    s;
  Buffer.contents buf

let flat_map f s : t =
  let buf = Buffer.create (n_bytes s) in
  iter (fun c -> iter (fun c -> uchar_to_bytes c (Buffer.add_char buf)) (f c)) s;
  Buffer.contents buf

let append = Stdlib.( ^ )
let unsafe_of_string s = s

let is_valid (s : string) : bool =
  try
    let st = Dec.make s in
    while true do
      next_ st ~yield:(fun _ -> ()) ~stop:(fun () -> raise Stop) ()
    done;
    assert false
  with
  | Malformed _ -> false
  | Stop -> true

let of_string_exn s =
  if is_valid s then
    s
  else
    invalid_arg "CCUtf8_string.of_string_exn"

let of_string s =
  if is_valid s then
    Some s
  else
    None