mirror of
https://github.com/c-cube/ocaml-containers.git
synced 2025-12-06 03:05:28 -05:00
287 lines
7.6 KiB
OCaml
287 lines
7.6 KiB
OCaml
|
|
(** {1 UTF8 strings} *)
|
|
|
|
(** Ref {{: https://en.wikipedia.org/wiki/UTF-8} Wikipedia}
|
|
|
|
We only deal with UTF8 strings as they naturally map to OCaml bytestrings *)
|
|
|
|
type uchar = Uchar.t
|
|
type 'a gen = unit -> 'a option
|
|
type 'a sequence = ('a -> unit) -> unit
|
|
|
|
let equal (a:string) b = Pervasives.(=) a b
|
|
let hash : string -> int = Hashtbl.hash
|
|
let pp = Format.pp_print_string
|
|
|
|
include String
|
|
|
|
let to_string x = x
|
|
|
|
(** State for decoding *)
|
|
module Dec = struct
|
|
type t = {
|
|
s: string;
|
|
len: int; (* max offset *)
|
|
mutable i: int; (* offset *)
|
|
}
|
|
|
|
let make ?(idx=0) (s:string) : t =
|
|
{ s=s; i=idx; len=String.length s; }
|
|
end
|
|
|
|
exception Malformed of string * int
|
|
(** Malformed string at given offset *)
|
|
|
|
(* decode next char. Mutate state, calls [yield c] if a char [c] is
|
|
read, [stop ()] otherwise.
|
|
@raise Malformed if an invalid substring is met *)
|
|
let next_ (type a) (st : Dec.t) ~(yield:uchar -> a) ~(stop:unit -> a) () : a =
|
|
let open Dec in
|
|
(* read a multi-byte character.
|
|
@param acc the accumulator (containing the first byte of the char)
|
|
@param n_bytes number of bytes to read (i.e. [width char - 1]) *)
|
|
let read_multi n_bytes acc =
|
|
(* inner loop j = 1..jmax *)
|
|
let rec aux j acc =
|
|
let c = Char.code st.s.[ st.i + j] in
|
|
(* check that c is in 0b10xxxxxx *)
|
|
if c lsr 6 <> 0b10 then raise (Malformed (st.s,st.i));
|
|
(* except for first, each char gives 6 bits *)
|
|
let next = (acc lsl 6) lor (c land 0b111111) in
|
|
if j = n_bytes then (
|
|
(* done reading the codepoint *)
|
|
if Uchar.is_valid next then (
|
|
st.i <- st.i + j + 1; (* +1 for first char *)
|
|
yield (Uchar.unsafe_of_int next)
|
|
) else (
|
|
raise (Malformed (st.s,st.i))
|
|
)
|
|
) else (
|
|
aux (j+1) next
|
|
)
|
|
in
|
|
assert (n_bytes >= 1);
|
|
(* is the string long enough to contain the whole codepoint? *)
|
|
if st.i + n_bytes < st.len then (
|
|
aux 1 acc (* start with j=1, first char is already proccessed! *)
|
|
) else (
|
|
(* char is truncated *)
|
|
raise (Malformed (st.s,st.i))
|
|
)
|
|
in
|
|
if st.i >= st.len then (
|
|
stop ()
|
|
) else (
|
|
let c = st.s.[ st.i ] in
|
|
match c with
|
|
| '\000' .. '\127' ->
|
|
st.i <- 1 + st.i;
|
|
yield (Uchar.of_int @@ Char.code c) (* 0xxxxxxx *)
|
|
| '\192' .. '\223' -> read_multi 1 ((Char.code c) land 0b11111) (* 110yyyyy *)
|
|
| '\224' .. '\239' -> read_multi 2 ((Char.code c) land 0b1111) (* 1110zzzz *)
|
|
| '\240' .. '\247' -> read_multi 3 ((Char.code c) land 0b111) (* 11110uuu *)
|
|
| '\128' .. '\191'
|
|
| '\248' .. '\255' ->
|
|
raise (Malformed (st.s,st.i))
|
|
)
|
|
|
|
let to_gen ?(idx=0) str : uchar gen =
|
|
let st = Dec.make ~idx str in
|
|
fun () ->
|
|
next_ st
|
|
~yield:(fun c -> Some c)
|
|
~stop:(fun () -> None)
|
|
()
|
|
|
|
exception Stop
|
|
|
|
let to_seq ?(idx=0) s : uchar sequence =
|
|
fun yield ->
|
|
let st = Dec.make ~idx s in
|
|
try
|
|
while true do
|
|
next_ st ~yield
|
|
~stop:(fun () -> raise Stop)
|
|
()
|
|
done
|
|
with Stop -> ()
|
|
|
|
let iter ?idx f s = to_seq ?idx s f
|
|
|
|
let fold ?idx f acc s =
|
|
let st = Dec.make ?idx s in
|
|
let rec aux acc =
|
|
next_ st
|
|
~yield:(fun x ->
|
|
let acc = f acc x in
|
|
aux acc)
|
|
~stop:(fun () -> acc)
|
|
()
|
|
in
|
|
aux acc
|
|
|
|
let to_list ?(idx=0) s : uchar list =
|
|
fold ~idx (fun acc x -> x :: acc) [] s |> List.rev
|
|
|
|
(* Convert a code point (int) into a string;
|
|
There are various equally trivial versions of this around.
|
|
*)
|
|
|
|
let code_to_string buf (c:uchar) : unit =
|
|
let c = Uchar.to_int c in
|
|
let mask = 0b111111 in
|
|
assert (Uchar.is_valid c);
|
|
if c <= 0x7f then (
|
|
Buffer.add_char buf (Char.unsafe_chr c)
|
|
) else if c <= 0x7ff then (
|
|
Buffer.add_char buf (Char.unsafe_chr (0xc0 lor (c lsr 6)));
|
|
Buffer.add_char buf (Char.unsafe_chr (0x80 lor (c land mask)));
|
|
) else if c <= 0xffff then (
|
|
Buffer.add_char buf (Char.unsafe_chr (0xe0 lor (c lsr 12)));
|
|
Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((c lsr 6) land mask)));
|
|
Buffer.add_char buf (Char.unsafe_chr (0x80 lor (c land mask)));
|
|
) else if c <= 0x1fffff then (
|
|
Buffer.add_char buf (Char.unsafe_chr (0xf0 lor (c lsr 18)));
|
|
Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((c lsr 12) land mask)));
|
|
Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((c lsr 6) land mask)));
|
|
Buffer.add_char buf (Char.unsafe_chr (0x80 lor (c land mask)));
|
|
) else (
|
|
Buffer.add_char buf (Char.unsafe_chr (0xf8 lor (c lsr 24)));
|
|
Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((c lsr 18) land mask)));
|
|
Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((c lsr 12) land mask)));
|
|
Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((c lsr 6) land mask)));
|
|
Buffer.add_char buf (Char.unsafe_chr (0x80 lor (c land mask)));
|
|
)
|
|
|
|
let of_gen g : t =
|
|
let buf = Buffer.create 32 in
|
|
let rec aux () = match g() with
|
|
| None -> Buffer.contents buf
|
|
| Some c -> code_to_string buf c; aux ()
|
|
in
|
|
aux ()
|
|
|
|
let of_seq seq : t =
|
|
let buf = Buffer.create 32 in
|
|
seq (code_to_string buf);
|
|
Buffer.contents buf
|
|
|
|
let of_list l : t =
|
|
let buf = Buffer.create 32 in
|
|
List.iter (code_to_string buf) l;
|
|
Buffer.contents buf
|
|
|
|
let unsafe_of_string s = s
|
|
|
|
let is_valid (s:string) : bool =
|
|
try
|
|
let st = Dec.make s in
|
|
while true do
|
|
next_ st
|
|
~yield:(fun _ -> ())
|
|
~stop:(fun () -> raise Stop)
|
|
()
|
|
done;
|
|
assert false
|
|
with
|
|
| Malformed _ -> false
|
|
| Stop -> true
|
|
|
|
let of_string_exn s =
|
|
if is_valid s then s
|
|
else invalid_arg "CCUtf8_string.of_string_exn"
|
|
|
|
let of_string s = if is_valid s then Some s else None
|
|
|
|
(*$inject
|
|
|
|
let printer s = String.escaped (to_string s)
|
|
let pp_uchar (c:Uchar.t) = Printf.sprintf "0x%x" (Uchar.to_int c)
|
|
|
|
let uutf_is_valid s =
|
|
try
|
|
Uutf.String.fold_utf_8
|
|
(fun () _ -> function
|
|
| `Malformed _ -> raise Exit
|
|
| `Uchar _ -> ())
|
|
() s;
|
|
true
|
|
with Exit ->
|
|
false
|
|
|
|
let uutf_to_seq s f =
|
|
Uutf.String.fold_utf_8
|
|
(fun () _ -> function
|
|
| `Malformed _ -> f (Uchar.of_int 0xfffd)
|
|
| `Uchar c -> f c)
|
|
() s
|
|
*)
|
|
|
|
(*$R
|
|
let s = of_string_exn "このため、" in
|
|
let s' = to_seq s |> of_seq in
|
|
assert_equal ~cmp:equal ~printer s s'
|
|
*)
|
|
|
|
(*$QR
|
|
Q.string (fun s ->
|
|
Q.assume (CCString.for_all (fun c -> Char.code c < 128) s);
|
|
is_valid s)
|
|
*)
|
|
|
|
(*$QR
|
|
Q.string (fun s ->
|
|
Q.assume (CCString.for_all (fun c -> Char.code c < 128) s);
|
|
s = (of_string_exn s |> to_seq |> of_seq |> to_string)
|
|
)
|
|
*)
|
|
|
|
(*$QR
|
|
Q.string (fun s ->
|
|
Q.assume (CCString.for_all (fun c -> Char.code c < 128) s);
|
|
String.length s = List.length (of_string_exn s |> to_list)
|
|
)
|
|
*)
|
|
|
|
(*$QR
|
|
Q.string (fun s ->
|
|
Q.assume (is_valid s);
|
|
let s = of_string_exn s in
|
|
let s2 = s |> to_seq |> of_seq in
|
|
if s=s2 then true
|
|
else Q.Test.fail_reportf "s=%S, s2=%S" (to_string s)(to_string s2)
|
|
)
|
|
*)
|
|
|
|
(*$QR
|
|
Q.string (fun s ->
|
|
Q.assume (is_valid s);
|
|
let s = of_string_exn s in
|
|
let s2 = s |> to_gen |> of_gen in
|
|
if s=s2 then true
|
|
else Q.Test.fail_reportf "s=%S, s2=%S" (to_string s)(to_string s2)
|
|
)
|
|
*)
|
|
|
|
(* compare with uutf *)
|
|
|
|
(*$QR
|
|
Q.string (fun s ->
|
|
let v1 = is_valid s in
|
|
let v2 = uutf_is_valid s in
|
|
if v1=v2 then true
|
|
else Q.Test.fail_reportf "s:%S, valid: %B, uutf_valid: %B" s v1 v2
|
|
)
|
|
*)
|
|
|
|
(*$QR
|
|
Q.string (fun s ->
|
|
Q.assume (is_valid s && uutf_is_valid s);
|
|
let pp s = Q.Print.(list pp_uchar) s in
|
|
let l_uutf = uutf_to_seq s |> Sequence.to_list in
|
|
let l_co = of_string_exn s |> to_seq |> Sequence.to_list in
|
|
if l_uutf = l_co then true
|
|
else Q.Test.fail_reportf "uutf: '%s', containers: '%s', is_valid %B, uutf_is_valid %B"
|
|
(pp l_uutf) (pp l_co) (is_valid s) (uutf_is_valid s)
|
|
)
|
|
*)
|