add CCUtf8_string with basic encoding and decoding functionalities

This commit is contained in:
Simon Cruanes 2018-03-06 22:15:28 -06:00
parent 524658fb0f
commit 30fca7ae9e
6 changed files with 368 additions and 2 deletions

View file

@ -11,11 +11,13 @@ build-test: [ "jbuilder" "runtest" "-p" name "-j" jobs]
depends: [
"jbuilder" {build & >= "1.0+beta12"}
"result"
"uchar"
"qtest" { test }
"qcheck" { test }
"ounit" { test }
"sequence" { test }
"gen" { test }
"uutf" { test }
"odoc" { doc }
]
depopts: [

View file

@ -18,7 +18,7 @@
(modules (run_qtest))
(libraries (sequence gen qcheck containers containers.unix
containers.data containers.thread containers.iter
containers.sexp))
containers.sexp uutf))
))
(alias

288
src/core/CCUtf8_string.ml Normal file
View file

@ -0,0 +1,288 @@
(** {1 UTF8 strings} *)
(** Ref {{: https://en.wikipedia.org/wiki/UTF-8} Wikipedia}
We only deal with UTF8 strings as they naturally map to OCaml bytestrings *)
type uchar = Uchar.t
type 'a gen = unit -> 'a option
type 'a sequence = ('a -> unit) -> unit
type t = string
let to_string x = x
let pp = Format.pp_print_string
let equal = String.equal
let compare = String.compare
let hash : t -> int = Hashtbl.hash
(** State for decoding *)
module Dec = struct
type t = {
s: string;
len: int; (* max offset *)
mutable i: int; (* offset *)
}
let make ?(idx=0) (s:string) : t =
{ s=s; i=idx; len=String.length s; }
end
exception Malformed of string * int
(** Malformed string at given offset *)
(* decode next char. Mutate state, calls [yield c] if a char [c] is
read, [stop ()] otherwise.
@raise Malformed if an invalid substring is met *)
let next_ (type a) (st : Dec.t) ~(yield:uchar -> a) ~(stop:unit -> a) () : a =
let open Dec in
(* read a multi-byte character.
@param acc the accumulator (containing the first byte of the char)
@param n_bytes number of bytes to read (i.e. [width char - 1]) *)
let read_multi n_bytes acc =
(* inner loop j = 1..jmax *)
let rec aux j acc =
let c = Char.code st.s.[ st.i + j] in
(* check that c is in 0b10xxxxxx *)
if c lsr 6 <> 0b10 then raise (Malformed (st.s,st.i));
(* except for first, each char gives 6 bits *)
let next = (acc lsl 6) lor (c land 0b111111) in
if j = n_bytes then (
(* done reading the codepoint *)
if Uchar.is_valid next then (
st.i <- st.i + j + 1; (* +1 for first char *)
yield (Uchar.unsafe_of_int next)
) else (
raise (Malformed (st.s,st.i))
)
) else (
aux (j+1) next
)
in
assert (n_bytes >= 1);
(* is the string long enough to contain the whole codepoint? *)
if st.i + n_bytes < st.len then (
aux 1 acc (* start with j=1, first char is already proccessed! *)
) else (
(* char is truncated *)
raise (Malformed (st.s,st.i))
)
in
if st.i >= st.len then (
stop ()
) else (
let c = st.s.[ st.i ] in
match c with
| '\000' .. '\127' ->
st.i <- 1 + st.i;
yield (Uchar.of_int @@ Char.code c) (* 0xxxxxxx *)
| '\192' .. '\223' -> read_multi 1 ((Char.code c) land 0b11111) (* 110yyyyy *)
| '\224' .. '\239' -> read_multi 2 ((Char.code c) land 0b1111) (* 1110zzzz *)
| '\240' .. '\247' -> read_multi 3 ((Char.code c) land 0b111) (* 11110uuu *)
| '\128' .. '\191'
| '\248' .. '\255' ->
raise (Malformed (st.s,st.i))
)
let to_gen ?(idx=0) str : uchar gen =
let st = Dec.make ~idx str in
fun () ->
next_ st
~yield:(fun c -> Some c)
~stop:(fun () -> None)
()
exception Stop
let to_seq ?(idx=0) s : uchar sequence =
fun yield ->
let st = Dec.make ~idx s in
try
while true do
next_ st ~yield
~stop:(fun () -> raise Stop)
()
done
with Stop -> ()
let iter ?idx f s = to_seq ?idx s f
let fold ?idx f acc s =
let st = Dec.make ?idx s in
let rec aux acc =
next_ st
~yield:(fun x ->
let acc = f acc x in
aux acc)
~stop:(fun () -> acc)
()
in
aux acc
let to_list ?(idx=0) s : uchar list =
fold ~idx (fun acc x -> x :: acc) [] s |> List.rev
(* Convert a code point (int) into a string;
There are various equally trivial versions of this around.
*)
let code_to_string buf (c:uchar) : unit =
let c = Uchar.to_int c in
let mask = 0b111111 in
assert (Uchar.is_valid c);
if c <= 0x7f then (
Buffer.add_char buf (Char.unsafe_chr c)
) else if c <= 0x7ff then (
Buffer.add_char buf (Char.unsafe_chr (0xc0 lor (c lsr 6)));
Buffer.add_char buf (Char.unsafe_chr (0x80 lor (c land mask)));
) else if c <= 0xffff then (
Buffer.add_char buf (Char.unsafe_chr (0xe0 lor (c lsr 12)));
Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((c lsr 6) land mask)));
Buffer.add_char buf (Char.unsafe_chr (0x80 lor (c land mask)));
) else if c <= 0x1fffff then (
Buffer.add_char buf (Char.unsafe_chr (0xf0 lor (c lsr 18)));
Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((c lsr 12) land mask)));
Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((c lsr 6) land mask)));
Buffer.add_char buf (Char.unsafe_chr (0x80 lor (c land mask)));
) else (
Buffer.add_char buf (Char.unsafe_chr (0xf8 lor (c lsr 24)));
Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((c lsr 18) land mask)));
Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((c lsr 12) land mask)));
Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((c lsr 6) land mask)));
Buffer.add_char buf (Char.unsafe_chr (0x80 lor (c land mask)));
)
let of_gen g : t =
let buf = Buffer.create 32 in
let rec aux () = match g() with
| None -> Buffer.contents buf
| Some c -> code_to_string buf c; aux ()
in
aux ()
let of_seq seq : t =
let buf = Buffer.create 32 in
seq (code_to_string buf);
Buffer.contents buf
let of_list l : t =
let buf = Buffer.create 32 in
List.iter (code_to_string buf) l;
Buffer.contents buf
let unsafe_of_string s = s
let is_valid (s:string) : bool =
try
let st = Dec.make s in
while true do
next_ st
~yield:(fun _ -> ())
~stop:(fun () -> raise Stop)
()
done;
assert false
with
| Malformed _ -> false
| Stop -> true
let of_string_exn s =
if is_valid s then s
else invalid_arg "CCUtf8_string.of_string_exn"
let of_string s = if is_valid s then Some s else None
(*$inject
let printer s = String.escaped (to_string s)
let pp_uchar (c:Uchar.t) = Printf.sprintf "0x%x" (Uchar.to_int c)
let uutf_is_valid s =
try
Uutf.String.fold_utf_8
(fun () _ -> function
| `Malformed _ -> raise Exit
| `Uchar _ -> ())
() s;
true
with Exit ->
false
let uutf_to_seq s f =
Uutf.String.fold_utf_8
(fun () _ -> function
| `Malformed _ -> f (Uchar.of_int 0xfffd)
| `Uchar c -> f c)
() s
*)
(*$R
let s = of_string_exn "このため、" in
let s' = to_seq s |> of_seq in
assert_equal ~cmp:equal ~printer s s'
*)
(*$QR
Q.string (fun s ->
Q.assume (CCString.for_all (fun c -> Char.code c < 128) s);
is_valid s)
*)
(*$QR
Q.string (fun s ->
Q.assume (CCString.for_all (fun c -> Char.code c < 128) s);
s = (of_string_exn s |> to_seq |> of_seq |> to_string)
)
*)
(*$QR
Q.string (fun s ->
Q.assume (CCString.for_all (fun c -> Char.code c < 128) s);
String.length s = List.length (of_string_exn s |> to_list)
)
*)
(*$QR
Q.string (fun s ->
Q.assume (is_valid s);
let s = of_string_exn s in
let s2 = s |> to_seq |> of_seq in
if s=s2 then true
else Q.Test.fail_reportf "s=%S, s2=%S" (to_string s)(to_string s2)
)
*)
(*$QR
Q.string (fun s ->
Q.assume (is_valid s);
let s = of_string_exn s in
let s2 = s |> to_gen |> of_gen in
if s=s2 then true
else Q.Test.fail_reportf "s=%S, s2=%S" (to_string s)(to_string s2)
)
*)
(* compare with uutf *)
(*$QR
Q.string (fun s ->
let v1 = is_valid s in
let v2 = uutf_is_valid s in
if v1=v2 then true
else Q.Test.fail_reportf "s:%S, valid: %B, uutf_valid: %B" s v1 v2
)
*)
(*$QR
Q.string (fun s ->
Q.assume (is_valid s && uutf_is_valid s);
let pp s = Q.Print.(list pp_uchar) s in
let l_uutf = uutf_to_seq s |> Sequence.to_list in
let l_co = of_string_exn s |> to_seq |> Sequence.to_list in
if l_uutf = l_co then true
else Q.Test.fail_reportf "uutf: '%s', containers: '%s', is_valid %B, uutf_is_valid %B"
(pp l_uutf) (pp l_co) (is_valid s) (uutf_is_valid s)
)
*)

View file

@ -0,0 +1,75 @@
(** {1 Unicode String, in UTF8} *)
(** A unicode string represented by a utf8 bytestring. This representation
is convenient for manipulating normal OCaml strings that are encoded
in UTF8.
We perform only basic decoding and encoding between codepoints and
bytestrings.
For more elaborate operations,
please use the excellent {{: http://erratique.ch/software/uutf} Uutf}.
@since NEXT_RELEASE
{b status}: experimental
*)
type uchar = Uchar.t
type 'a gen = unit -> 'a option
type 'a sequence = ('a -> unit) -> unit
type t = private string
(** A UTF8 string *)
val equal : t -> t -> bool
val hash : t -> int
val compare : t -> t -> int
val pp : Format.formatter -> t -> unit
val to_string : t -> string
(** Identity *)
exception Malformed of string * int
(** Malformed string at given offset *)
val to_gen : ?idx:int -> t -> uchar gen
(** Generator of unicode codepoints.
@param idx offset where to start the decoding *)
val to_seq : ?idx:int -> t -> uchar sequence
(** Sequence of unicode codepoints.
@param idx offset where to start the decoding *)
val to_list : ?idx:int -> t -> uchar list
(** List of unicode codepoints.
@param idx offset where to start the decoding *)
val fold : ?idx:int -> ('a -> uchar -> 'a) -> 'a -> t -> 'a
val iter : ?idx:int -> (uchar -> unit) -> t -> unit
val of_seq : uchar sequence -> t
val of_gen : uchar gen -> t
val of_list : uchar list -> t
val of_string_exn : string -> t
(** Validate string by checking it is valid UTF8
@raise Invalid_argument if the string is not valid UTF8 *)
val of_string : string -> t option
(** Safe version of {!of_string_exn} *)
val is_valid : string -> bool
(** Valid UTF8? *)
val unsafe_of_string : string -> t
(** Conversion from a string without validating.
Upon iteration, if an invalid substring is met, Malformed will be raised *)

View file

@ -45,5 +45,6 @@ module Set = CCSet
module String = CCString
module Vector = CCVector
module Monomorphic = CCMonomorphic
module Utf8_string = CCUtf8_string
include Monomorphic

View file

@ -5,5 +5,5 @@
(wrapped false)
(flags (:standard -w +a-4-42-44-48-50-58-32-60@8 -safe-string -nolabels -open CCMonomorphic))
(ocamlopt_flags (:standard (:include ../flambda.flags)))
(libraries (bytes result containers.monomorphic))
(libraries (bytes result uchar containers.monomorphic))
))