From 30fca7ae9ee7dfefc08954c77ae42986cf623acd Mon Sep 17 00:00:00 2001 From: Simon Cruanes Date: Tue, 6 Mar 2018 22:15:28 -0600 Subject: [PATCH] add `CCUtf8_string` with basic encoding and decoding functionalities --- containers.opam | 2 + qtest/jbuild | 2 +- src/core/CCUtf8_string.ml | 288 +++++++++++++++++++++++++++++++++++++ src/core/CCUtf8_string.mli | 75 ++++++++++ src/core/containers.ml | 1 + src/core/jbuild | 2 +- 6 files changed, 368 insertions(+), 2 deletions(-) create mode 100644 src/core/CCUtf8_string.ml create mode 100644 src/core/CCUtf8_string.mli diff --git a/containers.opam b/containers.opam index 4b6b83ef..612c6187 100644 --- a/containers.opam +++ b/containers.opam @@ -11,11 +11,13 @@ build-test: [ "jbuilder" "runtest" "-p" name "-j" jobs] depends: [ "jbuilder" {build & >= "1.0+beta12"} "result" + "uchar" "qtest" { test } "qcheck" { test } "ounit" { test } "sequence" { test } "gen" { test } + "uutf" { test } "odoc" { doc } ] depopts: [ diff --git a/qtest/jbuild b/qtest/jbuild index 8f48e686..05351c29 100644 --- a/qtest/jbuild +++ b/qtest/jbuild @@ -18,7 +18,7 @@ (modules (run_qtest)) (libraries (sequence gen qcheck containers containers.unix containers.data containers.thread containers.iter - containers.sexp)) + containers.sexp uutf)) )) (alias diff --git a/src/core/CCUtf8_string.ml b/src/core/CCUtf8_string.ml new file mode 100644 index 00000000..b21524ef --- /dev/null +++ b/src/core/CCUtf8_string.ml @@ -0,0 +1,288 @@ + +(** {1 UTF8 strings} *) + +(** Ref {{: https://en.wikipedia.org/wiki/UTF-8} Wikipedia} + + We only deal with UTF8 strings as they naturally map to OCaml bytestrings *) + +type uchar = Uchar.t +type 'a gen = unit -> 'a option +type 'a sequence = ('a -> unit) -> unit + +type t = string + +let to_string x = x + +let pp = Format.pp_print_string +let equal = String.equal +let compare = String.compare +let hash : t -> int = Hashtbl.hash + +(** State for decoding *) +module Dec = struct + type t = { + s: string; + len: int; (* max offset *) + mutable i: int; (* offset *) + } + + let make ?(idx=0) (s:string) : t = + { s=s; i=idx; len=String.length s; } +end + +exception Malformed of string * int +(** Malformed string at given offset *) + +(* decode next char. Mutate state, calls [yield c] if a char [c] is + read, [stop ()] otherwise. + @raise Malformed if an invalid substring is met *) +let next_ (type a) (st : Dec.t) ~(yield:uchar -> a) ~(stop:unit -> a) () : a = + let open Dec in + (* read a multi-byte character. + @param acc the accumulator (containing the first byte of the char) + @param n_bytes number of bytes to read (i.e. [width char - 1]) *) + let read_multi n_bytes acc = + (* inner loop j = 1..jmax *) + let rec aux j acc = + let c = Char.code st.s.[ st.i + j] in + (* check that c is in 0b10xxxxxx *) + if c lsr 6 <> 0b10 then raise (Malformed (st.s,st.i)); + (* except for first, each char gives 6 bits *) + let next = (acc lsl 6) lor (c land 0b111111) in + if j = n_bytes then ( + (* done reading the codepoint *) + if Uchar.is_valid next then ( + st.i <- st.i + j + 1; (* +1 for first char *) + yield (Uchar.unsafe_of_int next) + ) else ( + raise (Malformed (st.s,st.i)) + ) + ) else ( + aux (j+1) next + ) + in + assert (n_bytes >= 1); + (* is the string long enough to contain the whole codepoint? *) + if st.i + n_bytes < st.len then ( + aux 1 acc (* start with j=1, first char is already proccessed! *) + ) else ( + (* char is truncated *) + raise (Malformed (st.s,st.i)) + ) + in + if st.i >= st.len then ( + stop () + ) else ( + let c = st.s.[ st.i ] in + match c with + | '\000' .. '\127' -> + st.i <- 1 + st.i; + yield (Uchar.of_int @@ Char.code c) (* 0xxxxxxx *) + | '\192' .. '\223' -> read_multi 1 ((Char.code c) land 0b11111) (* 110yyyyy *) + | '\224' .. '\239' -> read_multi 2 ((Char.code c) land 0b1111) (* 1110zzzz *) + | '\240' .. '\247' -> read_multi 3 ((Char.code c) land 0b111) (* 11110uuu *) + | '\128' .. '\191' + | '\248' .. '\255' -> + raise (Malformed (st.s,st.i)) + ) + +let to_gen ?(idx=0) str : uchar gen = + let st = Dec.make ~idx str in + fun () -> + next_ st + ~yield:(fun c -> Some c) + ~stop:(fun () -> None) + () + +exception Stop + +let to_seq ?(idx=0) s : uchar sequence = + fun yield -> + let st = Dec.make ~idx s in + try + while true do + next_ st ~yield + ~stop:(fun () -> raise Stop) + () + done + with Stop -> () + +let iter ?idx f s = to_seq ?idx s f + +let fold ?idx f acc s = + let st = Dec.make ?idx s in + let rec aux acc = + next_ st + ~yield:(fun x -> + let acc = f acc x in + aux acc) + ~stop:(fun () -> acc) + () + in + aux acc + +let to_list ?(idx=0) s : uchar list = + fold ~idx (fun acc x -> x :: acc) [] s |> List.rev + +(* Convert a code point (int) into a string; + There are various equally trivial versions of this around. +*) + +let code_to_string buf (c:uchar) : unit = + let c = Uchar.to_int c in + let mask = 0b111111 in + assert (Uchar.is_valid c); + if c <= 0x7f then ( + Buffer.add_char buf (Char.unsafe_chr c) + ) else if c <= 0x7ff then ( + Buffer.add_char buf (Char.unsafe_chr (0xc0 lor (c lsr 6))); + Buffer.add_char buf (Char.unsafe_chr (0x80 lor (c land mask))); + ) else if c <= 0xffff then ( + Buffer.add_char buf (Char.unsafe_chr (0xe0 lor (c lsr 12))); + Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((c lsr 6) land mask))); + Buffer.add_char buf (Char.unsafe_chr (0x80 lor (c land mask))); + ) else if c <= 0x1fffff then ( + Buffer.add_char buf (Char.unsafe_chr (0xf0 lor (c lsr 18))); + Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((c lsr 12) land mask))); + Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((c lsr 6) land mask))); + Buffer.add_char buf (Char.unsafe_chr (0x80 lor (c land mask))); + ) else ( + Buffer.add_char buf (Char.unsafe_chr (0xf8 lor (c lsr 24))); + Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((c lsr 18) land mask))); + Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((c lsr 12) land mask))); + Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((c lsr 6) land mask))); + Buffer.add_char buf (Char.unsafe_chr (0x80 lor (c land mask))); + ) + +let of_gen g : t = + let buf = Buffer.create 32 in + let rec aux () = match g() with + | None -> Buffer.contents buf + | Some c -> code_to_string buf c; aux () + in + aux () + +let of_seq seq : t = + let buf = Buffer.create 32 in + seq (code_to_string buf); + Buffer.contents buf + +let of_list l : t = + let buf = Buffer.create 32 in + List.iter (code_to_string buf) l; + Buffer.contents buf + +let unsafe_of_string s = s + +let is_valid (s:string) : bool = + try + let st = Dec.make s in + while true do + next_ st + ~yield:(fun _ -> ()) + ~stop:(fun () -> raise Stop) + () + done; + assert false + with + | Malformed _ -> false + | Stop -> true + +let of_string_exn s = + if is_valid s then s + else invalid_arg "CCUtf8_string.of_string_exn" + +let of_string s = if is_valid s then Some s else None + +(*$inject + + let printer s = String.escaped (to_string s) + let pp_uchar (c:Uchar.t) = Printf.sprintf "0x%x" (Uchar.to_int c) + +let uutf_is_valid s = + try + Uutf.String.fold_utf_8 + (fun () _ -> function + | `Malformed _ -> raise Exit + | `Uchar _ -> ()) + () s; + true + with Exit -> + false + +let uutf_to_seq s f = + Uutf.String.fold_utf_8 + (fun () _ -> function + | `Malformed _ -> f (Uchar.of_int 0xfffd) + | `Uchar c -> f c) + () s +*) + +(*$R + let s = of_string_exn "このため、" in + let s' = to_seq s |> of_seq in + assert_equal ~cmp:equal ~printer s s' +*) + +(*$QR + Q.string (fun s -> + Q.assume (CCString.for_all (fun c -> Char.code c < 128) s); + is_valid s) +*) + +(*$QR + Q.string (fun s -> + Q.assume (CCString.for_all (fun c -> Char.code c < 128) s); + s = (of_string_exn s |> to_seq |> of_seq |> to_string) + ) +*) + +(*$QR + Q.string (fun s -> + Q.assume (CCString.for_all (fun c -> Char.code c < 128) s); + String.length s = List.length (of_string_exn s |> to_list) + ) +*) + +(*$QR + Q.string (fun s -> + Q.assume (is_valid s); + let s = of_string_exn s in + let s2 = s |> to_seq |> of_seq in + if s=s2 then true + else Q.Test.fail_reportf "s=%S, s2=%S" (to_string s)(to_string s2) + ) +*) + +(*$QR + Q.string (fun s -> + Q.assume (is_valid s); + let s = of_string_exn s in + let s2 = s |> to_gen |> of_gen in + if s=s2 then true + else Q.Test.fail_reportf "s=%S, s2=%S" (to_string s)(to_string s2) + ) +*) + +(* compare with uutf *) + +(*$QR + Q.string (fun s -> + let v1 = is_valid s in + let v2 = uutf_is_valid s in + if v1=v2 then true + else Q.Test.fail_reportf "s:%S, valid: %B, uutf_valid: %B" s v1 v2 + ) +*) + +(*$QR + Q.string (fun s -> + Q.assume (is_valid s && uutf_is_valid s); + let pp s = Q.Print.(list pp_uchar) s in + let l_uutf = uutf_to_seq s |> Sequence.to_list in + let l_co = of_string_exn s |> to_seq |> Sequence.to_list in + if l_uutf = l_co then true + else Q.Test.fail_reportf "uutf: '%s', containers: '%s', is_valid %B, uutf_is_valid %B" + (pp l_uutf) (pp l_co) (is_valid s) (uutf_is_valid s) + ) + *) diff --git a/src/core/CCUtf8_string.mli b/src/core/CCUtf8_string.mli new file mode 100644 index 00000000..64843c18 --- /dev/null +++ b/src/core/CCUtf8_string.mli @@ -0,0 +1,75 @@ + +(** {1 Unicode String, in UTF8} *) + +(** A unicode string represented by a utf8 bytestring. This representation + is convenient for manipulating normal OCaml strings that are encoded + in UTF8. + + We perform only basic decoding and encoding between codepoints and + bytestrings. + For more elaborate operations, + please use the excellent {{: http://erratique.ch/software/uutf} Uutf}. + + @since NEXT_RELEASE + + {b status}: experimental +*) + + +type uchar = Uchar.t +type 'a gen = unit -> 'a option +type 'a sequence = ('a -> unit) -> unit + +type t = private string +(** A UTF8 string *) + +val equal : t -> t -> bool + +val hash : t -> int + +val compare : t -> t -> int + +val pp : Format.formatter -> t -> unit + +val to_string : t -> string +(** Identity *) + +exception Malformed of string * int +(** Malformed string at given offset *) + +val to_gen : ?idx:int -> t -> uchar gen +(** Generator of unicode codepoints. + @param idx offset where to start the decoding *) + +val to_seq : ?idx:int -> t -> uchar sequence +(** Sequence of unicode codepoints. + @param idx offset where to start the decoding *) + +val to_list : ?idx:int -> t -> uchar list +(** List of unicode codepoints. + @param idx offset where to start the decoding *) + +val fold : ?idx:int -> ('a -> uchar -> 'a) -> 'a -> t -> 'a + +val iter : ?idx:int -> (uchar -> unit) -> t -> unit + +val of_seq : uchar sequence -> t + +val of_gen : uchar gen -> t + +val of_list : uchar list -> t + +val of_string_exn : string -> t +(** Validate string by checking it is valid UTF8 + @raise Invalid_argument if the string is not valid UTF8 *) + +val of_string : string -> t option +(** Safe version of {!of_string_exn} *) + +val is_valid : string -> bool +(** Valid UTF8? *) + +val unsafe_of_string : string -> t +(** Conversion from a string without validating. + Upon iteration, if an invalid substring is met, Malformed will be raised *) + diff --git a/src/core/containers.ml b/src/core/containers.ml index 15945033..b12a4a2c 100644 --- a/src/core/containers.ml +++ b/src/core/containers.ml @@ -45,5 +45,6 @@ module Set = CCSet module String = CCString module Vector = CCVector module Monomorphic = CCMonomorphic +module Utf8_string = CCUtf8_string include Monomorphic diff --git a/src/core/jbuild b/src/core/jbuild index 172c8daf..2c848845 100644 --- a/src/core/jbuild +++ b/src/core/jbuild @@ -5,5 +5,5 @@ (wrapped false) (flags (:standard -w +a-4-42-44-48-50-58-32-60@8 -safe-string -nolabels -open CCMonomorphic)) (ocamlopt_flags (:standard (:include ../flambda.flags))) - (libraries (bytes result containers.monomorphic)) + (libraries (bytes result uchar containers.monomorphic)) ))