From 3918ed11554c63608e3e30ce601f7773b7fba766 Mon Sep 17 00:00:00 2001 From: Simon Cruanes Date: Mon, 7 Dec 2020 23:31:05 -0500 Subject: [PATCH] feat(utf8): add and expose `uchar_to_bytes` rather than encoding to buffers directly, we can expose an iterator over the bytes of an uchar. --- src/core/CCUtf8_string.ml | 80 ++++++++++++++++++++++++++------------ src/core/CCUtf8_string.mli | 7 ++++ 2 files changed, 62 insertions(+), 25 deletions(-) diff --git a/src/core/CCUtf8_string.ml b/src/core/CCUtf8_string.ml index 4811098e..c462fda6 100644 --- a/src/core/CCUtf8_string.ml +++ b/src/core/CCUtf8_string.ml @@ -179,58 +179,86 @@ let to_list ?(idx=0) s : uchar list = There are various equally trivial versions of this around. *) -let code_to_string buf (c:uchar) : unit = +let[@inline] uchar_to_bytes (c:uchar) (f:char -> unit) : unit = let c = Uchar.to_int c in let mask = 0b111111 in assert (Uchar.is_valid c); if c <= 0x7f then ( - Buffer.add_char buf (Char.unsafe_chr c) + f (Char.unsafe_chr c) ) else if c <= 0x7ff then ( - Buffer.add_char buf (Char.unsafe_chr (0xc0 lor (c lsr 6))); - Buffer.add_char buf (Char.unsafe_chr (0x80 lor (c land mask))); + f (Char.unsafe_chr (0xc0 lor (c lsr 6))); + f (Char.unsafe_chr (0x80 lor (c land mask))); ) else if c <= 0xffff then ( - Buffer.add_char buf (Char.unsafe_chr (0xe0 lor (c lsr 12))); - Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((c lsr 6) land mask))); - Buffer.add_char buf (Char.unsafe_chr (0x80 lor (c land mask))); + f (Char.unsafe_chr (0xe0 lor (c lsr 12))); + f (Char.unsafe_chr (0x80 lor ((c lsr 6) land mask))); + f (Char.unsafe_chr (0x80 lor (c land mask))); ) else if c <= 0x1fffff then ( - Buffer.add_char buf (Char.unsafe_chr (0xf0 lor (c lsr 18))); - Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((c lsr 12) land mask))); - Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((c lsr 6) land mask))); - Buffer.add_char buf (Char.unsafe_chr (0x80 lor (c land mask))); + f (Char.unsafe_chr (0xf0 lor (c lsr 18))); + f 
(Char.unsafe_chr (0x80 lor ((c lsr 12) land mask))); + f (Char.unsafe_chr (0x80 lor ((c lsr 6) land mask))); + f (Char.unsafe_chr (0x80 lor (c land mask))); ) else ( - Buffer.add_char buf (Char.unsafe_chr (0xf8 lor (c lsr 24))); - Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((c lsr 18) land mask))); - Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((c lsr 12) land mask))); - Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((c lsr 6) land mask))); - Buffer.add_char buf (Char.unsafe_chr (0x80 lor (c land mask))); + f (Char.unsafe_chr (0xf8 lor (c lsr 24))); + f (Char.unsafe_chr (0x80 lor ((c lsr 18) land mask))); + f (Char.unsafe_chr (0x80 lor ((c lsr 12) land mask))); + f (Char.unsafe_chr (0x80 lor ((c lsr 6) land mask))); + f (Char.unsafe_chr (0x80 lor (c land mask))); + ) + +(* number of bytes required to encode this codepoint. A skeleton version + of {!uchar_to_bytes}. *) +let[@inline] uchar_num_bytes (c:uchar) : int = + let c = Uchar.to_int c in + if c <= 0x7f then ( + 1 + ) else if c <= 0x7ff then ( + 2 + ) else if c <= 0xffff then ( + 3 + ) else if c <= 0x1fffff then ( + 4 + ) else ( + 5 ) let of_gen g : t = let buf = Buffer.create 32 in let rec aux () = match g() with | None -> Buffer.contents buf - | Some c -> code_to_string buf c; aux () + | Some c -> uchar_to_bytes c (Buffer.add_char buf); aux () in aux () let of_seq seq : t = let buf = Buffer.create 32 in - Seq.iter (code_to_string buf) seq; + Seq.iter (fun c -> uchar_to_bytes c (Buffer.add_char buf)) seq; Buffer.contents buf let of_iter i : t = let buf = Buffer.create 32 in - i (code_to_string buf); + i (fun c -> uchar_to_bytes c (Buffer.add_char buf)); Buffer.contents buf let of_list l : t = - let buf = Buffer.create 32 in - List.iter (code_to_string buf) l; - Buffer.contents buf + let len = List.fold_left (fun n c -> n + uchar_num_bytes c) 0 l in + if len > Sys.max_string_length then ( + invalid_arg "CCUtf8_string.of_list: string size limit exceeded"; + ); + let buf = Bytes.make len '\000' in + let i 
= ref 0 in + List.iter + (fun c -> + uchar_to_bytes c + (fun byte -> + Bytes.unsafe_set buf !i byte; + incr i)) + l; + assert (!i = len); + Bytes.unsafe_to_string buf let map f s : t = let buf = Buffer.create (n_bytes s) in - iter (fun c -> code_to_string buf (f c)) s; + iter (fun c -> uchar_to_bytes (f c) (Buffer.add_char buf)) s; Buffer.contents buf let filter_map f s : t = @@ -238,13 +266,15 @@ let filter_map f s : t = iter (fun c -> match f c with | None -> () - | Some c -> code_to_string buf c) + | Some c -> uchar_to_bytes c (Buffer.add_char buf)) s; Buffer.contents buf let flat_map f s : t = let buf = Buffer.create (n_bytes s) in - iter (fun c -> iter (code_to_string buf) (f c)) s; + iter + (fun c -> iter (fun c -> uchar_to_bytes c (Buffer.add_char buf)) (f c)) + s; Buffer.contents buf let append = Stdlib.(^) diff --git a/src/core/CCUtf8_string.mli b/src/core/CCUtf8_string.mli index 5f5a1209..93c00616 100644 --- a/src/core/CCUtf8_string.mli +++ b/src/core/CCUtf8_string.mli @@ -90,6 +90,13 @@ val of_iter : uchar iter -> t (** Build a string from unicode codepoints @since 2.8 *) +val uchar_to_bytes : uchar -> char iter +(** Translate the unicode codepoint to an iterator over its utf-8 bytes. + This can be used, for example, in combination with {!Buffer.add_char} + on a pre-allocated buffer to add the bytes one by one (despite its name, + {!Buffer.add_char} takes individual bytes, not unicode codepoints). + @since NEXT_RELEASE *) + val of_gen : uchar gen -> t val of_list : uchar list -> t