feat(utf8): add and expose uchar_to_bytes

rather than encoding to buffers directly, we can expose an iterator over the bytes of an uchar.
2026-01-28 11:54:51 -05:00 · 2020-12-07 23:31:05 -05:00 · 2020-12-07 23:31:05 -05:00 · 3918ed1155
commit 3918ed1155
parent f9ee8d0e89
2 changed files with 62 additions and 25 deletions
--- a/src/core/CCUtf8_string.ml
+++ b/src/core/CCUtf8_string.ml
@ -179,58 +179,86 @@ let to_list ?(idx=0) s : uchar list =
   There are various equally trivial versions of this around.
 *)
-let code_to_string buf (c:uchar) : unit =
+let[@inline] uchar_to_bytes (c:uchar) (f:char -> unit) : unit =
  let c = Uchar.to_int c in
  let mask = 0b111111 in
  assert (Uchar.is_valid c);
  if c <= 0x7f then (
-    Buffer.add_char buf (Char.unsafe_chr c)
+    f (Char.unsafe_chr c)
  ) else if c <= 0x7ff then (
-    Buffer.add_char buf (Char.unsafe_chr (0xc0 lor (c lsr 6)));
+    f (Char.unsafe_chr (0xc0 lor (c lsr 6)));
-    Buffer.add_char buf (Char.unsafe_chr (0x80 lor (c land mask)));
+    f (Char.unsafe_chr (0x80 lor (c land mask)));
  ) else if c <= 0xffff then (
-    Buffer.add_char buf (Char.unsafe_chr (0xe0 lor (c lsr 12)));
+    f (Char.unsafe_chr (0xe0 lor (c lsr 12)));
-    Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((c lsr 6) land mask)));
+    f (Char.unsafe_chr (0x80 lor ((c lsr 6) land mask)));
-    Buffer.add_char buf (Char.unsafe_chr (0x80 lor (c land mask)));
+    f (Char.unsafe_chr (0x80 lor (c land mask)));
  ) else if c <= 0x1fffff then (
-    Buffer.add_char buf (Char.unsafe_chr (0xf0 lor (c lsr 18)));
+    f (Char.unsafe_chr (0xf0 lor (c lsr 18)));
-    Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((c lsr 12) land mask)));
+    f (Char.unsafe_chr (0x80 lor ((c lsr 12) land mask)));
-    Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((c lsr 6) land mask)));
+    f (Char.unsafe_chr (0x80 lor ((c lsr 6) land mask)));
-    Buffer.add_char buf (Char.unsafe_chr (0x80 lor (c land mask)));
+    f (Char.unsafe_chr (0x80 lor (c land mask)));
  ) else (
-    Buffer.add_char buf (Char.unsafe_chr (0xf8 lor (c lsr 24)));
+    f (Char.unsafe_chr (0xf8 lor (c lsr 24)));
-    Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((c lsr 18) land mask)));
+    f (Char.unsafe_chr (0x80 lor ((c lsr 18) land mask)));
-    Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((c lsr 12) land mask)));
+    f (Char.unsafe_chr (0x80 lor ((c lsr 12) land mask)));
-    Buffer.add_char buf (Char.unsafe_chr (0x80 lor ((c lsr 6) land mask)));
+    f (Char.unsafe_chr (0x80 lor ((c lsr 6) land mask)));
-    Buffer.add_char buf (Char.unsafe_chr (0x80 lor (c land mask)));
+    f (Char.unsafe_chr (0x80 lor (c land mask)));
  )
 (* number of bytes required to encode this codepoint. A skeleton version
   of {!uchar_to_bytes}. *)
 let[@inline] uchar_num_bytes (c:uchar) : int =
  let c = Uchar.to_int c in
  if c <= 0x7f then (
    1
  ) else if c <= 0x7ff then (
    2
  ) else if c <= 0xffff then (
    3
  ) else if c <= 0x1fffff then (
    4
  ) else (
    5
  )
 let of_gen g : t =
  let buf = Buffer.create 32 in
  let rec aux () = match g() with
    | None -> Buffer.contents buf
-    | Some c -> code_to_string buf c; aux ()
+    | Some c -> uchar_to_bytes c (Buffer.add_char buf); aux ()
  in
  aux ()
 let of_seq seq : t =
  let buf = Buffer.create 32 in
-  Seq.iter (code_to_string buf) seq;
+  Seq.iter (fun c -> uchar_to_bytes c (Buffer.add_char buf)) seq;
  Buffer.contents buf
 let of_iter i : t =
  let buf = Buffer.create 32 in
-  i (code_to_string buf);
+  i (fun c -> uchar_to_bytes c (Buffer.add_char buf));
  Buffer.contents buf
 let of_list l : t =
-  let buf = Buffer.create 32 in
+  let len = List.fold_left (fun n c -> n + uchar_num_bytes c) 0 l in
-  List.iter (code_to_string buf) l;
+  if len > Sys.max_string_length then (
-  Buffer.contents buf
+    invalid_arg "CCUtf8_string.of_list: string size limit exceeded";
  );
  let buf = Bytes.make len '\000' in
  let i = ref 0 in
  List.iter
    (fun c ->
       uchar_to_bytes c
         (fun byte ->
           Bytes.unsafe_set buf !i byte;
           incr i))
    l;
  assert (!i = len);
  Bytes.unsafe_to_string buf
 let map f s : t =
  let buf = Buffer.create (n_bytes s) in
-  iter (fun c -> code_to_string buf (f c)) s;
+  iter (fun c -> uchar_to_bytes (f c) (Buffer.add_char buf)) s;
  Buffer.contents buf
 let filter_map f s : t =
@ -238,13 +266,15 @@ let filter_map f s : t =
  iter
    (fun c -> match f c with
       | None -> ()
-       | Some c -> code_to_string buf c)
+       | Some c -> uchar_to_bytes c (Buffer.add_char buf))
    s;
  Buffer.contents buf
 let flat_map f s : t =
  let buf = Buffer.create (n_bytes s) in
-  iter (fun c -> iter (code_to_string buf) (f c)) s;
+  iter
    (fun c -> iter (fun c -> uchar_to_bytes c (Buffer.add_char buf)) (f c))
    s;
  Buffer.contents buf
 let append = Stdlib.(^)
--- a/src/core/CCUtf8_string.mli
+++ b/src/core/CCUtf8_string.mli
@ -90,6 +90,13 @@ val of_iter : uchar iter -> t
 (** Build a string from unicode codepoints
    @since 2.8 *)
 val uchar_to_bytes : uchar -> char iter
 (** Translate the unicode codepoint to a list of utf-8 bytes.
    This can be used, for example, in combination with {!Buffer.add_char}
    on a pre-allocated buffer to add the bytes one by one (despite its name,
      {!Buffer.add_char} takes individual bytes, not unicode codepoints).
    @since NEXT_RELEASE *)
 val of_gen : uchar gen -> t
 val of_list : uchar list -> t