mirror of
https://github.com/c-cube/ocaml-containers.git
synced 2025-12-06 11:15:31 -05:00
utf8string: detect overlong encodings
- also, stronger tests for utf8string
This commit is contained in:
parent
fe88bafe77
commit
ea4a4e4ffb
1 changed files with 27 additions and 12 deletions
|
|
@ -37,15 +37,20 @@ exception Malformed of string * int
|
||||||
@raise Malformed if an invalid substring is met *)
|
@raise Malformed if an invalid substring is met *)
|
||||||
let next_ (type a) (st : Dec.t) ~(yield:uchar -> a) ~(stop:unit -> a) () : a =
|
let next_ (type a) (st : Dec.t) ~(yield:uchar -> a) ~(stop:unit -> a) () : a =
|
||||||
let open Dec in
|
let open Dec in
|
||||||
|
let malformed st = raise (Malformed (st.s,st.i)) in
|
||||||
(* read a multi-byte character.
|
(* read a multi-byte character.
|
||||||
@param acc the accumulator (containing the first byte of the char)
|
@param acc the accumulator (containing the first byte of the char)
|
||||||
@param n_bytes number of bytes to read (i.e. [width char - 1]) *)
|
@param n_bytes number of bytes to read (i.e. [width char - 1])
|
||||||
let read_multi n_bytes acc =
|
@param overlong minimal bound on second byte (to detect overlong encoding)
|
||||||
|
*)
|
||||||
|
let read_multi ?(overlong=0) n_bytes acc =
|
||||||
(* inner loop j = 1..jmax *)
|
(* inner loop j = 1..jmax *)
|
||||||
let rec aux j acc =
|
let rec aux j acc =
|
||||||
let c = Char.code st.s.[ st.i + j] in
|
let c = Char.code st.s.[ st.i + j] in
|
||||||
(* check that c is in 0b10xxxxxx *)
|
(* check that c is in 0b10xxxxxx *)
|
||||||
if c lsr 6 <> 0b10 then raise (Malformed (st.s,st.i));
|
if c lsr 6 <> 0b10 then malformed st;
|
||||||
|
(* overlong encoding? *)
|
||||||
|
if j=1 && overlong<>0 && (c land 0b111111) < overlong then malformed st;
|
||||||
(* except for first, each char gives 6 bits *)
|
(* except for first, each char gives 6 bits *)
|
||||||
let next = (acc lsl 6) lor (c land 0b111111) in
|
let next = (acc lsl 6) lor (c land 0b111111) in
|
||||||
if j = n_bytes then (
|
if j = n_bytes then (
|
||||||
|
|
@ -54,7 +59,7 @@ let next_ (type a) (st : Dec.t) ~(yield:uchar -> a) ~(stop:unit -> a) () : a =
|
||||||
st.i <- st.i + j + 1; (* +1 for first char *)
|
st.i <- st.i + j + 1; (* +1 for first char *)
|
||||||
yield (Uchar.unsafe_of_int next)
|
yield (Uchar.unsafe_of_int next)
|
||||||
) else (
|
) else (
|
||||||
raise (Malformed (st.s,st.i))
|
malformed st;
|
||||||
)
|
)
|
||||||
) else (
|
) else (
|
||||||
aux (j+1) next
|
aux (j+1) next
|
||||||
|
|
@ -66,7 +71,7 @@ let next_ (type a) (st : Dec.t) ~(yield:uchar -> a) ~(stop:unit -> a) () : a =
|
||||||
aux 1 acc (* start with j=1, first char is already proccessed! *)
|
aux 1 acc (* start with j=1, first char is already proccessed! *)
|
||||||
) else (
|
) else (
|
||||||
(* char is truncated *)
|
(* char is truncated *)
|
||||||
raise (Malformed (st.s,st.i))
|
malformed st;
|
||||||
)
|
)
|
||||||
in
|
in
|
||||||
if st.i >= st.len then (
|
if st.i >= st.len then (
|
||||||
|
|
@ -80,11 +85,18 @@ let next_ (type a) (st : Dec.t) ~(yield:uchar -> a) ~(stop:unit -> a) () : a =
|
||||||
st.i <- 1 + st.i;
|
st.i <- 1 + st.i;
|
||||||
yield (Uchar.of_int @@ Char.code c) (* 0xxxxxxx *)
|
yield (Uchar.of_int @@ Char.code c) (* 0xxxxxxx *)
|
||||||
| '\194' .. '\223' -> read_multi 1 ((Char.code c) land 0b11111) (* 110yyyyy *)
|
| '\194' .. '\223' -> read_multi 1 ((Char.code c) land 0b11111) (* 110yyyyy *)
|
||||||
| '\224' .. '\239' -> read_multi 2 ((Char.code c) land 0b1111) (* 1110zzzz *)
|
| '\225' .. '\239' -> read_multi 2 ((Char.code c) land 0b1111) (* 1110zzzz *)
|
||||||
| '\240' .. '\244' -> read_multi 3 ((Char.code c) land 0b111) (* 11110uuu *)
|
| '\241' .. '\244' -> read_multi 3 ((Char.code c) land 0b111) (* 11110uuu *)
|
||||||
|
| '\224' ->
|
||||||
|
(* overlong: if next byte is < than [0b001000000] then the char
|
||||||
|
would fit in 1 byte *)
|
||||||
|
read_multi ~overlong:0b00100000 2 ((Char.code c) land 0b1111) (* 1110zzzz *)
|
||||||
|
| '\240' ->
|
||||||
|
(* overlong: if next byte is < than [0b000100000] then the char
|
||||||
|
would fit in 2 bytes *)
|
||||||
|
read_multi ~overlong:0b00010000 3 ((Char.code c) land 0b111) (* 11110uuu *)
|
||||||
| '\128' .. '\193' (* 192,193 are forbidden *)
|
| '\128' .. '\193' (* 192,193 are forbidden *)
|
||||||
| '\245' .. '\255' ->
|
| '\245' .. '\255' -> malformed st;
|
||||||
raise (Malformed (st.s,st.i))
|
|
||||||
)
|
)
|
||||||
|
|
||||||
let to_gen ?(idx=0) str : uchar gen =
|
let to_gen ?(idx=0) str : uchar gen =
|
||||||
|
|
@ -248,6 +260,9 @@ let uutf_to_seq s f =
|
||||||
(*$T
|
(*$T
|
||||||
not (is_valid "\192\181")
|
not (is_valid "\192\181")
|
||||||
not (is_valid "\193\143")
|
not (is_valid "\193\143")
|
||||||
|
not (is_valid "\224\151\167")
|
||||||
|
not (is_valid "\224\137\165")
|
||||||
|
is_valid "\240\151\189\163"
|
||||||
*)
|
*)
|
||||||
|
|
||||||
(*$QR & ~long_factor:40
|
(*$QR & ~long_factor:40
|
||||||
|
|
@ -272,7 +287,7 @@ let uutf_to_seq s f =
|
||||||
|
|
||||||
(* compare with uutf *)
|
(* compare with uutf *)
|
||||||
|
|
||||||
(*$QR & ~long_factor:40 ~count:10_000
|
(*$QR & ~long_factor:40 ~count:100_000
|
||||||
Q.string (fun s ->
|
Q.string (fun s ->
|
||||||
let v1 = is_valid s in
|
let v1 = is_valid s in
|
||||||
let v2 = uutf_is_valid s in
|
let v2 = uutf_is_valid s in
|
||||||
|
|
@ -281,7 +296,7 @@ let uutf_to_seq s f =
|
||||||
)
|
)
|
||||||
*)
|
*)
|
||||||
|
|
||||||
(*$QR & ~long_factor:40 ~count:10_000
|
(*$QR & ~long_factor:40 ~count:100_000
|
||||||
Q.string (fun s ->
|
Q.string (fun s ->
|
||||||
Q.assume (is_valid s && uutf_is_valid s);
|
Q.assume (is_valid s && uutf_is_valid s);
|
||||||
let pp s = Q.Print.(list pp_uchar) s in
|
let pp s = Q.Print.(list pp_uchar) s in
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue