diff --git a/src/core/CCUtf8_string.ml b/src/core/CCUtf8_string.ml index c3b82db0..5cac7bb8 100644 --- a/src/core/CCUtf8_string.ml +++ b/src/core/CCUtf8_string.ml @@ -37,15 +37,20 @@ exception Malformed of string * int @raise Malformed if an invalid substring is met *) let next_ (type a) (st : Dec.t) ~(yield:uchar -> a) ~(stop:unit -> a) () : a = let open Dec in + let malformed st = raise (Malformed (st.s,st.i)) in (* read a multi-byte character. - @param acc the accumulator (containing the first byte of the char) - @param n_bytes number of bytes to read (i.e. [width char - 1]) *) - let read_multi n_bytes acc = + @param acc the accumulator (containing the first byte of the char) + @param n_bytes number of bytes to read (i.e. [width char - 1]) + @param overlong minimal bound on second byte (to detect overlong encoding) + *) + let read_multi ?(overlong=0) n_bytes acc = (* inner loop j = 1..jmax *) let rec aux j acc = let c = Char.code st.s.[ st.i + j] in (* check that c is in 0b10xxxxxx *) - if c lsr 6 <> 0b10 then raise (Malformed (st.s,st.i)); + if c lsr 6 <> 0b10 then malformed st; + (* overlong encoding? *) + if j=1 && overlong<>0 && (c land 0b111111) < overlong then malformed st; (* except for first, each char gives 6 bits *) let next = (acc lsl 6) lor (c land 0b111111) in if j = n_bytes then ( @@ -54,7 +59,7 @@ let next_ (type a) (st : Dec.t) ~(yield:uchar -> a) ~(stop:unit -> a) () : a = st.i <- st.i + j + 1; (* +1 for first char *) yield (Uchar.unsafe_of_int next) ) else ( - raise (Malformed (st.s,st.i)) + malformed st; ) ) else ( aux (j+1) next @@ -66,7 +71,7 @@ let next_ (type a) (st : Dec.t) ~(yield:uchar -> a) ~(stop:unit -> a) () : a = aux 1 acc (* start with j=1, first char is already proccessed! *) ) else ( (* char is truncated *) - raise (Malformed (st.s,st.i)) + malformed st; ) in if st.i >= st.len then ( @@ -80,11 +85,18 @@ let next_ (type a) (st : Dec.t) ~(yield:uchar -> a) ~(stop:unit -> a) () : a = st.i <- 1 + st.i; yield (Uchar.of_int @@ Char.code c) (* 0xxxxxxx *) | '\194' .. '\223' -> read_multi 1 ((Char.code c) land 0b11111) (* 110yyyyy *) - | '\224' .. '\239' -> read_multi 2 ((Char.code c) land 0b1111) (* 1110zzzz *) - | '\240' .. '\244' -> read_multi 3 ((Char.code c) land 0b111) (* 11110uuu *) + | '\225' .. '\239' -> read_multi 2 ((Char.code c) land 0b1111) (* 1110zzzz *) + | '\241' .. '\244' -> read_multi 3 ((Char.code c) land 0b111) (* 11110uuu *) + | '\224' -> + (* overlong: if next byte is < than [0b001000000] then the char + would fit in 1 byte *) + read_multi ~overlong:0b00100000 2 ((Char.code c) land 0b1111) (* 1110zzzz *) + | '\240' -> + (* overlong: if next byte is < than [0b000100000] then the char + would fit in 2 bytes *) + read_multi ~overlong:0b00010000 3 ((Char.code c) land 0b111) (* 11110uuu *) | '\128' .. '\193' (* 192,193 are forbidden *) - | '\245' .. '\255' -> - raise (Malformed (st.s,st.i)) + | '\245' .. '\255' -> malformed st; ) let to_gen ?(idx=0) str : uchar gen = @@ -248,6 +260,9 @@ let uutf_to_seq s f = (*$T not (is_valid "\192\181") not (is_valid "\193\143") + not (is_valid "\224\151\167") + not (is_valid "\224\137\165") + is_valid "\240\151\189\163" *) (*$QR & ~long_factor:40 @@ -272,7 +287,7 @@ let uutf_to_seq s f = (* compare with uutf *) -(*$QR & ~long_factor:40 ~count:10_000 +(*$QR & ~long_factor:40 ~count:100_000 Q.string (fun s -> let v1 = is_valid s in let v2 = uutf_is_valid s in @@ -281,7 +296,7 @@ let uutf_to_seq s f = ) *) -(*$QR & ~long_factor:40 ~count:10_000 +(*$QR & ~long_factor:40 ~count:100_000 Q.string (fun s -> Q.assume (is_valid s && uutf_is_valid s); let pp s = Q.Print.(list pp_uchar) s in