From bf65d226670d42215cc622df7216a5a571297540 Mon Sep 17 00:00:00 2001 From: Alpha DIALLO Date: Thu, 22 Feb 2024 13:45:04 +0100 Subject: [PATCH 1/4] fix: handle utf8 characters in the dune files(#9728) Signed-off-by: Alpha DIALLO Signed-off-by: Etienne Millon --- src/dune_sexp/escape.ml | 122 +++++++++++++++--- src/dune_sexp/escape.mli | 5 + src/dune_sexp/lexer.mll | 36 ++++-- .../formatting/non-ascii-characters.t | 25 +++- 4 files changed, 155 insertions(+), 33 deletions(-) diff --git a/src/dune_sexp/escape.ml b/src/dune_sexp/escape.ml index 8eb26e4c71c..2b00288ec73 100644 --- a/src/dune_sexp/escape.ml +++ b/src/dune_sexp/escape.ml @@ -1,17 +1,82 @@ open! Stdune +module Utf8 = struct + (* + The first byte of an utf8 character gives the size in bytes of the utf8: + + 0xxxxxxxx -> 1 + 110xxxxxx -> 2 + 1110xxxxx -> 3 + 11110xxxx -> 4 + *) + let utf8_byte_length u = + match Char.code u with + | u when u < 128 -> 1 + | u when u < 194 -> 0 + | u when u < 224 -> 2 + | u when u < 240 -> 3 + | u when u < 245 -> 4 + | _ -> 0 + ;; + + let unsafe_get s i = String.unsafe_get s i |> Char.code + let next_utf8_length s i = utf8_byte_length (String.unsafe_get s i) + + let is_utf8_valid s i l = + assert (String.length s >= l); + match l with + | 1 -> true + | 2 -> + let b1 = unsafe_get s (i + 1) in + if b1 lsr 6 != 0b10 then false else true + | 3 -> + let b0 = unsafe_get s i in + let b1 = unsafe_get s (i + 1) in + let b2 = unsafe_get s (i + 2) in + if b2 lsr 6 != 0b10 + then false + else ( + match b0 with + | 0xE0 -> if b1 < 0xA0 || 0xBF < b1 then false else true + | 0xED -> if b1 < 0x80 || 0x9F < b1 then false else true + | _ -> if b1 lsr 6 != 0b10 then false else true) + | 4 -> + let b0 = unsafe_get s i in + let b1 = unsafe_get s (i + 1) in + let b2 = unsafe_get s (i + 2) in + let b3 = unsafe_get s (i + 3) in + if b3 lsr 6 != 0b10 || b2 lsr 6 != 0b10 + then false + else ( + match b0 with + | 0xF0 -> if b1 < 0x90 || 0xBF < b1 then false else true + | 0xF4 -> if b1 < 0x80 || 0x8F < b1 then false else true + | _ -> if b1 lsr 6 != 0b10 then false else true) + | _ -> false + ;; +end + let quote_length s = let n = ref 0 in let len = String.length s in - for i = 0 to len - 1 do - n - := !n - + - match String.unsafe_get s i with - | '\"' | '\\' | '\n' | '\t' | '\r' | '\b' -> 2 - | '%' -> if i + 1 < len && s.[i + 1] = '{' then 2 else 1 - | ' ' .. '~' -> 1 - | _ -> 4 + let i = ref 0 in + while !i < len do + (n + := !n + + + match String.unsafe_get s !i with + | '\"' | '\\' | '\n' | '\t' | '\r' | '\b' -> 2 + | '%' -> if !i + 1 < len && s.[!i + 1] = '{' then 2 else 1 + | ' ' .. '~' -> 1 + | _ -> + let uchar_len = Utf8.next_utf8_length s !i in + (match Utf8.is_utf8_valid s !i uchar_len with + | true -> + assert (uchar_len > 1 && uchar_len < 5); + i := !i + uchar_len - 1; + uchar_len + | false -> 4)); + incr i done; !n ;; @@ -19,8 +84,9 @@ let quote_length s = let escape_to s ~dst:s' ~ofs = let n = ref ofs in let len = String.length s in - for i = 0 to len - 1 do - (match String.unsafe_get s i with + let i = ref 0 in + while !i < len do + (match String.unsafe_get s !i with | ('\"' | '\\') as c -> Bytes.unsafe_set s' !n '\\'; incr n; @@ -41,21 +107,35 @@ let escape_to s ~dst:s' ~ofs = Bytes.unsafe_set s' !n '\\'; incr n; Bytes.unsafe_set s' !n 'b' - | '%' when i + 1 < len && s.[i + 1] = '{' -> + | '%' when !i + 1 < len && s.[!i + 1] = '{' -> Bytes.unsafe_set s' !n '\\'; incr n; Bytes.unsafe_set s' !n '%' | ' ' .. '~' as c -> Bytes.unsafe_set s' !n c | c -> - let a = Char.code c in - Bytes.unsafe_set s' !n '\\'; - incr n; - Bytes.unsafe_set s' !n (Char.unsafe_chr (48 + (a / 100))); - incr n; - Bytes.unsafe_set s' !n (Char.unsafe_chr (48 + (a / 10 mod 10))); - incr n; - Bytes.unsafe_set s' !n (Char.unsafe_chr (48 + (a mod 10)))); - incr n + let uchar_len = Utf8.next_utf8_length s !i in + (match Utf8.is_utf8_valid s !i uchar_len with + | true -> + assert (uchar_len > 1 && uchar_len < 5); + Bytes.unsafe_set s' !n (String.unsafe_get s !i); + Bytes.unsafe_set s' (!n + 1) (String.unsafe_get s (!i + 1)); + if uchar_len > 2 + then Bytes.unsafe_set s' (!n + 2) (String.unsafe_get s (!i + 2)); + if uchar_len > 3 + then Bytes.unsafe_set s' (!n + 3) (String.unsafe_get s (!i + 3)); + n := !n + uchar_len - 1; + i := !i + uchar_len - 1 + | false -> + let a = Char.code c in + Bytes.unsafe_set s' !n '\\'; + incr n; + Bytes.unsafe_set s' !n (Char.unsafe_chr (48 + (a / 100))); + incr n; + Bytes.unsafe_set s' !n (Char.unsafe_chr (48 + (a / 10 mod 10))); + incr n; + Bytes.unsafe_set s' !n (Char.unsafe_chr (48 + (a mod 10))))); + incr n; + incr i done ;; diff --git a/src/dune_sexp/escape.mli b/src/dune_sexp/escape.mli index f4389a59735..1fe64470013 100644 --- a/src/dune_sexp/escape.mli +++ b/src/dune_sexp/escape.mli @@ -1,2 +1,7 @@ +module Utf8 : sig + val next_utf8_length : string -> int -> int + val is_utf8_valid : string -> int -> int -> bool +end + val escaped : string -> string val quoted : string -> string diff --git a/src/dune_sexp/lexer.mll b/src/dune_sexp/lexer.mll index 46aec74b1d5..b4005a396b6 100644 --- a/src/dune_sexp/lexer.mll +++ b/src/dune_sexp/lexer.mll @@ -16,11 +16,12 @@ end type t = with_comments:bool -> Lexing.lexbuf -> Token.t -let error ?(delta = 0) lexbuf message = +let error ?(delta = 0) ?(delta_stop = 0) lexbuf message = let start = Lexing.lexeme_start_p lexbuf in + let stop = Lexing.lexeme_end_p lexbuf in let loc = Loc.create ~start:{ start with pos_cnum = start.pos_cnum + delta } - ~stop:(Lexing.lexeme_end_p lexbuf) + ~stop:{ stop with pos_cnum = stop.pos_cnum + delta_stop } in User_error.raise ~loc [ Pp.text message ] @@ -144,6 +145,8 @@ let hexdigit = ['0'-'9' 'a'-'f' 'A'-'F'] let atom_char = [^ ';' '(' ')' '"' '\000'-'\032' '\127'-'\255'] let varname_char = atom_char # [ ':' '%' '{' '}' ] +let non_ascii = ['\128'-'\255'] + rule token with_comments = parse | newline { Lexing.new_line lexbuf; token with_comments lexbuf } @@ -352,13 +355,30 @@ and template_variable = parse } | '}' | eof { error lexbuf "%{...} forms cannot be empty" } - | (varname_char* as skip) (_ as other) - | (varname_char+ ':' ((':' | varname_char)*) as skip) (_ as other) + | (varname_char* as skip) (non_ascii* as maybe_utf) (_ as other) + | (varname_char+ ':' ((':' | varname_char)*) as skip) (non_ascii* as maybe_utf) (_ as other) { - error - ~delta:(String.length skip) - lexbuf - (Printf.sprintf "The character %C is not allowed inside %%{...} forms" other) + let utf_len = String.length maybe_utf in + let uchar = + if utf_len > 1 then + let uchar_len = Escape.Utf8.next_utf8_length maybe_utf 0 in + if uchar_len <= utf_len && Escape.Utf8.is_utf8_valid maybe_utf 0 uchar_len then + Some (String.sub maybe_utf ~pos:0 ~len:uchar_len, uchar_len) + else None + else None + in + match uchar with + | Some (uchar, len) -> + error + ~delta:(String.length skip) + ~delta_stop:(-len) + lexbuf + (Printf.sprintf "The character %s is not allowed inside %%{...} forms" uchar) + | _ -> + error + ~delta:(String.length skip) + lexbuf + (Printf.sprintf "The character %C is not allowed inside %%{...} forms" other) } { diff --git a/test/blackbox-tests/test-cases/formatting/non-ascii-characters.t b/test/blackbox-tests/test-cases/formatting/non-ascii-characters.t index c11fd94e414..dcf44519b4e 100644 --- a/test/blackbox-tests/test-cases/formatting/non-ascii-characters.t +++ b/test/blackbox-tests/test-cases/formatting/non-ascii-characters.t @@ -1,13 +1,30 @@ -How the non-ASCII characters are handled, this is also related to the issue #9728 +Utf8 characters are handled for now, this is also related to the issue #9728 $ dune format-dune-file < ("É") + > ("Éff ĎúÑȨ") > EOF - ("\195\137") + ("Éff ĎúÑȨ") $ dune format-dune-file < (run foo %{bin:é}) > EOF File "", line 1, characters 15-16: - Error: The character '\195' is not allowed inside %{...} forms + Error: The character é is not allowed inside %{...} forms + [1] + + $ dune format-dune-file < (echo "hÉllo") + > EOF + (echo "hÉllo") + + $ dune format-dune-file < (echo "É") + > EOF + (echo "É") + + $ dune format-dune-file < (Écho "hello") + > EOF + File "", line 1, characters 1-1: + Error: Invalid . file [1] From 8f9c9c81046503316e811e9219948fd5eaabdd3a Mon Sep 17 00:00:00 2001 From: Alpha DIALLO Date: Fri, 15 Mar 2024 18:11:12 +0100 Subject: [PATCH 2/4] Add the changes file and revert the modification of the lexer Signed-off-by: Alpha DIALLO Signed-off-by: Etienne Millon --- doc/changes/10113.md | 2 ++ src/dune_sexp/lexer.mll | 36 +++++-------------- .../formatting/non-ascii-characters.t | 2 +- 3 files changed, 11 insertions(+), 29 deletions(-) create mode 100644 doc/changes/10113.md diff --git a/doc/changes/10113.md b/doc/changes/10113.md new file mode 100644 index 00000000000..978357860da --- /dev/null +++ b/doc/changes/10113.md @@ -0,0 +1,2 @@ +- fix: handle utf8 characters in the dune files. (#10113, fixes #9728, + @moyodiallo) diff --git a/src/dune_sexp/lexer.mll b/src/dune_sexp/lexer.mll index b4005a396b6..46aec74b1d5 100644 --- a/src/dune_sexp/lexer.mll +++ b/src/dune_sexp/lexer.mll @@ -16,12 +16,11 @@ end type t = with_comments:bool -> Lexing.lexbuf -> Token.t -let error ?(delta = 0) ?(delta_stop = 0) lexbuf message = +let error ?(delta = 0) lexbuf message = let start = Lexing.lexeme_start_p lexbuf in - let stop = Lexing.lexeme_end_p lexbuf in let loc = Loc.create ~start:{ start with pos_cnum = start.pos_cnum + delta } - ~stop:{ stop with pos_cnum = stop.pos_cnum + delta_stop } + ~stop:(Lexing.lexeme_end_p lexbuf) in User_error.raise ~loc [ Pp.text message ] @@ -145,8 +144,6 @@ let hexdigit = ['0'-'9' 'a'-'f' 'A'-'F'] let atom_char = [^ ';' '(' ')' '"' '\000'-'\032' '\127'-'\255'] let varname_char = atom_char # [ ':' '%' '{' '}' ] -let non_ascii = ['\128'-'\255'] - rule token with_comments = parse | newline { Lexing.new_line lexbuf; token with_comments lexbuf } @@ -355,30 +352,13 @@ and template_variable = parse } | '}' | eof { error lexbuf "%{...} forms cannot be empty" } - | (varname_char* as skip) (non_ascii* as maybe_utf) (_ as other) - | (varname_char+ ':' ((':' | varname_char)*) as skip) (non_ascii* as maybe_utf) (_ as other) + | (varname_char* as skip) (_ as other) + | (varname_char+ ':' ((':' | varname_char)*) as skip) (_ as other) { - let utf_len = String.length maybe_utf in - let uchar = - if utf_len > 1 then - let uchar_len = Escape.Utf8.next_utf8_length maybe_utf 0 in - if uchar_len <= utf_len && Escape.Utf8.is_utf8_valid maybe_utf 0 uchar_len then - Some (String.sub maybe_utf ~pos:0 ~len:uchar_len, uchar_len) - else None - else None - in - match uchar with - | Some (uchar, len) -> - error - ~delta:(String.length skip) - ~delta_stop:(-len) - lexbuf - (Printf.sprintf "The character %s is not allowed inside %%{...} forms" uchar) - | _ -> - error - ~delta:(String.length skip) - lexbuf - (Printf.sprintf "The character %C is not allowed inside %%{...} forms" other) + error + ~delta:(String.length skip) + lexbuf + (Printf.sprintf "The character %C is not allowed inside %%{...} forms" other) } { diff --git a/test/blackbox-tests/test-cases/formatting/non-ascii-characters.t b/test/blackbox-tests/test-cases/formatting/non-ascii-characters.t index dcf44519b4e..923931b913a 100644 --- a/test/blackbox-tests/test-cases/formatting/non-ascii-characters.t +++ b/test/blackbox-tests/test-cases/formatting/non-ascii-characters.t @@ -9,7 +9,7 @@ Utf8 characters are handled for now, this is also related to the issue #9728 > (run foo %{bin:é}) > EOF File "", line 1, characters 15-16: - Error: The character é is not allowed inside %{...} forms + Error: The character '\195' is not allowed inside %{...} forms [1] $ dune format-dune-file < Date: Tue, 19 Mar 2024 11:33:42 +0100 Subject: [PATCH 3/4] Use uutf Also leaving a note about using the stdlib later. Signed-off-by: Etienne Millon --- boot/libs.ml | 2 +- src/dune_sexp/dune | 2 +- src/dune_sexp/escape.ml | 96 +++++++------------ src/dune_sexp/escape.mli | 5 - .../formatting/non-ascii-characters.t | 5 + 5 files changed, 40 insertions(+), 70 deletions(-) diff --git a/boot/libs.ml b/boot/libs.ml index 6404a823ab1..8d5905b1f66 100644 --- a/boot/libs.ml +++ b/boot/libs.ml @@ -14,6 +14,7 @@ let local_libraries = ; ("vendor/fiber/src", Some "Fiber", false, None) ; ("src/dune_console", Some "Dune_console", false, None) ; ("src/memo", Some "Memo", false, None) + ; ("vendor/uutf", None, false, None) ; ("src/dune_sexp", Some "Dune_sexp", false, None) ; ("src/ocaml-config", Some "Ocaml_config", false, None) ; ("src/ocaml", Some "Ocaml", false, None) @@ -23,7 +24,6 @@ let local_libraries = ; ("otherlibs/dune-rpc/private", Some "Dune_rpc_private", false, None) ; ("src/dune_config", Some "Dune_config", false, None) ; ("vendor/sha", None, false, None) - ; ("vendor/uutf", None, false, None) ; ("vendor/opam/src/core", None, false, None) ; ("vendor/opam-file-format", None, false, None) ; ("vendor/opam/src/format", None, false, None) diff --git a/src/dune_sexp/dune b/src/dune_sexp/dune index 4b661198c6e..d8508fcd1ab 100644 --- a/src/dune_sexp/dune +++ b/src/dune_sexp/dune @@ -1,7 +1,7 @@ (library (name dune_sexp) (synopsis "[Internal] S-expression library") - (libraries stdune) + (libraries stdune dune_uutf) (instrumentation (backend bisect_ppx))) diff --git a/src/dune_sexp/escape.ml b/src/dune_sexp/escape.ml index 2b00288ec73..f262fb29ed2 100644 --- a/src/dune_sexp/escape.ml +++ b/src/dune_sexp/escape.ml @@ -1,60 +1,34 @@ open! Stdune -module Utf8 = struct - (* - The first byte of an utf8 character gives the size in bytes of the utf8: +(** Note: on OCaml >= 4.14, this can be switched to the following (and the + dependency to [Uutf] can be removed) - 0xxxxxxxx -> 1 - 110xxxxxx -> 2 - 1110xxxxx -> 3 - 11110xxxx -> 4 - *) - let utf8_byte_length u = - match Char.code u with - | u when u < 128 -> 1 - | u when u < 194 -> 0 - | u when u < 224 -> 2 - | u when u < 240 -> 3 - | u when u < 245 -> 4 - | _ -> 0 - ;; - - let unsafe_get s i = String.unsafe_get s i |> Char.code - let next_utf8_length s i = utf8_byte_length (String.unsafe_get s i) - - let is_utf8_valid s i l = - assert (String.length s >= l); - match l with - | 1 -> true - | 2 -> - let b1 = unsafe_get s (i + 1) in - if b1 lsr 6 != 0b10 then false else true - | 3 -> - let b0 = unsafe_get s i in - let b1 = unsafe_get s (i + 1) in - let b2 = unsafe_get s (i + 2) in - if b2 lsr 6 != 0b10 - then false + {[ + let next_valid_utf8_length s i = + let decode = String.get_utf_8_uchar s i in + Option.some_if (Uchar.utf_decode_is_valid decode) (Uchar.utf_decode_length decode) + ;; + ]} *) +let next_valid_utf8_uchar_len s i = + let pos = ref i in + let buf = Bytes.create 1 in + let decoder = Uutf.decoder ~encoding:`UTF_8 `Manual in + let rec go () = + match Uutf.decode decoder with + | `Await -> + if !pos >= String.length s + then None else ( - match b0 with - | 0xE0 -> if b1 < 0xA0 || 0xBF < b1 then false else true - | 0xED -> if b1 < 0x80 || 0x9F < b1 then false else true - | _ -> if b1 lsr 6 != 0b10 then false else true) - | 4 -> - let b0 = unsafe_get s i in - let b1 = unsafe_get s (i + 1) in - let b2 = unsafe_get s (i + 2) in - let b3 = unsafe_get s (i + 3) in - if b3 lsr 6 != 0b10 || b2 lsr 6 != 0b10 - then false - else ( - match b0 with - | 0xF0 -> if b1 < 0x90 || 0xBF < b1 then false else true - | 0xF4 -> if b1 < 0x80 || 0x8F < b1 then false else true - | _ -> if b1 lsr 6 != 0b10 then false else true) - | _ -> false - ;; -end + Bytes.set buf 0 (String.get s !pos); + incr pos; + Uutf.Manual.src decoder buf 0 1; + go ()) + | `Uchar _ -> Some (!pos - i) + | `Malformed _ -> None + | `End -> Code_error.raise "next_valid_utf8_uchar: `End" [] + in + go () +;; let quote_length s = let n = ref 0 in @@ -69,13 +43,11 @@ let quote_length s = | '%' -> if !i + 1 < len && s.[!i + 1] = '{' then 2 else 1 | ' ' .. '~' -> 1 | _ -> - let uchar_len = Utf8.next_utf8_length s !i in - (match Utf8.is_utf8_valid s !i uchar_len with - | true -> - assert (uchar_len > 1 && uchar_len < 5); + (match next_valid_utf8_uchar_len s !i with + | Some uchar_len -> i := !i + uchar_len - 1; uchar_len - | false -> 4)); + | None -> 4)); incr i done; !n @@ -113,10 +85,8 @@ let escape_to s ~dst:s' ~ofs = Bytes.unsafe_set s' !n '%' | ' ' .. '~' as c -> Bytes.unsafe_set s' !n c | c -> - let uchar_len = Utf8.next_utf8_length s !i in - (match Utf8.is_utf8_valid s !i uchar_len with - | true -> - assert (uchar_len > 1 && uchar_len < 5); + (match next_valid_utf8_uchar_len s !i with + | Some uchar_len -> Bytes.unsafe_set s' !n (String.unsafe_get s !i); Bytes.unsafe_set s' (!n + 1) (String.unsafe_get s (!i + 1)); if uchar_len > 2 @@ -125,7 +95,7 @@ let escape_to s ~dst:s' ~ofs = then Bytes.unsafe_set s' (!n + 3) (String.unsafe_get s (!i + 3)); n := !n + uchar_len - 1; i := !i + uchar_len - 1 - | false -> + | None -> let a = Char.code c in Bytes.unsafe_set s' !n '\\'; incr n; diff --git a/src/dune_sexp/escape.mli b/src/dune_sexp/escape.mli index 1fe64470013..f4389a59735 100644 --- a/src/dune_sexp/escape.mli +++ b/src/dune_sexp/escape.mli @@ -1,7 +1,2 @@ -module Utf8 : sig - val next_utf8_length : string -> int -> int - val is_utf8_valid : string -> int -> int -> bool -end - val escaped : string -> string val quoted : string -> string diff --git a/test/blackbox-tests/test-cases/formatting/non-ascii-characters.t b/test/blackbox-tests/test-cases/formatting/non-ascii-characters.t index 923931b913a..7b444ba7d75 100644 --- a/test/blackbox-tests/test-cases/formatting/non-ascii-characters.t +++ b/test/blackbox-tests/test-cases/formatting/non-ascii-characters.t @@ -28,3 +28,8 @@ Utf8 characters are handled for now, this is also related to the issue #9728 File "", line 1, characters 1-1: Error: Invalid . file [1] + + $ bash -c "printf '(echo \"%b\")' '\xc0'"| dune format-dune-file + (echo "\192") + $ bash -c "printf '(echo \"%b\")' '\xf0'"| dune format-dune-file + (echo "\240") From f413a42e946d12b4624f5ff0f4dbfac33d0839de Mon Sep 17 00:00:00 2001 From: Etienne Millon Date: Tue, 19 Mar 2024 13:10:15 +0100 Subject: [PATCH 4/4] changelog Signed-off-by: Etienne Millon --- doc/changes/10113.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/changes/10113.md b/doc/changes/10113.md index 978357860da..dff4f391fd0 100644 --- a/doc/changes/10113.md +++ b/doc/changes/10113.md @@ -1,2 +1,2 @@ -- fix: handle utf8 characters in the dune files. (#10113, fixes #9728, - @moyodiallo) +- dune file formatting: output utf8 if input is correctly encoded (#10113, + fixes #9728, @moyodiallo)