From bf65d226670d42215cc622df7216a5a571297540 Mon Sep 17 00:00:00 2001
From: Alpha DIALLO <moyodiallo@gmail.com>
Date: Thu, 22 Feb 2024 13:45:04 +0100
Subject: [PATCH 1/4] fix: handle utf8 characters in the dune files(#9728)

Signed-off-by: Alpha DIALLO <moyodiallo@gmail.com>
Signed-off-by: Etienne Millon <me@emillon.org>
---
 src/dune_sexp/escape.ml                       | 122 +++++++++++++++---
 src/dune_sexp/escape.mli                      |   5 +
 src/dune_sexp/lexer.mll                       |  36 ++++--
 .../formatting/non-ascii-characters.t         |  25 +++-
 4 files changed, 155 insertions(+), 33 deletions(-)

diff --git a/src/dune_sexp/escape.ml b/src/dune_sexp/escape.ml
index 8eb26e4c71c..2b00288ec73 100644
--- a/src/dune_sexp/escape.ml
+++ b/src/dune_sexp/escape.ml
@@ -1,17 +1,82 @@
 open! Stdune
 
+module Utf8 = struct
+  (*
+     The first byte of an utf8 character gives the size in bytes of the utf8:
+
+     0xxxxxxxx -> 1
+     110xxxxxx -> 2
+     1110xxxxx -> 3
+     11110xxxx -> 4
+  *)
+  let utf8_byte_length u =
+    match Char.code u with
+    | u when u < 128 -> 1
+    | u when u < 194 -> 0
+    | u when u < 224 -> 2
+    | u when u < 240 -> 3
+    | u when u < 245 -> 4
+    | _ -> 0
+  ;;
+
+  let unsafe_get s i = String.unsafe_get s i |> Char.code
+  let next_utf8_length s i = utf8_byte_length (String.unsafe_get s i)
+
+  let is_utf8_valid s i l =
+    assert (String.length s >= l);
+    match l with
+    | 1 -> true
+    | 2 ->
+      let b1 = unsafe_get s (i + 1) in
+      if b1 lsr 6 != 0b10 then false else true
+    | 3 ->
+      let b0 = unsafe_get s i in
+      let b1 = unsafe_get s (i + 1) in
+      let b2 = unsafe_get s (i + 2) in
+      if b2 lsr 6 != 0b10
+      then false
+      else (
+        match b0 with
+        | 0xE0 -> if b1 < 0xA0 || 0xBF < b1 then false else true
+        | 0xED -> if b1 < 0x80 || 0x9F < b1 then false else true
+        | _ -> if b1 lsr 6 != 0b10 then false else true)
+    | 4 ->
+      let b0 = unsafe_get s i in
+      let b1 = unsafe_get s (i + 1) in
+      let b2 = unsafe_get s (i + 2) in
+      let b3 = unsafe_get s (i + 3) in
+      if b3 lsr 6 != 0b10 || b2 lsr 6 != 0b10
+      then false
+      else (
+        match b0 with
+        | 0xF0 -> if b1 < 0x90 || 0xBF < b1 then false else true
+        | 0xF4 -> if b1 < 0x80 || 0x8F < b1 then false else true
+        | _ -> if b1 lsr 6 != 0b10 then false else true)
+    | _ -> false
+  ;;
+end
+
 let quote_length s =
   let n = ref 0 in
   let len = String.length s in
-  for i = 0 to len - 1 do
-    n
-    := !n
-       +
-       match String.unsafe_get s i with
-       | '\"' | '\\' | '\n' | '\t' | '\r' | '\b' -> 2
-       | '%' -> if i + 1 < len && s.[i + 1] = '{' then 2 else 1
-       | ' ' .. '~' -> 1
-       | _ -> 4
+  let i = ref 0 in
+  while !i < len do
+    (n
+     := !n
+        +
+        match String.unsafe_get s !i with
+        | '\"' | '\\' | '\n' | '\t' | '\r' | '\b' -> 2
+        | '%' -> if !i + 1 < len && s.[!i + 1] = '{' then 2 else 1
+        | ' ' .. '~' -> 1
+        | _ ->
+          let uchar_len = Utf8.next_utf8_length s !i in
+          (match Utf8.is_utf8_valid s !i uchar_len with
+           | true ->
+             assert (uchar_len > 1 && uchar_len < 5);
+             i := !i + uchar_len - 1;
+             uchar_len
+           | false -> 4));
+    incr i
   done;
   !n
 ;;
@@ -19,8 +84,9 @@ let quote_length s =
 let escape_to s ~dst:s' ~ofs =
   let n = ref ofs in
   let len = String.length s in
-  for i = 0 to len - 1 do
-    (match String.unsafe_get s i with
+  let i = ref 0 in
+  while !i < len do
+    (match String.unsafe_get s !i with
      | ('\"' | '\\') as c ->
        Bytes.unsafe_set s' !n '\\';
        incr n;
@@ -41,21 +107,35 @@ let escape_to s ~dst:s' ~ofs =
        Bytes.unsafe_set s' !n '\\';
        incr n;
        Bytes.unsafe_set s' !n 'b'
-     | '%' when i + 1 < len && s.[i + 1] = '{' ->
+     | '%' when !i + 1 < len && s.[!i + 1] = '{' ->
        Bytes.unsafe_set s' !n '\\';
        incr n;
        Bytes.unsafe_set s' !n '%'
      | ' ' .. '~' as c -> Bytes.unsafe_set s' !n c
      | c ->
-       let a = Char.code c in
-       Bytes.unsafe_set s' !n '\\';
-       incr n;
-       Bytes.unsafe_set s' !n (Char.unsafe_chr (48 + (a / 100)));
-       incr n;
-       Bytes.unsafe_set s' !n (Char.unsafe_chr (48 + (a / 10 mod 10)));
-       incr n;
-       Bytes.unsafe_set s' !n (Char.unsafe_chr (48 + (a mod 10))));
-    incr n
+       let uchar_len = Utf8.next_utf8_length s !i in
+       (match Utf8.is_utf8_valid s !i uchar_len with
+        | true ->
+          assert (uchar_len > 1 && uchar_len < 5);
+          Bytes.unsafe_set s' !n (String.unsafe_get s !i);
+          Bytes.unsafe_set s' (!n + 1) (String.unsafe_get s (!i + 1));
+          if uchar_len > 2
+          then Bytes.unsafe_set s' (!n + 2) (String.unsafe_get s (!i + 2));
+          if uchar_len > 3
+          then Bytes.unsafe_set s' (!n + 3) (String.unsafe_get s (!i + 3));
+          n := !n + uchar_len - 1;
+          i := !i + uchar_len - 1
+        | false ->
+          let a = Char.code c in
+          Bytes.unsafe_set s' !n '\\';
+          incr n;
+          Bytes.unsafe_set s' !n (Char.unsafe_chr (48 + (a / 100)));
+          incr n;
+          Bytes.unsafe_set s' !n (Char.unsafe_chr (48 + (a / 10 mod 10)));
+          incr n;
+          Bytes.unsafe_set s' !n (Char.unsafe_chr (48 + (a mod 10)))));
+    incr n;
+    incr i
   done
 ;;
 
diff --git a/src/dune_sexp/escape.mli b/src/dune_sexp/escape.mli
index f4389a59735..1fe64470013 100644
--- a/src/dune_sexp/escape.mli
+++ b/src/dune_sexp/escape.mli
@@ -1,2 +1,7 @@
+module Utf8 : sig
+  val next_utf8_length : string -> int -> int
+  val is_utf8_valid : string -> int -> int -> bool
+end
+
 val escaped : string -> string
 val quoted : string -> string
diff --git a/src/dune_sexp/lexer.mll b/src/dune_sexp/lexer.mll
index 46aec74b1d5..b4005a396b6 100644
--- a/src/dune_sexp/lexer.mll
+++ b/src/dune_sexp/lexer.mll
@@ -16,11 +16,12 @@ end
 
 type t = with_comments:bool -> Lexing.lexbuf -> Token.t
 
-let error ?(delta = 0) lexbuf message =
+let error ?(delta = 0) ?(delta_stop = 0) lexbuf message =
   let start = Lexing.lexeme_start_p lexbuf in
+  let stop = Lexing.lexeme_end_p lexbuf in
   let loc =
     Loc.create ~start:{ start with pos_cnum = start.pos_cnum + delta }
-      ~stop:(Lexing.lexeme_end_p lexbuf)
+      ~stop:{ stop with pos_cnum = stop.pos_cnum + delta_stop }
   in
   User_error.raise ~loc [ Pp.text message ]
 
@@ -144,6 +145,8 @@ let hexdigit  = ['0'-'9' 'a'-'f' 'A'-'F']
 let atom_char = [^ ';' '(' ')' '"' '\000'-'\032' '\127'-'\255']
 let varname_char = atom_char # [ ':' '%' '{' '}' ]
 
+let non_ascii = ['\128'-'\255']
+
 rule token with_comments = parse
   | newline
     { Lexing.new_line lexbuf; token with_comments lexbuf }
@@ -352,13 +355,30 @@ and template_variable = parse
   }
   | '}' | eof
     { error lexbuf "%{...} forms cannot be empty" }
-  | (varname_char* as skip) (_ as other)
-  | (varname_char+ ':' ((':' | varname_char)*) as skip) (_ as other)
+  | (varname_char* as skip) (non_ascii* as maybe_utf) (_ as other)
+  | (varname_char+ ':' ((':' | varname_char)*) as skip) (non_ascii* as maybe_utf) (_ as other)
   {
-    error
-      ~delta:(String.length skip)
-      lexbuf
-      (Printf.sprintf "The character %C is not allowed inside %%{...} forms" other)
+    let utf_len = String.length maybe_utf in
+    let uchar =
+      if utf_len > 1 then
+        let uchar_len = Escape.Utf8.next_utf8_length maybe_utf 0 in
+            if uchar_len <= utf_len && Escape.Utf8.is_utf8_valid maybe_utf 0 uchar_len then
+              Some (String.sub maybe_utf ~pos:0 ~len:uchar_len, uchar_len)
+            else None
+      else None
+    in
+    match uchar with
+    | Some (uchar, len) ->
+      error
+        ~delta:(String.length skip)
+        ~delta_stop:(-len)
+        lexbuf
+        (Printf.sprintf "The character %s is not allowed inside %%{...} forms" uchar)
+    | _ ->
+      error
+        ~delta:(String.length skip)
+        lexbuf
+        (Printf.sprintf "The character %C is not allowed inside %%{...} forms" other)
   }
 
 {
diff --git a/test/blackbox-tests/test-cases/formatting/non-ascii-characters.t b/test/blackbox-tests/test-cases/formatting/non-ascii-characters.t
index c11fd94e414..dcf44519b4e 100644
--- a/test/blackbox-tests/test-cases/formatting/non-ascii-characters.t
+++ b/test/blackbox-tests/test-cases/formatting/non-ascii-characters.t
@@ -1,13 +1,30 @@
-How the non-ASCII characters are handled, this is also related to the issue #9728
+Utf8 characters are handled for now, this is also related to the issue #9728
 
   $ dune format-dune-file <<EOF
-  > ("É")
+  > ("Éﬀ ĎúÑȨ")
   > EOF
-  ("\195\137")
+  ("Éﬀ ĎúÑȨ")
 
   $ dune format-dune-file <<EOF
   > (run foo %{bin:é})
   > EOF
   File "", line 1, characters 15-16:
-  Error: The character '\195' is not allowed inside %{...} forms
+  Error: The character é is not allowed inside %{...} forms
+  [1]
+
+  $ dune format-dune-file <<EOF
+  > (echo "hÉllo")
+  > EOF
+  (echo "hÉllo")
+
+  $ dune format-dune-file <<EOF
+  > (echo "É")
+  > EOF
+  (echo "É")
+
+  $ dune format-dune-file <<EOF
+  > (Écho "hello")
+  > EOF
+  File "", line 1, characters 1-1:
+  Error: Invalid . file
   [1]

From 8f9c9c81046503316e811e9219948fd5eaabdd3a Mon Sep 17 00:00:00 2001
From: Alpha DIALLO <moyodiallo@gmail.com>
Date: Fri, 15 Mar 2024 18:11:12 +0100
Subject: [PATCH 2/4] Add the changes file and revert the modification of the
 lexer

Signed-off-by: Alpha DIALLO <moyodiallo@gmail.com>
Signed-off-by: Etienne Millon <me@emillon.org>
---
 doc/changes/10113.md                          |  2 ++
 src/dune_sexp/lexer.mll                       | 36 +++++--------------
 .../formatting/non-ascii-characters.t         |  2 +-
 3 files changed, 11 insertions(+), 29 deletions(-)
 create mode 100644 doc/changes/10113.md

diff --git a/doc/changes/10113.md b/doc/changes/10113.md
new file mode 100644
index 00000000000..978357860da
--- /dev/null
+++ b/doc/changes/10113.md
@@ -0,0 +1,2 @@
+- fix: handle utf8 characters in the dune files. (#10113, fixes #9728,
+  @moyodiallo)
diff --git a/src/dune_sexp/lexer.mll b/src/dune_sexp/lexer.mll
index b4005a396b6..46aec74b1d5 100644
--- a/src/dune_sexp/lexer.mll
+++ b/src/dune_sexp/lexer.mll
@@ -16,12 +16,11 @@ end
 
 type t = with_comments:bool -> Lexing.lexbuf -> Token.t
 
-let error ?(delta = 0) ?(delta_stop = 0) lexbuf message =
+let error ?(delta = 0) lexbuf message =
   let start = Lexing.lexeme_start_p lexbuf in
-  let stop = Lexing.lexeme_end_p lexbuf in
   let loc =
     Loc.create ~start:{ start with pos_cnum = start.pos_cnum + delta }
-      ~stop:{ stop with pos_cnum = stop.pos_cnum + delta_stop }
+      ~stop:(Lexing.lexeme_end_p lexbuf)
   in
   User_error.raise ~loc [ Pp.text message ]
 
@@ -145,8 +144,6 @@ let hexdigit  = ['0'-'9' 'a'-'f' 'A'-'F']
 let atom_char = [^ ';' '(' ')' '"' '\000'-'\032' '\127'-'\255']
 let varname_char = atom_char # [ ':' '%' '{' '}' ]
 
-let non_ascii = ['\128'-'\255']
-
 rule token with_comments = parse
   | newline
     { Lexing.new_line lexbuf; token with_comments lexbuf }
@@ -355,30 +352,13 @@ and template_variable = parse
   }
   | '}' | eof
     { error lexbuf "%{...} forms cannot be empty" }
-  | (varname_char* as skip) (non_ascii* as maybe_utf) (_ as other)
-  | (varname_char+ ':' ((':' | varname_char)*) as skip) (non_ascii* as maybe_utf) (_ as other)
+  | (varname_char* as skip) (_ as other)
+  | (varname_char+ ':' ((':' | varname_char)*) as skip) (_ as other)
   {
-    let utf_len = String.length maybe_utf in
-    let uchar =
-      if utf_len > 1 then
-        let uchar_len = Escape.Utf8.next_utf8_length maybe_utf 0 in
-            if uchar_len <= utf_len && Escape.Utf8.is_utf8_valid maybe_utf 0 uchar_len then
-              Some (String.sub maybe_utf ~pos:0 ~len:uchar_len, uchar_len)
-            else None
-      else None
-    in
-    match uchar with
-    | Some (uchar, len) ->
-      error
-        ~delta:(String.length skip)
-        ~delta_stop:(-len)
-        lexbuf
-        (Printf.sprintf "The character %s is not allowed inside %%{...} forms" uchar)
-    | _ ->
-      error
-        ~delta:(String.length skip)
-        lexbuf
-        (Printf.sprintf "The character %C is not allowed inside %%{...} forms" other)
+    error
+      ~delta:(String.length skip)
+      lexbuf
+      (Printf.sprintf "The character %C is not allowed inside %%{...} forms" other)
   }
 
 {
diff --git a/test/blackbox-tests/test-cases/formatting/non-ascii-characters.t b/test/blackbox-tests/test-cases/formatting/non-ascii-characters.t
index dcf44519b4e..923931b913a 100644
--- a/test/blackbox-tests/test-cases/formatting/non-ascii-characters.t
+++ b/test/blackbox-tests/test-cases/formatting/non-ascii-characters.t
@@ -9,7 +9,7 @@ Utf8 characters are handled for now, this is also related to the issue #9728
   > (run foo %{bin:é})
   > EOF
   File "", line 1, characters 15-16:
-  Error: The character é is not allowed inside %{...} forms
+  Error: The character '\195' is not allowed inside %{...} forms
   [1]
 
   $ dune format-dune-file <<EOF

From bafb44323a9bb5a40251f3354ab08d6dddc7149c Mon Sep 17 00:00:00 2001
From: Etienne Millon <me@emillon.org>
Date: Tue, 19 Mar 2024 11:33:42 +0100
Subject: [PATCH 3/4] Use uutf

Also leaving a note about using the stdlib later.

Signed-off-by: Etienne Millon <me@emillon.org>
---
 boot/libs.ml                                  |  2 +-
 src/dune_sexp/dune                            |  2 +-
 src/dune_sexp/escape.ml                       | 96 +++++++------------
 src/dune_sexp/escape.mli                      |  5 -
 .../formatting/non-ascii-characters.t         |  5 +
 5 files changed, 40 insertions(+), 70 deletions(-)

diff --git a/boot/libs.ml b/boot/libs.ml
index 6404a823ab1..8d5905b1f66 100644
--- a/boot/libs.ml
+++ b/boot/libs.ml
@@ -14,6 +14,7 @@ let local_libraries =
   ; ("vendor/fiber/src", Some "Fiber", false, None)
   ; ("src/dune_console", Some "Dune_console", false, None)
   ; ("src/memo", Some "Memo", false, None)
+  ; ("vendor/uutf", None, false, None)
   ; ("src/dune_sexp", Some "Dune_sexp", false, None)
   ; ("src/ocaml-config", Some "Ocaml_config", false, None)
   ; ("src/ocaml", Some "Ocaml", false, None)
@@ -23,7 +24,6 @@ let local_libraries =
   ; ("otherlibs/dune-rpc/private", Some "Dune_rpc_private", false, None)
   ; ("src/dune_config", Some "Dune_config", false, None)
   ; ("vendor/sha", None, false, None)
-  ; ("vendor/uutf", None, false, None)
   ; ("vendor/opam/src/core", None, false, None)
   ; ("vendor/opam-file-format", None, false, None)
   ; ("vendor/opam/src/format", None, false, None)
diff --git a/src/dune_sexp/dune b/src/dune_sexp/dune
index 4b661198c6e..d8508fcd1ab 100644
--- a/src/dune_sexp/dune
+++ b/src/dune_sexp/dune
@@ -1,7 +1,7 @@
 (library
  (name dune_sexp)
  (synopsis "[Internal] S-expression library")
- (libraries stdune)
+ (libraries stdune dune_uutf)
  (instrumentation
   (backend bisect_ppx)))
 
diff --git a/src/dune_sexp/escape.ml b/src/dune_sexp/escape.ml
index 2b00288ec73..f262fb29ed2 100644
--- a/src/dune_sexp/escape.ml
+++ b/src/dune_sexp/escape.ml
@@ -1,60 +1,34 @@
 open! Stdune
 
-module Utf8 = struct
-  (*
-     The first byte of an utf8 character gives the size in bytes of the utf8:
+(** Note: on OCaml >= 4.14, this can be switched to the following (and the
+    dependency to [Uutf] can be removed)
 
-     0xxxxxxxx -> 1
-     110xxxxxx -> 2
-     1110xxxxx -> 3
-     11110xxxx -> 4
-  *)
-  let utf8_byte_length u =
-    match Char.code u with
-    | u when u < 128 -> 1
-    | u when u < 194 -> 0
-    | u when u < 224 -> 2
-    | u when u < 240 -> 3
-    | u when u < 245 -> 4
-    | _ -> 0
-  ;;
-
-  let unsafe_get s i = String.unsafe_get s i |> Char.code
-  let next_utf8_length s i = utf8_byte_length (String.unsafe_get s i)
-
-  let is_utf8_valid s i l =
-    assert (String.length s >= l);
-    match l with
-    | 1 -> true
-    | 2 ->
-      let b1 = unsafe_get s (i + 1) in
-      if b1 lsr 6 != 0b10 then false else true
-    | 3 ->
-      let b0 = unsafe_get s i in
-      let b1 = unsafe_get s (i + 1) in
-      let b2 = unsafe_get s (i + 2) in
-      if b2 lsr 6 != 0b10
-      then false
+    {[
+      let next_valid_utf8_length s i =
+        let decode = String.get_utf_8_uchar s i in
+        Option.some_if (Uchar.utf_decode_is_valid decode) (Uchar.utf_decode_length decode)
+      ;;
+    ]} *)
+let next_valid_utf8_uchar_len s i =
+  let pos = ref i in
+  let buf = Bytes.create 1 in
+  let decoder = Uutf.decoder ~encoding:`UTF_8 `Manual in
+  let rec go () =
+    match Uutf.decode decoder with
+    | `Await ->
+      if !pos >= String.length s
+      then None
       else (
-        match b0 with
-        | 0xE0 -> if b1 < 0xA0 || 0xBF < b1 then false else true
-        | 0xED -> if b1 < 0x80 || 0x9F < b1 then false else true
-        | _ -> if b1 lsr 6 != 0b10 then false else true)
-    | 4 ->
-      let b0 = unsafe_get s i in
-      let b1 = unsafe_get s (i + 1) in
-      let b2 = unsafe_get s (i + 2) in
-      let b3 = unsafe_get s (i + 3) in
-      if b3 lsr 6 != 0b10 || b2 lsr 6 != 0b10
-      then false
-      else (
-        match b0 with
-        | 0xF0 -> if b1 < 0x90 || 0xBF < b1 then false else true
-        | 0xF4 -> if b1 < 0x80 || 0x8F < b1 then false else true
-        | _ -> if b1 lsr 6 != 0b10 then false else true)
-    | _ -> false
-  ;;
-end
+        Bytes.set buf 0 (String.get s !pos);
+        incr pos;
+        Uutf.Manual.src decoder buf 0 1;
+        go ())
+    | `Uchar _ -> Some (!pos - i)
+    | `Malformed _ -> None
+    | `End -> Code_error.raise "next_valid_utf8_uchar: `End" []
+  in
+  go ()
+;;
 
 let quote_length s =
   let n = ref 0 in
@@ -69,13 +43,11 @@ let quote_length s =
         | '%' -> if !i + 1 < len && s.[!i + 1] = '{' then 2 else 1
         | ' ' .. '~' -> 1
         | _ ->
-          let uchar_len = Utf8.next_utf8_length s !i in
-          (match Utf8.is_utf8_valid s !i uchar_len with
-           | true ->
-             assert (uchar_len > 1 && uchar_len < 5);
+          (match next_valid_utf8_uchar_len s !i with
+           | Some uchar_len ->
              i := !i + uchar_len - 1;
              uchar_len
-           | false -> 4));
+           | None -> 4));
     incr i
   done;
   !n
@@ -113,10 +85,8 @@ let escape_to s ~dst:s' ~ofs =
        Bytes.unsafe_set s' !n '%'
      | ' ' .. '~' as c -> Bytes.unsafe_set s' !n c
      | c ->
-       let uchar_len = Utf8.next_utf8_length s !i in
-       (match Utf8.is_utf8_valid s !i uchar_len with
-        | true ->
-          assert (uchar_len > 1 && uchar_len < 5);
+       (match next_valid_utf8_uchar_len s !i with
+        | Some uchar_len ->
           Bytes.unsafe_set s' !n (String.unsafe_get s !i);
           Bytes.unsafe_set s' (!n + 1) (String.unsafe_get s (!i + 1));
           if uchar_len > 2
@@ -125,7 +95,7 @@ let escape_to s ~dst:s' ~ofs =
           then Bytes.unsafe_set s' (!n + 3) (String.unsafe_get s (!i + 3));
           n := !n + uchar_len - 1;
           i := !i + uchar_len - 1
-        | false ->
+        | None ->
           let a = Char.code c in
           Bytes.unsafe_set s' !n '\\';
           incr n;
diff --git a/src/dune_sexp/escape.mli b/src/dune_sexp/escape.mli
index 1fe64470013..f4389a59735 100644
--- a/src/dune_sexp/escape.mli
+++ b/src/dune_sexp/escape.mli
@@ -1,7 +1,2 @@
-module Utf8 : sig
-  val next_utf8_length : string -> int -> int
-  val is_utf8_valid : string -> int -> int -> bool
-end
-
 val escaped : string -> string
 val quoted : string -> string
diff --git a/test/blackbox-tests/test-cases/formatting/non-ascii-characters.t b/test/blackbox-tests/test-cases/formatting/non-ascii-characters.t
index 923931b913a..7b444ba7d75 100644
--- a/test/blackbox-tests/test-cases/formatting/non-ascii-characters.t
+++ b/test/blackbox-tests/test-cases/formatting/non-ascii-characters.t
@@ -28,3 +28,8 @@ Utf8 characters are handled for now, this is also related to the issue #9728
   File "", line 1, characters 1-1:
   Error: Invalid . file
   [1]
+
+  $ bash -c "printf '(echo \"%b\")' '\xc0'"| dune format-dune-file
+  (echo "\192")
+  $ bash -c "printf '(echo \"%b\")' '\xf0'"| dune format-dune-file
+  (echo "\240")

From f413a42e946d12b4624f5ff0f4dbfac33d0839de Mon Sep 17 00:00:00 2001
From: Etienne Millon <me@emillon.org>
Date: Tue, 19 Mar 2024 13:10:15 +0100
Subject: [PATCH 4/4] changelog

Signed-off-by: Etienne Millon <me@emillon.org>
---
 doc/changes/10113.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/changes/10113.md b/doc/changes/10113.md
index 978357860da..dff4f391fd0 100644
--- a/doc/changes/10113.md
+++ b/doc/changes/10113.md
@@ -1,2 +1,2 @@
-- fix: handle utf8 characters in the dune files. (#10113, fixes #9728,
-  @moyodiallo)
+- dune file formatting: output utf8 if input is correctly encoded (#10113,
+  fixes #9728, @moyodiallo)