From 89e28ea66f50d4281cb9f624e31566aed9d5aab1 Mon Sep 17 00:00:00 2001
From: tungol <github@tungol.org>
Date: Mon, 20 Nov 2023 20:44:33 -0800
Subject: [PATCH] Permit standalone form feed characters at the module level
 (#4021)

Co-authored-by: Stephen Morton <git@tungol.org>
Co-authored-by: Jelle Zijlstra <jelle.zijlstra@gmail.com>
---
 CHANGES.md                                    |   2 +-
 .../reference/reference_functions.rst         |   4 +-
 docs/the_black_code_style/future_style.md     |  11 +
 src/black/comments.py                         |  39 ++-
 src/black/linegen.py                          |  25 +-
 src/black/lines.py                            |  14 +-
 src/black/mode.py                             |   1 +
 src/black/nodes.py                            |   7 +
 src/black/output.py                           |  23 +-
 src/blib2to3/pgen2/driver.py                  |   2 +
 tests/data/cases/preview_form_feeds.py        | 225 ++++++++++++++++++
 11 files changed, 318 insertions(+), 35 deletions(-)
 create mode 100644 tests/data/cases/preview_form_feeds.py

diff --git a/CHANGES.md b/CHANGES.md
index 8d0f10a2f3a..4c3fbf1afc8 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -12,7 +12,7 @@
 
 ### Preview style
 
-<!-- Changes that affect Black's preview style -->
+- Standalone form feed characters at the module level are no longer removed (#4021)
 
 - Additional cases of immediately nested tuples, lists, and dictionaries are now
   indented less (#4012)
diff --git a/docs/contributing/reference/reference_functions.rst b/docs/contributing/reference/reference_functions.rst
index dd92e37a7d4..ebadf6975a7 100644
--- a/docs/contributing/reference/reference_functions.rst
+++ b/docs/contributing/reference/reference_functions.rst
@@ -149,7 +149,7 @@ Utilities
 
 .. autofunction:: black.numerics.normalize_numeric_literal
 
-.. autofunction:: black.linegen.normalize_prefix
+.. autofunction:: black.comments.normalize_trailing_prefix
 
 .. autofunction:: black.strings.normalize_string_prefix
 
@@ -168,3 +168,5 @@ Utilities
 .. autofunction:: black.strings.sub_twice
 
 .. autofunction:: black.nodes.whitespace
+
+.. autofunction:: black.nodes.make_simple_prefix
diff --git a/docs/the_black_code_style/future_style.md b/docs/the_black_code_style/future_style.md
index 428bd87ab50..f55ea5f60a9 100644
--- a/docs/the_black_code_style/future_style.md
+++ b/docs/the_black_code_style/future_style.md
@@ -296,3 +296,12 @@ s = (  # Top comment
     # Bottom comment
 )
 ```
+
+### Form feed characters
+
+_Black_ will now retain form feed characters on otherwise empty lines at the module
+level. Only one form feed is retained for a group of consecutive empty lines. Where
+there are two empty lines in a row, the form feed will be placed on the second line.
+
+_Black_ already retained form feed literals inside a comment or inside a string. This
+remains the case.
diff --git a/src/black/comments.py b/src/black/comments.py
index 862fc7607cc..8a0e925fdc0 100644
--- a/src/black/comments.py
+++ b/src/black/comments.py
@@ -10,6 +10,7 @@
     WHITESPACE,
     container_of,
     first_leaf_of,
+    make_simple_prefix,
     preceding_leaf,
     syms,
 )
@@ -44,6 +45,7 @@ class ProtoComment:
     value: str  # content of the comment
     newlines: int  # how many newlines before the comment
     consumed: int  # how many characters of the original leaf's prefix did we consume
+    form_feed: bool  # is there a form feed before the comment
 
 
 def generate_comments(leaf: LN) -> Iterator[Leaf]:
@@ -65,8 +67,12 @@ def generate_comments(leaf: LN) -> Iterator[Leaf]:
     Inline comments are emitted as regular token.COMMENT leaves.  Standalone
     are emitted with a fake STANDALONE_COMMENT token identifier.
     """
+    total_consumed = 0
     for pc in list_comments(leaf.prefix, is_endmarker=leaf.type == token.ENDMARKER):
-        yield Leaf(pc.type, pc.value, prefix="\n" * pc.newlines)
+        total_consumed = pc.consumed
+        prefix = make_simple_prefix(pc.newlines, pc.form_feed)
+        yield Leaf(pc.type, pc.value, prefix=prefix)
+    normalize_trailing_prefix(leaf, total_consumed)
 
 
 @lru_cache(maxsize=4096)
@@ -79,11 +85,14 @@ def list_comments(prefix: str, *, is_endmarker: bool) -> List[ProtoComment]:
     consumed = 0
     nlines = 0
     ignored_lines = 0
-    for index, line in enumerate(re.split("\r?\n", prefix)):
-        consumed += len(line) + 1  # adding the length of the split '\n'
-        line = line.lstrip()
+    form_feed = False
+    for index, full_line in enumerate(re.split("\r?\n", prefix)):
+        consumed += len(full_line) + 1  # adding the length of the split '\n'
+        line = full_line.lstrip()
         if not line:
             nlines += 1
+            if "\f" in full_line:
+                form_feed = True
         if not line.startswith("#"):
             # Escaped newlines outside of a comment are not really newlines at
             # all. We treat a single-line comment following an escaped newline
@@ -99,13 +108,33 @@ def list_comments(prefix: str, *, is_endmarker: bool) -> List[ProtoComment]:
         comment = make_comment(line)
         result.append(
             ProtoComment(
-                type=comment_type, value=comment, newlines=nlines, consumed=consumed
+                type=comment_type,
+                value=comment,
+                newlines=nlines,
+                consumed=consumed,
+                form_feed=form_feed,
             )
         )
+        form_feed = False
         nlines = 0
     return result
 
 
+def normalize_trailing_prefix(leaf: LN, total_consumed: int) -> None:
+    """Normalize the prefix that's left over after generating comments.
+
+    Note: don't use backslashes for formatting or you'll lose your voting rights.
+    """
+    remainder = leaf.prefix[total_consumed:]
+    if "\\" not in remainder:
+        nl_count = remainder.count("\n")
+        form_feed = "\f" in remainder and remainder.endswith("\n")
+        leaf.prefix = make_simple_prefix(nl_count, form_feed)
+        return
+
+    leaf.prefix = ""
+
+
 def make_comment(content: str) -> str:
     """Return a consistently formatted comment from the given `content` string.
 
diff --git a/src/black/linegen.py b/src/black/linegen.py
index 8a2cd4710b9..7fbbe290d7e 100644
--- a/src/black/linegen.py
+++ b/src/black/linegen.py
@@ -149,7 +149,8 @@ def visit_default(self, node: LN) -> Iterator[Line]:
                     self.current_line.append(comment)
                     yield from self.line()
 
-            normalize_prefix(node, inside_brackets=any_open_brackets)
+            if any_open_brackets:
+                node.prefix = ""
             if self.mode.string_normalization and node.type == token.STRING:
                 node.value = normalize_string_prefix(node.value)
                 node.value = normalize_string_quotes(node.value)
@@ -1035,8 +1036,6 @@ def bracket_split_build_line(
         result.inside_brackets = True
         result.depth += 1
         if leaves:
-            # Since body is a new indent level, remove spurious leading whitespace.
-            normalize_prefix(leaves[0], inside_brackets=True)
             # Ensure a trailing comma for imports and standalone function arguments, but
             # be careful not to add one after any comments or within type annotations.
             no_commas = (
@@ -1106,7 +1105,7 @@ def split_wrapper(
         line: Line, features: Collection[Feature], mode: Mode
     ) -> Iterator[Line]:
         for split_line in split_func(line, features, mode):
-            normalize_prefix(split_line.leaves[0], inside_brackets=True)
+            split_line.leaves[0].prefix = ""
             yield split_line
 
     return split_wrapper
@@ -1250,24 +1249,6 @@ def append_to_line(leaf: Leaf) -> Iterator[Line]:
         yield current_line
 
 
-def normalize_prefix(leaf: Leaf, *, inside_brackets: bool) -> None:
-    """Leave existing extra newlines if not `inside_brackets`. Remove everything
-    else.
-
-    Note: don't use backslashes for formatting or you'll lose your voting rights.
-    """
-    if not inside_brackets:
-        spl = leaf.prefix.split("#")
-        if "\\" not in spl[0]:
-            nl_count = spl[-1].count("\n")
-            if len(spl) > 1:
-                nl_count -= 1
-            leaf.prefix = "\n" * nl_count
-            return
-
-    leaf.prefix = ""
-
-
 def normalize_invisible_parens(  # noqa: C901
     node: Node, parens_after: Set[str], *, mode: Mode, features: Collection[Feature]
 ) -> None:
diff --git a/src/black/lines.py b/src/black/lines.py
index 3ade0a5f4a5..ec6145ff848 100644
--- a/src/black/lines.py
+++ b/src/black/lines.py
@@ -31,6 +31,7 @@
     is_type_comment,
     is_type_ignore_comment,
     is_with_or_async_with_stmt,
+    make_simple_prefix,
     replace_child,
     syms,
     whitespace,
@@ -520,12 +521,12 @@ class LinesBlock:
     before: int = 0
     content_lines: List[str] = field(default_factory=list)
     after: int = 0
+    form_feed: bool = False
 
     def all_lines(self) -> List[str]:
         empty_line = str(Line(mode=self.mode))
-        return (
-            [empty_line * self.before] + self.content_lines + [empty_line * self.after]
-        )
+        prefix = make_simple_prefix(self.before, self.form_feed, empty_line)
+        return [prefix] + self.content_lines + [empty_line * self.after]
 
 
 @dataclass
@@ -550,6 +551,12 @@ def maybe_empty_lines(self, current_line: Line) -> LinesBlock:
         This is for separating `def`, `async def` and `class` with extra empty
         lines (two on module-level).
         """
+        form_feed = (
+            Preview.allow_form_feeds in self.mode
+            and current_line.depth == 0
+            and bool(current_line.leaves)
+            and "\f\n" in current_line.leaves[0].prefix
+        )
         before, after = self._maybe_empty_lines(current_line)
         previous_after = self.previous_block.after if self.previous_block else 0
         before = (
@@ -575,6 +582,7 @@ def maybe_empty_lines(self, current_line: Line) -> LinesBlock:
             original_line=current_line,
             before=before,
             after=after,
+            form_feed=form_feed,
         )
 
         # Maintain the semantic_leading_comment state.
diff --git a/src/black/mode.py b/src/black/mode.py
index 1aa5cbecc86..04038f49627 100644
--- a/src/black/mode.py
+++ b/src/black/mode.py
@@ -194,6 +194,7 @@ class Preview(Enum):
     allow_empty_first_line_before_new_block_or_comment = auto()
     single_line_format_skip_with_multiple_comments = auto()
     long_case_block_line_splitting = auto()
+    allow_form_feeds = auto()
 
 
 class Deprecated(UserWarning):
diff --git a/src/black/nodes.py b/src/black/nodes.py
index 9251b0defb0..de53f8e36a3 100644
--- a/src/black/nodes.py
+++ b/src/black/nodes.py
@@ -407,6 +407,13 @@ def whitespace(leaf: Leaf, *, complex_subscript: bool, mode: Mode) -> str:  # no
     return SPACE
 
 
+def make_simple_prefix(nl_count: int, form_feed: bool, empty_line: str = "\n") -> str:
+    """Return `nl_count` empty lines, the last holding a form feed if `form_feed`."""
+    if form_feed:
+        return (empty_line * (nl_count - 1)) + "\f" + empty_line
+    return empty_line * nl_count
+
+
 def preceding_leaf(node: Optional[LN]) -> Optional[Leaf]:
     """Return the first leaf that precedes `node`, if any."""
     while node:
diff --git a/src/black/output.py b/src/black/output.py
index f4c17f28ea4..7c7dd0fe14e 100644
--- a/src/black/output.py
+++ b/src/black/output.py
@@ -4,8 +4,9 @@
 """
 
 import json
+import re
 import tempfile
-from typing import Any, Optional
+from typing import Any, List, Optional
 
 from click import echo, style
 from mypy_extensions import mypyc_attr
@@ -55,12 +56,28 @@ def ipynb_diff(a: str, b: str, a_name: str, b_name: str) -> str:
     return "".join(diff_lines)
 
 
+_line_pattern = re.compile(r"(.*?(?:\r\n|\n|\r|$))")
+
+
+def _splitlines_no_ff(source: str) -> List[str]:
+    """Split a string into lines, breaking only on LF, CR, and CRLF (not form feed).
+
+    This mimics how the Python parser splits source code.
+
+    A simplified version of the function with the same name in CPython's Lib/ast.py.
+    """
+    result = [match[0] for match in _line_pattern.finditer(source)]
+    if result[-1] == "":
+        result.pop(-1)
+    return result
+
+
 def diff(a: str, b: str, a_name: str, b_name: str) -> str:
     """Return a unified diff string between strings `a` and `b`."""
     import difflib
 
-    a_lines = a.splitlines(keepends=True)
-    b_lines = b.splitlines(keepends=True)
+    a_lines = _splitlines_no_ff(a)
+    b_lines = _splitlines_no_ff(b)
     diff_lines = []
     for line in difflib.unified_diff(
         a_lines, b_lines, fromfile=a_name, tofile=b_name, n=5
diff --git a/src/blib2to3/pgen2/driver.py b/src/blib2to3/pgen2/driver.py
index e629843f8b9..be3984437a8 100644
--- a/src/blib2to3/pgen2/driver.py
+++ b/src/blib2to3/pgen2/driver.py
@@ -222,6 +222,8 @@ def _partially_consume_prefix(self, prefix: str, column: int) -> Tuple[str, str]
             elif char == "\n":
                 # unexpected empty line
                 current_column = 0
+            elif char == "\f":
+                current_column = 0
             else:
                 # indent is finished
                 wait_for_nl = True
diff --git a/tests/data/cases/preview_form_feeds.py b/tests/data/cases/preview_form_feeds.py
new file mode 100644
index 00000000000..2d8653a1f04
--- /dev/null
+++ b/tests/data/cases/preview_form_feeds.py
@@ -0,0 +1,225 @@
+# flags: --preview
+
+
+# Warning! This file contains form feeds (ASCII 0x0C, often represented by \f or ^L).
+# These may be invisible in your editor: ensure you can see them before making changes here.
+
+# There's one at the start that'll get stripped
+
+# Comment and statement processing is different enough that we'll test variations of both
+# contexts here
+
+#
+
+
+#
+
+
+#
+
+
+
+#
+
+
+
+#
+
+
+
+#
+
+
+#
+
+
+
+#
+
+#
+        
+#
+
+\
+#
+pass
+
+pass
+
+
+pass
+
+
+pass
+
+
+
+pass
+
+
+
+pass
+
+
+
+pass
+
+
+pass
+
+
+
+pass
+
+pass
+        
+pass
+
+
+# form feed after a dedent
+def foo():
+    pass
+
+pass
+
+
+# form feeds are prohibited inside blocks, or on a line with nonwhitespace
+defbar(a=1,b:bool=False):
+
+    
+    pass
+
+
+class Baz:
+
+    def __init__(self):
+        pass
+    
+    
+    def something(self):
+        pass
+    
+
+
+# 
+pass
+pass #
+a = 1
+#
+pass
+a = 1
+
+a = [
+
+]
+
+# as internal whitespace of a comment is allowed but why
+"form feed literal in a string is okay"
+
+# form feeds at the very end get removed.
+
+
+
+# output
+
+# Warning! This file contains form feeds (ASCII 0x0C, often represented by \f or ^L).
+# These may be invisible in your editor: ensure you can see them before making changes here.
+
+# There's one at the start that'll get stripped
+
+# Comment and statement processing is different enough that we'll test variations of both
+# contexts here
+
+#
+
+
+#
+
+
+#
+
+
+#
+
+
+#
+
+
+#
+
+
+#
+
+
+#
+
+#
+
+#
+
+#
+pass
+
+pass
+
+
+pass
+
+
+pass
+
+
+pass
+
+
+pass
+
+
+pass
+
+
+pass
+
+
+pass
+
+pass
+
+pass
+
+
+# form feed after a dedent
+def foo():
+    pass
+
+
+pass
+
+
+# form feeds are prohibited inside blocks, or on a line with nonwhitespace
+def bar(a=1, b: bool = False):
+    pass
+
+
+class Baz:
+    def __init__(self):
+        pass
+
+    def something(self):
+        pass
+
+
+#
+pass
+pass  #
+a = 1
+#
+pass
+a = 1
+
+a = []
+
+# as internal whitespace of a comment is allowed but why
+"form feed literal in a string is okay"
+
+# form feeds at the very end get removed.