OCaml support (#848)

* Add support for ml (#827, WIP). TODO: use block comment instead of line comment
* Add block comments and OCaml support (#827)
* Remove the version information from the mirror notebooks
* Remove the comment suffix in uncomment/when parsing the header
* More OCaml-specific tests
* Update CHANGELOG.md

Co-authored-by: Quentin Fortier <[email protected]>
mwouts · Sep 8, 2021 · b553f32 · b553f32
1 parent e97819f
commit b553f32
Show file tree

Hide file tree

Showing 17 changed files with 366 additions and 49 deletions.
diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
@@ -5,10 +5,11 @@ Jupytext ChangeLog
 -----------------------
 
 **Added**
-- Jupytext can be configured through the `pyproject.toml` file. Thanks to Robin Brown for this contribution! (#828)
+- Jupytext can be configured through the `pyproject.toml` file. Thanks to Robin Brown for this contribution! ([#828](https://github.com/mwouts/jupytext/issues/828))
+- Jupytext now supports OCaml files with `.ml` extension. Thanks to Quentin Fortier for getting this started ([#832](https://github.com/mwouts/jupytext/issues/832))
 
 **Fixed**
-- Added more test to make sure that notebooks can be trusted. In practice, notebooks could not be trusted in JupyterLab<3.0.13 because of the absence of cell ids (#826)
+- Added more tests to make sure that notebooks can be trusted. In practice, notebooks could not be trusted in JupyterLab<3.0.13 because of the absence of cell ids ([#826](https://github.com/mwouts/jupytext/issues/826))
 
 
 1.11.5 (2021-08-31)

diff --git a/docs/languages.md b/docs/languages.md
@@ -14,6 +14,7 @@ Jupytext works with notebooks in any of the following languages:
 - Javascript
 - Julia
 - Matlab
+- OCaml
 - Octave
 - PowerShell
 - Python

diff --git a/jupytext/cell_reader.py b/jupytext/cell_reader.py
@@ -44,19 +44,31 @@
 _PY_INDENTED = re.compile(r"^\s")
 
 
-def uncomment(lines, prefix="#"):
+def uncomment(lines, prefix="#", suffix=""):
-    """Remove prefix and space, or only prefix, when possible"""
+    """Remove prefix (and space) and suffix (and space) from lines, when possible"""
-    if not prefix:
-        return lines
-    prefix_and_space = prefix + " "
-    length_prefix = len(prefix)
-    length_prefix_and_space = len(prefix_and_space)
-    return [
-        line[length_prefix_and_space:]
-        if line.startswith(prefix_and_space)
-        else (line[length_prefix:] if line.startswith(prefix) else line)
-        for line in lines
-    ]
+    if prefix:
+        prefix_and_space = prefix + " "
+        length_prefix = len(prefix)
+        length_prefix_and_space = len(prefix_and_space)
+        lines = [
+            line[length_prefix_and_space:]
+            if line.startswith(prefix_and_space)
+            else (line[length_prefix:] if line.startswith(prefix) else line)
+            for line in lines
+        ]
+
+    if suffix:
+        space_and_suffix = " " + suffix
+        length_suffix = len(suffix)
+        length_space_and_suffix = len(space_and_suffix)
+        lines = [
+            line[:-length_space_and_suffix]
+            if line.endswith(space_and_suffix)
+            else (line[:-length_suffix] if line.endswith(suffix) else line)
+            for line in lines
+        ]
+
+    return lines
 
 
 def paragraph_is_fully_commented(lines, comment, main_language):
@@ -539,14 +551,17 @@ def uncomment_code_and_magics(self, lines):
                 lines, self.ext, self.language or self.default_language
             )
 
-        return uncomment(lines, self.markdown_prefix or self.comment)
+        return uncomment(
+            lines, self.markdown_prefix or self.comment, self.comment_suffix
+        )
 
 
 class RScriptCellReader(ScriptCellReader):
     """Read notebook cells from R scripts written according
     to the knitr-spin syntax"""
 
     comment = "#'"
+    comment_suffix = ""
     markdown_prefix = "#'"
     default_language = "R"
     start_code_re = re.compile(r"^#\+(.*)\s*$")
@@ -620,6 +635,7 @@ def __init__(self, fmt=None, default_language=None):
         script = _SCRIPT_EXTENSIONS[self.ext]
         self.default_language = default_language or script["language"]
         self.comment = script["comment"]
+        self.comment_suffix = script.get("comment_suffix", "")
         self.ignore_end_marker = True
         self.explicit_end_marker_required = False
         if (
@@ -632,16 +648,29 @@ def __init__(self, fmt=None, default_language=None):
                 ",", 1
             )
             self.start_code_re = re.compile(
-                "^" + self.comment + r"\s*" + self.cell_marker_start + r"(.*)$"
+                "^"
+                + re.escape(self.comment)
+                + r"\s*"
+                + self.cell_marker_start
+                + r"(.*)$"
             )
             self.end_code_re = re.compile(
-                "^" + self.comment + r"\s*" + self.cell_marker_end + r"\s*$"
+                "^" + re.escape(self.comment) + r"\s*" + self.cell_marker_end + r"\s*$"
             )
         else:
-            self.start_code_re = re.compile("^" + self.comment + r"\s*\+(.*)$")
+            self.start_code_re = re.compile(
+                "^" + re.escape(self.comment) + r"\s*\+(.*)$"
+            )
 
     def metadata_and_language_from_option_line(self, line):
         if self.start_code_re.match(line):
+            # Remove the OCAML suffix
+            if self.comment_suffix:
+                if line.endswith(" " + self.comment_suffix):
+                    line = line[: -len(" " + self.comment_suffix)]
+                elif line.endswith(self.comment_suffix):
+                    line = line[: -len(self.comment_suffix)]
+
             # We want to parse inner most regions as cells.
             # Thus, if we find another region start before the end for this region,
             # we will have ignore the metadata that we found here, and move on to the next cell.
@@ -705,7 +734,7 @@ def find_cell_end(self, lines):
         elif not self.cell_marker_end:
             end_of_cell = self.metadata.get("endofcell", "-")
             self.end_code_re = re.compile(
-                "^" + self.comment + " " + end_of_cell + r"\s*$"
+                "^" + re.escape(self.comment) + " " + end_of_cell + r"\s*$"
             )
 
         return self.find_region_end(lines)
@@ -774,16 +803,22 @@ def __init__(self, fmt, default_language=None):
         script = _SCRIPT_EXTENSIONS[self.ext]
         self.default_language = default_language or script["language"]
         self.comment = script["comment"]
-        self.start_code_re = re.compile(r"^\s*{}\s*%%(%*)\s(.*)$".format(self.comment))
+        self.comment_suffix = script.get("comment_suffix", "")
+        self.start_code_re = re.compile(
+            r"^\s*{}\s*%%(%*)\s(.*)$".format(re.escape(self.comment))
+        )
         self.alternative_start_code_re = re.compile(
-            r"^\s*{}\s*(%%|<codecell>|In\[[0-9 ]*\]:?)\s*$".format(self.comment)
+            r"^\s*{}\s*(%%|<codecell>|In\[[0-9 ]*\]:?)\s*$".format(
+                re.escape(self.comment)
+            )
         )
         self.explicit_soc = True
 
     def metadata_and_language_from_option_line(self, line):
         """Parse code options on the given line. When a start of a code cell
         is found, self.metadata is set to a dictionary."""
         if self.start_code_re.match(line):
+            line = uncomment([line], self.comment, self.comment_suffix)[0]
             self.language, self.metadata = self.options_to_metadata(
                 line[line.find("%%") + 2 :]
             )

diff --git a/jupytext/cell_to_text.py b/jupytext/cell_to_text.py
@@ -77,6 +77,9 @@ def __init__(self, cell, default_language, fmt=None):
         self.language = self.language or cell.metadata.get("language", default_language)
         self.default_language = default_language
         self.comment = _SCRIPT_EXTENSIONS.get(self.ext, {}).get("comment", "#")
+        self.comment_suffix = _SCRIPT_EXTENSIONS.get(self.ext, {}).get(
+            "comment_suffix", ""
+        )
         self.comment_magics = self.fmt.get(
             "comment_magics", self.default_comment_magics
         )
@@ -168,7 +171,7 @@ def markdown_to_text(self, source):
                 explicitly_code=self.cell_type == "code",
             )
 
-        return comment_lines(source, self.comment)
+        return comment_lines(source, self.comment, self.comment_suffix)
 
     def code_to_text(self):
         """Return the text representation of this cell as a code cell"""
@@ -278,7 +281,9 @@ def endofcell_marker(source, comment):
     we add an end-of-cell marker"""
     endofcell = "-"
     while True:
-        endofcell_re = re.compile(r"^{}( )".format(comment) + endofcell + r"\s*$")
+        endofcell_re = re.compile(
+            r"^{}( )".format(re.escape(comment)) + endofcell + r"\s*$"
+        )
         if list(filter(endofcell_re.match, source)):
             endofcell = endofcell + "-"
         else:
@@ -492,9 +497,13 @@ def cell_to_text(self):
                     indent = left_space.groups()[0]
 
         if options.startswith("%") or not options:
-            lines = [indent + self.comment + " %%" + options]
+            lines = comment_lines(
+                ["%%" + options], indent + self.comment, self.comment_suffix
+            )
         else:
-            lines = [indent + self.comment + " %% " + options]
+            lines = comment_lines(
+                ["%% " + options], indent + self.comment, self.comment_suffix
+            )
 
         if self.is_code() and active:
             source = copy(self.source)
@@ -555,4 +564,4 @@ def cell_to_text(self):
             cell_marker
             if cell_marker.startswith("#" * 20)
             else self.default_cell_marker
-        ] + comment_lines(self.source, self.comment)
+        ] + comment_lines(self.source, self.comment, self.comment_suffix)
diff --git a/jupytext/formats.py b/jupytext/formats.py
@@ -60,11 +60,13 @@ def __init__(
         cell_reader_class,
         cell_exporter_class,
         current_version_number,
+        header_suffix="",
         min_readable_version_number=None,
     ):
         self.format_name = format_name
         self.extension = extension
         self.header_prefix = header_prefix
+        self.header_suffix = header_suffix
         self.cell_reader_class = cell_reader_class
         self.cell_exporter_class = cell_exporter_class
         self.current_version_number = current_version_number
@@ -115,6 +117,7 @@ def __init__(
             format_name="light",
             extension=ext,
             header_prefix=_SCRIPT_EXTENSIONS[ext]["comment"],
+            header_suffix=_SCRIPT_EXTENSIONS[ext].get("comment_suffix", ""),
             cell_reader_class=LightScriptCellReader,
             cell_exporter_class=LightScriptCellExporter,
             # Version 1.5 on 2019-10-19 - jupytext v1.3.0 - Cell metadata represented as key=value by default
@@ -136,6 +139,7 @@ def __init__(
             format_name="nomarker",
             extension=ext,
             header_prefix=_SCRIPT_EXTENSIONS[ext]["comment"],
+            header_suffix=_SCRIPT_EXTENSIONS[ext].get("comment_suffix", ""),
             cell_reader_class=LightScriptCellReader,
             cell_exporter_class=BareScriptCellExporter,
             current_version_number="1.0",
@@ -148,6 +152,7 @@ def __init__(
             format_name="percent",
             extension=ext,
             header_prefix=_SCRIPT_EXTENSIONS[ext]["comment"],
+            header_suffix=_SCRIPT_EXTENSIONS[ext].get("comment_suffix", ""),
             cell_reader_class=DoublePercentScriptCellReader,
             cell_exporter_class=DoublePercentCellExporter,
             # Version 1.3 on 2019-09-21 - jupytext v1.3.0: Markdown cells can be quoted using triple quotes #305
@@ -166,6 +171,7 @@ def __init__(
             format_name="hydrogen",
             extension=ext,
             header_prefix=_SCRIPT_EXTENSIONS[ext]["comment"],
+            header_suffix=_SCRIPT_EXTENSIONS[ext].get("comment_suffix", ""),
             cell_reader_class=HydrogenCellReader,
             cell_exporter_class=HydrogenCellExporter,
             # Version 1.2 on 2018-12-14 - jupytext v0.9.0: same as percent - only magics are not commented by default
@@ -297,7 +303,7 @@ def guess_format(text, ext):
     # Is this a Hydrogen-like script?
     # Or a Sphinx-gallery script?
     if ext in _SCRIPT_EXTENSIONS:
-        comment = _SCRIPT_EXTENSIONS[ext]["comment"]
+        comment = re.escape(_SCRIPT_EXTENSIONS[ext]["comment"])
         language = _SCRIPT_EXTENSIONS[ext]["language"]
         twenty_hash_re = re.compile(r"^#( |)#{19,}\s*$")
         double_percent_re = re.compile(r"^{}( %%|%%)$".format(comment))

diff --git a/jupytext/header.py b/jupytext/header.py
@@ -31,14 +31,18 @@ def insert_or_test_version_number():
     return INSERT_AND_CHECK_VERSION_NUMBER
 
 
-def uncomment_line(line, prefix):
+def uncomment_line(line, prefix, suffix=""):
-    """Remove prefix (and space) from line"""
+    """Remove prefix (and space) and suffix (and preceding space) from line"""
-    if not prefix:
-        return line
-    if line.startswith(prefix + " "):
-        return line[len(prefix) + 1 :]
-    if line.startswith(prefix):
-        return line[len(prefix) :]
+    if prefix:
+        if line.startswith(prefix + " "):
+            line = line[len(prefix) + 1 :]
+        elif line.startswith(prefix):
+            line = line[len(prefix) :]
+    if suffix:
+        if line.endswith(" " + suffix):
+            line = line[: -(1 + len(suffix))]
+        elif line.endswith(suffix):
+            line = line[: -len(suffix)]
     return line
 
 
@@ -133,7 +137,10 @@ def metadata_and_cell_to_header(notebook, metadata, text_format, fmt):
         ):
             header = ["<!--", ""] + header + ["", "-->"]
 
-    return comment_lines(header, text_format.header_prefix), lines_to_next_cell
+    return (
+        comment_lines(header, text_format.header_prefix, text_format.header_suffix),
+        lines_to_next_cell,
+    )
 
 
 def recursive_update(target, update):
@@ -173,7 +180,7 @@ def header_to_metadata_and_cell(
     comment = "#" if header_prefix == "#'" else header_prefix
 
     encoding_re = re.compile(
-        r"^[ \t\f]*{}.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)".format(comment)
+        r"^[ \t\f]*{}.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)".format(re.escape(comment))
     )
 
     for i, line in enumerate(lines):
@@ -203,7 +210,13 @@ def header_to_metadata_and_cell(
             if not started and not line.strip():
                 continue
 
-        line = uncomment_line(line, header_prefix)
+        # OCAML
+        if header_prefix == "(*":
+            header_suffix = "*)"
+        else:
+            header_suffix = ""
+
+        line = uncomment_line(line, header_prefix, header_suffix)
         if _HEADER_RE.match(line):
             if not started:
                 started = True

diff --git a/jupytext/languages.py b/jupytext/languages.py
@@ -58,6 +58,11 @@
     ".java": {"language": "java", "comment": "//"},
     ".groovy": {"language": "groovy", "comment": "//"},
     ".sage": {"language": "sage", "comment": "#"},
+    ".ml": {
+        "language": "ocaml",
+        "comment": "(*",
+        "comment_suffix": "*)",
+    },  # OCaml only has block comments
 }
 
 _COMMENT_CHARS = [
@@ -192,8 +197,13 @@ def cell_language(source, default_language, custom_cell_magics):
     return None, None
 
 
-def comment_lines(lines, prefix):
+def comment_lines(lines, prefix, suffix=""):
-    """Return commented lines"""
+    """Return lines commented with prefix and, when given, a closing suffix"""
     if not prefix:
         return lines
-    return [prefix + " " + line if line else prefix for line in lines]
+    if not suffix:
+        return [prefix + " " + line if line else prefix for line in lines]
+    return [
+        prefix + " " + line + " " + suffix if line else prefix + " " + suffix
+        for line in lines
+    ]
-Original file line number
+Diff line change
@@ Expand Up @@
     - Javascript
     - Julia
     - Matlab
+    - OCaml
     - Octave
     - PowerShell
     - Python
@@ Expand Down @@