
Commit

Merge pull request #3 from facelessuser/python-group-comments
Add option to group consecutive Python comments
facelessuser authored Dec 20, 2017
2 parents 680d4f7 + 6896e53 commit 00b36e3
Showing 3 changed files with 37 additions and 20 deletions.
1 change: 1 addition & 0 deletions docs/src/markdown/changelog.md
@@ -2,6 +2,7 @@
 
 ## 0.1.0a2
 
+- **NEW**: Add option to group consecutive Python comments.
 - **FIX**: Properly return error.
 - **FIX**: Only retry with default encoding if exception thrown was a `UnicodeDecodeError`.
 
29 changes: 15 additions & 14 deletions docs/src/markdown/parsers.md
@@ -22,9 +22,9 @@ This is a parser that parses general text files to Unicode. It takes a file an
 
 The Markdown parser converts a text file using Python Markdown and returns a single `SourceText` object containing HTML text. It can be included via `pyspelling.parsers.markdown_parser`.
 
-Options               | Type          | Description
---------------------- | ------------- | -----------
-`markdown_extensions` | [string/dict] | A list of strings defining markdown extensions to use. You can substitute the string with a dict that defines the extension as the key and the value as a dictionary of options.
+Options               | Type          | Default    | Description
+--------------------- | ------------- | ---------- | -----------
+`markdown_extensions` | [string/dict] | `#!py3 []` | A list of strings defining markdown extensions to use. You can substitute the string with a dict that defines the extension as the key and the value as a dictionary of options.
 
 ```yaml
 - name: Markdown
@@ -43,11 +43,11 @@ Options | Type | Description
 
 The HTML parser will look for the encoding of the HTML in its header and convert the buffer to Unicode. It then uses BeautifulSoup4 to parse the content, and aggregates all text that should be spell checked into a single `SourceText` object. It can be configured to avoid certain tags, classes, IDs, or other attributes if desired. It can also be instructed to scan certain tag attributes for content to spell check. It can be included via `pyspelling.parsers.html_parser`.
 
-Options      | Type     | Description
------------- | -------- | -----------
-`comments`   | bool     | Include comment text in the output.
-`attributes` | [string] | Attributes whose content should be included in the output.
-`ignores`    | [string] | Simple selectors that identify tags to ignore. Only allows tags, IDs, classes, and other attributes.
+Options      | Type     | Default      | Description
+------------ | -------- | ------------ | -----------
+`comments`   | bool     | `#!py3 True` | Include comment text in the output.
+`attributes` | [string] | `#!py3 []`   | Attributes whose content should be included in the output.
+`ignores`    | [string] | `#!py3 []`   | Simple selectors that identify tags to ignore. Only allows tags, IDs, classes, and other attributes.
 
 ```yaml
 - name: mkdocs
@@ -79,12 +79,13 @@ The Python parser will look for the encoding of the file in the header, and conv
 
 Text is returned in blocks based on the context of the text. Each docstring is returned as its own object, and comments and strings are returned as their own objects as well. This is so that if you do something like write your docstrings in Markdown, you can run each one individually through the Markdown filter, or some other filter if required.
 
-Options      | Type | Description
------------- | ---- | -----------
-`strings`    | bool | Return `SourceText` entries for each string.
-`comments`   | bool | Return `SourceText` entries for each comment.
-`docstrings` | bool | Return `SourceText` entries for each docstring.
-`bytes`      | bool | Return `SourceText` entries for each byte string. Only ASCII content will be included, and encoding will be returned as ASCII.
+Options          | Type | Default       | Description
+---------------- | ---- | ------------- | -----------
+`comments`       | bool | `#!py3 True`  | Return `SourceText` entries for each comment.
+`docstrings`     | bool | `#!py3 True`  | Return `SourceText` entries for each docstring.
+`strings`        | bool | `#!py3 False` | Return `SourceText` entries for each string.
+`bytes`          | bool | `#!py3 False` | Return `SourceText` entries for each byte string. Only ASCII content will be included, and encoding will be returned as ASCII.
+`group_comments` | bool | `#!py3 False` | Group consecutive Python comments as one `SourceText` entry.
 
 ```yaml
 - name: python
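
The example above is truncated by the diff view. For context, here is a minimal sketch of how the new option might be enabled in a pyspelling configuration. Only `group_comments` itself comes from this commit's documentation; the surrounding keys (`parser`, `options`, `sources`) and the source glob are assumptions extrapolated from the truncated `- name: python` example, not confirmed by the commit.

```yaml
# Hypothetical configuration entry; keys other than `group_comments`
# are assumed, not confirmed by this commit.
- name: python
  parser: pyspelling.parsers.python_parser
  options:
    # Merge runs of consecutive comment lines into one SourceText entry.
    group_comments: true
  sources:
  - pyspelling/**/*.py
```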
27 changes: 21 additions & 6 deletions pyspelling/parsers/python_parser.py
@@ -58,8 +58,9 @@ def __init__(self, options, default_encoding='ascii'):
 
         self.comments = options.get('comments', True) is True
         self.docstrings = options.get('docstrings', True) is True
-        self.strings = options.get('strings', True) is True
+        self.strings = options.get('strings', False) is True
         self.bytes = options.get('bytes', False) is True
+        self.group_comments = options.get('group_comments', False) is True
         super(PythonParser, self).__init__(options, default_encoding)
 
     def is_py2_unicode_literals(self, text, source_file):
@@ -110,6 +111,7 @@ def parse_docstrings(self, source_file, encoding):
             token_type = token[0]
             value = token[1]
             line = util.ustr(token[2][0])
+            line_num = token[2][0]
 
             if util.PY3 and token_type == tokenize.ENCODING:
                 # PY3 will tell us for sure what our encoding is
@@ -135,11 +137,20 @@
 
             if token_type == tokenize.COMMENT and self.comments:
                 # Capture comments
-                if len(stack) > 1:
-                    loc = "%s(%s): %s" % (stack[0][0], line, ''.join([crumb[0] for crumb in stack[1:]]))
+                if (
+                    self.group_comments and
+                    prev_token_type == tokenize.NL and
+                    comments and (comments[-1][2] + 1) == line_num
+                ):
+                    # Group multiple consecutive comments
+                    comments[-1][0] += '\n' + value[1:]
+                    comments[-1][2] = line_num
                 else:
-                    loc = "%s(%s)" % (stack[0][0], line)
-                comments.append(parsers.SourceText(value, loc, encoding, 'comment'))
+                    if len(stack) > 1:
+                        loc = "%s(%s): %s" % (stack[0][0], line, ''.join([crumb[0] for crumb in stack[1:]]))
+                    else:
+                        loc = "%s(%s)" % (stack[0][0], line)
+                    comments.append([value[1:], loc, line_num])
             if token_type == tokenize.STRING:
                 # Capture docstrings
                 # If we captured an INDENT or NEWLINE previously, we probably have a docstring.
@@ -179,7 +190,11 @@ def parse_docstrings(self, source_file, encoding):
 
             prev_token_type = token_type
 
-        return docstrings + comments + strings
+        final_comments = []
+        for comment in comments:
+            final_comments.append(parsers.SourceText(textwrap.dedent(comment[0]), comment[1], encoding, 'comment'))
+
+        return docstrings + final_comments + strings
 
     def parse_file(self, source_file, encoding):
         """Parse Python file returning content."""
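
To make the grouping logic concrete, here is a self-contained sketch (not from the commit) of the same technique: comments on consecutive lines are merged into one entry, tracked by the line number of the last comment seen, exactly as the `(comments[-1][2] + 1) == line_num` check above does. It simplifies the real implementation by dropping the `prev_token_type == tokenize.NL` guard and the stack-based location strings, so a trailing comment after code could merge with a comment on the next line; `group_comments` here is a hypothetical helper name.

```python
import io
import tokenize


def group_comments(source):
    """Collect comments from Python source, merging runs on consecutive lines.

    Returns (text, first_line) tuples rather than pyspelling `SourceText`
    objects; the merge test mirrors the commit's line-adjacency check.
    """
    groups = []  # each item: [text, first_line, last_line]
    for tok in tokenize.generate_tokens(io.StringIO(source).readline):
        if tok.type != tokenize.COMMENT:
            continue
        line_num = tok.start[0]
        text = tok.string[1:]  # strip the leading '#', like value[1:] above
        if groups and (groups[-1][2] + 1) == line_num:
            # Comment directly below the previous one: fold it in.
            groups[-1][0] += '\n' + text
            groups[-1][2] = line_num
        else:
            groups.append([text, line_num, line_num])
    return [(text, first) for text, first, _last in groups]


sample = (
    "# first line\n"
    "# second line\n"
    "x = 1\n"
    "# a separate comment\n"
)
for text, line in group_comments(sample):
    print(line, repr(text))
# Prints two entries: the merged pair starting at line 1,
# and the lone comment at line 4.
```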
