TeamHG-Memex · kmike · Sep 25, 2018 · Aug 24, 2018 · Aug 24, 2018 · Aug 27, 2018
diff --git a/.gitignore b/.gitignore
@@ -44,6 +44,7 @@ nosetests.xml
 coverage.xml
 *,cover
 .hypothesis/
+.pytest_cache
 
 # Translations
 *.mo

diff --git a/CHANGES.rst b/CHANGES.rst
@@ -2,6 +2,20 @@
 History
 =======
 
+0.4.0 TBD
+------------------
+
+This is a backwards-incompatible release: by default html_text functions
+now add newlines after elements, if appropriate, to make the extracted text
+look more like how it is rendered in a browser.
+
+To turn it off, pass ``guess_layout=False`` option to html_text functions.
+
+* Add ``guess_layout`` option to make extracted text look more like how
+  it is rendered in a browser.
+* Add tests of layout extraction for real webpages.
+
+
 0.3.0 (2017-10-12)
 ------------------
 

diff --git a/README.rst b/README.rst
@@ -25,10 +25,12 @@ How is html_text different from ``.xpath('//text()')`` from LXML
 or ``.get_text()`` from Beautiful Soup?
 Text extracted with ``html_text`` does not contain inline styles,
 javascript, comments and other text that is not normally visible to the users.
-It normalizes whitespace, but is also smarter than ``.xpath('normalize-space())``,
-adding spaces around inline elements too
+It normalizes whitespace, but is also smarter than
+``.xpath('normalize-space()')``, adding spaces around inline elements
 (which are often used as block elements in html markup),
-and tries to avoid adding extra spaces for punctuation.
+tries to avoid adding extra spaces for punctuation and
+can add newlines so that the output text looks like how it is rendered in
+browsers.
 
 Apart from just getting text from the page (e.g. for display or search),
 one intended usage of this library is for machine learning (feature extraction).
@@ -56,26 +58,58 @@ Usage
 Extract text from HTML::
 
     >>> import html_text
-    >>> text = html_text.extract_text(u'<h1>Hey</h1>')
-    u'Hey'
+    >>> html_text.extract_text('<h1>Hello</h1> world!')
+    'Hello\n\nworld!'
+
+    >>> html_text.extract_text('<h1>Hello</h1> world!', guess_layout=False)
+    'Hello world!'
+
+
 
 You can also pass already parsed ``lxml.html.HtmlElement``:
 
     >>> import html_text
-    >>> tree = html_text.parse_html(u'<h1>Hey</h1>')
-    >>> text = html_text.extract_text(tree)
-    u'Hey'
+    >>> tree = html_text.parse_html('<h1>Hello</h1> world!')
+    >>> html_text.extract_text(tree)
+    'Hello\n\nworld!'
 
-Passed html will be first cleaned from invisible non-text content such
-as styles, and then text would be extracted.
-Two functions that do it are ``html_text.cleaned_selector`` and
-``html_text.selector_to_text``:
+Or define a selector to extract text only from specific elements:
 
-* ``html_text.cleaned_selector`` accepts html as text or as ``lxml.html.HtmlElement``,
-  and returns cleaned ``parsel.Selector``.
-* ``html_text.selector_to_text`` accepts ``parsel.Selector`` and returns extracted
-  text.
+    >>> import html_text
+    >>> sel = html_text.cleaned_selector('<h1>Hello</h1> world!')
+    >>> subsel = sel.xpath('//h1')
+    >>> html_text.selector_to_text(subsel)
+    'Hello'
 
+The passed html is first cleaned of invisible non-text content such
+as styles, and then the text is extracted.
+NB: selectors are not cleaned automatically; you need to call
+``html_text.cleaned_selector`` first.
+
+Main functions:
+
+* ``html_text.extract_text`` accepts html and returns extracted text.
+* ``html_text.cleaned_selector`` accepts html as text or as
+  ``lxml.html.HtmlElement``, and returns cleaned ``parsel.Selector``.
+* ``html_text.selector_to_text`` accepts ``parsel.Selector`` and returns
+  extracted text.
+
+If ``guess_layout`` is True (default), a newline is added before and after
+``newline_tags``, and two newlines are added before and after
+``double_newline_tags``. This heuristic makes the extracted text
+more similar to how it is rendered in the browser. Default newline and double
+newline tags can be found in ``html_text.NEWLINE_TAGS``
+and ``html_text.DOUBLE_NEWLINE_TAGS``.
+
+It is possible to customize how newlines are added, using ``newline_tags`` and
+``double_newline_tags`` arguments (which are ``html_text.NEWLINE_TAGS`` and
+``html_text.DOUBLE_NEWLINE_TAGS`` by default). For example, don't add a newline
+after ``<div>`` tags:
+
+    >>> newline_tags = html_text.NEWLINE_TAGS - {'div'}
+    >>> html_text.extract_text('<div>Hello</div> world!',
+    ...                        newline_tags=newline_tags)
+    'Hello world!'
 
 Credits
 -------

diff --git a/html_text/__init__.py b/html_text/__init__.py
@@ -1,3 +1,4 @@
 # -*- coding: utf-8 -*-
 
-from .html_text import extract_text, parse_html, cleaned_selector, selector_to_text
+from .html_text import (extract_text, parse_html, cleaned_selector,
+                        selector_to_text, NEWLINE_TAGS, DOUBLE_NEWLINE_TAGS)
diff --git a/html_text/html_text.py b/html_text/html_text.py
@@ -7,6 +7,16 @@
 import parsel
 
 
+NEWLINE_TAGS = frozenset([
+    'article', 'aside', 'br', 'dd', 'details', 'div', 'dt', 'fieldset',
+    'figcaption', 'footer', 'form', 'header', 'hr', 'legend', 'li', 'main',
+    'nav', 'table', 'tr'
+])
+DOUBLE_NEWLINE_TAGS = frozenset([
+    'blockquote', 'dl', 'figure', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ol',
+    'p', 'pre', 'title', 'ul'
+])
+
 _clean_html = Cleaner(
     scripts=True,
     javascript=False,  # onclick attributes are fine
@@ -43,31 +53,105 @@ def parse_html(html):
 
 _whitespace = re.compile(r'\s+')
 _has_trailing_whitespace = re.compile(r'\s$').search
-_has_punct_after = re.compile(r'^[,:;.!?"\)]').search
-_has_punct_before = re.compile(r'\($').search
-
+_has_punct_after = re.compile(r'^[,:;.!?")]').search
+_has_open_bracket_before = re.compile(r'\($').search
 
-def selector_to_text(sel, guess_punct_space=True):
-    """ Convert a cleaned selector to text.
-    See html_text.extract_text docstring for description of the approach and options.
-    """
-    if guess_punct_space:
 
-        def fragments():
-            prev = None
-            for text in sel.xpath('.//text()').extract():
-                if prev is not None and (_has_trailing_whitespace(prev)
-                                         or (not _has_punct_after(text) and
-                                             not _has_punct_before(prev))):
-                    yield ' '
-                yield text
-                prev = text
+def _normalize_whitespace(text):
+    return _whitespace.sub(' ', text.strip())
 
-        return _whitespace.sub(' ', ''.join(fragments()).strip())
 
+def _html_to_text(tree,
+                  guess_punct_space=True,
+                  guess_layout=True,
+                  newline_tags=NEWLINE_TAGS,
+                  double_newline_tags=DOUBLE_NEWLINE_TAGS):
+    """
+    Convert a cleaned html tree to text.
+    See html_text.extract_text docstring for description of the approach
+    and options.
+    """
+    chunks = []
+
+    _NEWLINE = object()
+    _DOUBLE_NEWLINE = object()
+
+    class Context:
+        """ workaround for missing `nonlocal` in Python 2 """
+        # _NEWLINE, _DOUBLE_NEWLINE or content of the previous chunk (str)
+        prev = _DOUBLE_NEWLINE
+
+    def should_add_space(text, prev):
+        """ Return True if extra whitespace should be added before text """
+        if prev in {_NEWLINE, _DOUBLE_NEWLINE}:
+            return False
+        if not _has_trailing_whitespace(prev):
+            if _has_punct_after(text) or _has_open_bracket_before(prev):
+                return False
+        return True
+
+    def get_space_between(text, prev):
+        if not text or not guess_punct_space:
+            return ' '
+        return ' ' if should_add_space(text, prev) else ''
+
+    def add_newlines(tag, context):
+        if not guess_layout:
+            return
+        prev = context.prev
+        if prev is _DOUBLE_NEWLINE:  # don't output more than 1 blank line
+            return
+        if tag in double_newline_tags:
+            context.prev = _DOUBLE_NEWLINE
+            chunks.append('\n' if prev is _NEWLINE else '\n\n')
+        elif tag in newline_tags:
+            context.prev = _NEWLINE
+            if prev is not _NEWLINE:
+                chunks.append('\n')
+
+    def add_text(text_content, context):
+        text = _normalize_whitespace(text_content) if text_content else ''
+        if not text:
+            return
+        space = get_space_between(text, context.prev)
+        chunks.extend([space, text])
+        context.prev = text_content
+
+    def traverse_text_fragments(tree, context, handle_tail=True):
+        """ Extract text from the ``tree``: fill ``chunks`` variable """
+        add_newlines(tree.tag, context)
+        add_text(tree.text, context)
+        for child in tree:
+            traverse_text_fragments(child, context)
+        add_newlines(tree.tag, context)
+        if handle_tail:
+            add_text(tree.tail, context)
+
+    traverse_text_fragments(tree, context=Context(), handle_tail=False)
+    return ''.join(chunks).strip()
+
+
+def selector_to_text(sel, guess_punct_space=True, guess_layout=True):
+    """ Convert a cleaned selector to text.
+    See html_text.extract_text docstring for description of the approach
+    and options.
+    """
+    if isinstance(sel, parsel.SelectorList):
+        # if selecting a specific xpath
+        text = []
+        for s in sel:
+            extracted = _html_to_text(
+                s.root,
+                guess_punct_space=guess_punct_space,
+                guess_layout=guess_layout)
+            if extracted:
+                text.append(extracted)
+        return ' '.join(text)
     else:
-        fragments = (x.strip() for x in sel.xpath('.//text()').extract())
-        return _whitespace.sub(' ', ' '.join(x for x in fragments if x))
+        return _html_to_text(
+            sel.root,
+            guess_punct_space=guess_punct_space,
+            guess_layout=guess_layout)
 
 
 def cleaned_selector(html):
@@ -85,18 +169,40 @@ def cleaned_selector(html):
     return sel
 
 
-def extract_text(html, guess_punct_space=True):
+def extract_text(html,
+                 guess_punct_space=True,
+                 guess_layout=True,
+                 newline_tags=NEWLINE_TAGS,
+                 double_newline_tags=DOUBLE_NEWLINE_TAGS):
     """
     Convert html to text, cleaning invisible content such as styles.
+
     Almost the same as normalize-space xpath, but this also
     adds spaces between inline elements (like <span>) which are
-    often used as block elements in html markup.
+    often used as block elements in html markup, and adds appropriate
+    newlines to make output better formatted.
+
+    html should be a unicode string or an already parsed lxml.html element.
 
     When guess_punct_space is True (default), no extra whitespace is added
     for punctuation. This has a slight (around 10%) performance overhead
     and is just a heuristic.
 
-    html should be a unicode string or an already parsed lxml.html element.
+    When guess_layout is True (default), a newline is added
+    before and after ``newline_tags`` and two newlines are added before
+    and after ``double_newline_tags``. This heuristic makes the extracted
+    text more similar to how it is rendered in the browser.
+
+    Default newline and double newline tags can be found in
+    `html_text.NEWLINE_TAGS` and `html_text.DOUBLE_NEWLINE_TAGS`.
     """
-    sel = cleaned_selector(html)
-    return selector_to_text(sel, guess_punct_space=guess_punct_space)
+    if html is None or len(html) == 0:
+        return ''
+    cleaned = _cleaned_html_tree(html)
+    return _html_to_text(
+        cleaned,
+        guess_punct_space=guess_punct_space,
+        guess_layout=guess_layout,
+        newline_tags=newline_tags,
+        double_newline_tags=double_newline_tags,
+    )