Add guess page layout #9
Changes from 38 commits
@@ -26,9 +26,10 @@ or ``.get_text()`` from Beautiful Soup?
 Text extracted with ``html_text`` does not contain inline styles,
 javascript, comments and other text that is not normally visible to the users.
 It normalizes whitespace, but is also smarter than ``.xpath('normalize-space()')``,
-adding spaces around inline elements too
-(which are often used as block elements in html markup),
-and tries to avoid adding extra spaces for punctuation.
+adding spaces around inline elements (which are often used as block
+elements in html markup), tries to avoid adding extra spaces for punctuation and
+can add newlines so that the output text looks like how it is rendered in
+browsers.

 Apart from just getting text from the page (e.g. for display or search),
 one intended usage of this library is for machine learning (feature extraction).
@@ -56,26 +57,59 @@ Usage
 Extract text from HTML::

     >>> import html_text
-    >>> text = html_text.extract_text(u'<h1>Hey</h1>')
-    u'Hey'
+    >>> text = html_text.extract_text(u'<h1>Hello</h1> world!')
+    u'Hello world!'
+
+    >>> text = html_text.extract_text(u'<h1>Hello</h1> world!', guess_page_layout=True)
+    u'Hello
+    world!'

 You can also pass already parsed ``lxml.html.HtmlElement``:

     >>> import html_text
-    >>> tree = html_text.parse_html(u'<h1>Hey</h1>')
+    >>> tree = html_text.parse_html(u'<h1>Hello</h1> world!')
     >>> text = html_text.extract_text(tree)
-    u'Hey'
+    u'Hello world!'
+
+Or define a selector to extract text only from specific elements:
+
+    >>> import html_text
+    >>> sel = html_text.cleaned_selector(u'<h1>Hello</h1> world!')
+    >>> subsel = sel.xpath('//h1')
+    >>> text = html_text.selector_to_text(subsel)
+    u'Hello'

 Passed html will be first cleaned from invisible non-text content such
 as styles, and then text would be extracted.
-Two functions that do it are ``html_text.cleaned_selector`` and
+
+NB Selectors are not cleaned automatically, you need to call
+``html_text.cleaned_selector`` first.
+
+The main functions are ``html_text.extract_text``, ``html_text.cleaned_selector`` and
 ``html_text.selector_to_text``:

+* ``html_text.extract_text`` accepts html and returns extracted text.
 * ``html_text.cleaned_selector`` accepts html as text or as ``lxml.html.HtmlElement``,
   and returns cleaned ``parsel.Selector``.
 * ``html_text.selector_to_text`` accepts ``parsel.Selector`` and returns extracted
   text.
+
+If ``guess_page_layout`` is True (False by default for backward compatibility),
+a newline is added before and after NEWLINE_TAGS and two newlines are added
+before and after DOUBLE_NEWLINE_TAGS. This heuristic makes the extracted text
+more similar to how it is rendered in the browser.
+NEWLINE_TAGS and DOUBLE_NEWLINE_TAGS can be customized, here are the lists of
+the tags that are handled by default:
+
+* NEWLINE_TAGS = frozenset([
+      'article', 'aside', 'br', 'dd', 'details', 'div', 'dt', 'fieldset',
+      'figcaption', 'footer', 'form', 'header', 'hr', 'legend', 'li', 'main',
+      'nav', 'table', 'tr'
+  ])
+* DOUBLE_NEWLINE_TAGS = frozenset([
+      'blockquote', 'dl', 'figure', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ol',
+      'p', 'pre', 'title', 'ul'
+  ])
Review comment: I think it makes sense to just say the constants are ``html_text.NEWLINE_TAGS`` and ``html_text.DOUBLE_NEWLINE_TAGS`` (and maybe expose them at the top level) - copy-pasting these lists here requires maintenance, it is easy to forget to update the README when making a code change.
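To make the layout heuristic concrete, here is a minimal console-style sketch. The call signature is the one added in this PR; the input HTML and the exact expected output are illustrative assumptions based on the description above, not doctests taken from the repository:

    >>> import html_text
    >>> html_text.extract_text(
    ...     u'<h1>Title</h1><p>First paragraph.</p><p>Second paragraph.</p>',
    ...     guess_page_layout=True)
    u'Title\n\nFirst paragraph.\n\nSecond paragraph.'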

 Credits
 -------

@@ -1,3 +1,4 @@
 # -*- coding: utf-8 -*-

-from .html_text import extract_text, parse_html, cleaned_selector, selector_to_text
+from .html_text import (extract_text, parse_html, cleaned_selector,
+                        selector_to_text)
@@ -6,6 +6,15 @@
 from lxml.html.clean import Cleaner
 import parsel

+NEWLINE_TAGS = frozenset([
+    'article', 'aside', 'br', 'dd', 'details', 'div', 'dt', 'fieldset',
+    'figcaption', 'footer', 'form', 'header', 'hr', 'legend', 'li', 'main',
+    'nav', 'table', 'tr'
+])
+DOUBLE_NEWLINE_TAGS = frozenset([
+    'blockquote', 'dl', 'figure', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ol',
+    'p', 'pre', 'title', 'ul'
+])

 _clean_html = Cleaner(
     scripts=True,
@@ -44,30 +53,108 @@ def parse_html(html):
 _whitespace = re.compile(r'\s+')
 _has_trailing_whitespace = re.compile(r'\s$').search
 _has_punct_after = re.compile(r'^[,:;.!?"\)]').search
-_has_punct_before = re.compile(r'\($').search
+_has_open_bracket_before = re.compile(r'\($').search


-def selector_to_text(sel, guess_punct_space=True):
Review discussion:
- This function was useful - the main use case is to extract text from a part of a web page, finding this part using Scrapy or parsel.
- You can get a parsed tree for a selector using …
- Ok so you want to create a selector in …
- if …
- @Kebniss no, extract_text doesn't need to use Selector, it is an additional overhead. The idea is to be backwards compatible and provide the same feature for Selector; internally it can work the other way around - likely selector_to_text should pass sel.root to html_to_text.
- Can't we have an …
- Alternatively, we can just not extract text from the element tail by default, on the top level (i.e. children should have tail text processed as usual). In a common case (root …)
- You are right, tail text is outside the selected nodes and as such it should not be extracted. Not extracting it by default seems reasonable. I will add the root object as an argument to check when the recursion call is processing it.
- Why do you need a root object, can't it just be a boolean flag?
- yy, the root node is unnecessary. I added a depth argument so that we know when the recursion is back to the root and does not extract tail there.
-    """ Convert a cleaned selector to text.
-    See html_text.extract_text docstring for description of the approach and options.
+def _html_to_text(tree,
+                  guess_punct_space=True,
+                  guess_page_layout=False,
+                  newline_tags=NEWLINE_TAGS,
+                  double_newline_tags=DOUBLE_NEWLINE_TAGS):
+    """
+    Convert a cleaned html tree to text.
+    See html_text.extract_text docstring for description of the approach
+    and options.
     """
-    if guess_punct_space:
-
-        def fragments():
-            prev = None
-            for text in sel.xpath('.//text()').extract():
-                if prev is not None and (_has_trailing_whitespace(prev)
-                                         or (not _has_punct_after(text) and
-                                             not _has_punct_before(prev))):
-                    yield ' '
-                yield text
-                prev = text
-
-        return _whitespace.sub(' ', ''.join(fragments()).strip())
-
+    def add_space(text, prev):
+        if prev is None:
+            return False
+        if prev == '\n' or prev == '\n\n':
+            return False
+        if not _has_trailing_whitespace(prev):
+            if _has_punct_after(text) or _has_open_bracket_before(prev):
+                return False
+        return True
+
+    def add_newline(tag, prev):
+        if prev is None or prev == '\n\n':
+            return '', '\n\n'
+        if tag in double_newline_tags:
+            if prev == '\n':
+                return '\n', '\n\n'
+            return '\n\n', '\n\n'
+        if tag in newline_tags:
+            if prev == '\n':
+                return '', prev
+            return '\n', '\n'
+        return '', prev
+
+    def traverse_text_fragments(tree, prev, depth):
+        space = ' '
+        newline = ''
+        text = ''
+        if guess_page_layout:
+            newline, prev[0] = add_newline(tree.tag, prev[0])
+        if tree.text:
+            text = _whitespace.sub(' ', tree.text.strip())
+            if text and guess_punct_space and not add_space(text, prev[0]):
+                space = ''
+        if text:
+            yield [newline, space, text]
+            prev[0] = tree.text
+            space = ' '
+            newline = ''
+        elif newline:
+            yield [newline]
+            newline = ''
+
+        for child in tree:
+            for t in traverse_text_fragments(child, prev, depth + 1):
+                yield t
+
+        if guess_page_layout:
+            newline, prev[0] = add_newline(tree.tag, prev[0])
+
+        tail = ''
+        if tree.tail and depth != 0:
+            tail = _whitespace.sub(' ', tree.tail.strip())
+            if tail:
+                if guess_punct_space and not add_space(tail, prev[0]):
+                    space = ''
+        if tail:
+            yield [newline, space, tail]
+            prev[0] = tree.tail
+        elif newline:
+            yield [newline]

+    text = []
+    for fragment in traverse_text_fragments(tree, [None], 0):
+        text.extend(fragment)
+    return ''.join(text).strip()
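For orientation, ``add_newline`` returns the newline string to emit plus the updated ``prev`` marker, and ``add_space`` with ``guess_punct_space`` keeps punctuation attached to the preceding fragment. A minimal sketch of the expected behaviour, assuming the logic above; the outputs are my reading of the code, not doctests from the PR:

    # Expected add_newline behaviour (first value is emitted, second becomes prev):
    #   add_newline('p', None)      -> ('', '\n\n')    no newline at the start of the output
    #   add_newline('p', '\n')      -> ('\n', '\n\n')  upgrade a single newline to a blank line
    #   add_newline('div', 'text')  -> ('\n', '\n')    single-newline tag after text
    #   add_newline('span', 'text') -> ('', 'text')    inline tag: nothing added, prev unchanged

    >>> import html_text
    >>> html_text.extract_text(u'<a>Hello</a>, world!')
    u'Hello, world!'
    >>> html_text.extract_text(u'<a>Hello</a>, world!', guess_punct_space=False)
    u'Hello , world!'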


+def selector_to_text(sel, guess_punct_space=True, guess_page_layout=False):
+    """ Convert a cleaned selector to text.
+    See html_text.extract_text docstring for description of the approach
+    and options.
+    """
+    if isinstance(sel, list):
+        # if selecting a specific xpath
+        text = []
+        for t in sel:
+            extracted = _html_to_text(
+                t.root,
+                guess_punct_space=guess_punct_space,
+                guess_page_layout=guess_page_layout)
+            if extracted:
+                text.append(extracted)
+        return ' '.join(text)
Review discussion:
- This is to have it work as the previous implementation, however I think it would make more sense to have it return a list of the text extracted by each selector. This way the user can decide whether and how to join it. Maybe they need all text as separate entities and that's why they want to select specific elements.
- I agree it is not clear which behavior is better; let's keep the current one, but have this problem in mind.
     else:
-        fragments = (x.strip() for x in sel.xpath('.//text()').extract())
-        return _whitespace.sub(' ', ' '.join(x for x in fragments if x))
+        return _html_to_text(
+            sel.root,
+            guess_punct_space=guess_punct_space,
+            guess_page_layout=guess_page_layout)
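A minimal sketch of the list branch above: ``parsel``'s ``SelectorList`` is a ``list`` subclass, so the result of ``.xpath()`` on a cleaned selector takes that path and the per-element results are joined with spaces. The expected output is my reading of the code, not a doctest from the PR:

    >>> import html_text
    >>> sel = html_text.cleaned_selector(u'<p>one</p><p>two</p>')
    >>> html_text.selector_to_text(sel.xpath('//p'))
    u'one two'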


 def cleaned_selector(html):
@@ -76,16 +163,18 @@ def cleaned_selector(html):
     try:
         tree = _cleaned_html_tree(html)
         sel = parsel.Selector(root=tree, type='html')
-    except (lxml.etree.XMLSyntaxError,
-            lxml.etree.ParseError,
-            lxml.etree.ParserError,
-            UnicodeEncodeError):
+    except (lxml.etree.XMLSyntaxError, lxml.etree.ParseError,
+            lxml.etree.ParserError, UnicodeEncodeError):
         # likely plain text
         sel = parsel.Selector(html)
     return sel


-def extract_text(html, guess_punct_space=True):
+def extract_text(html,
+                 guess_punct_space=True,
+                 guess_page_layout=False,
+                 newline_tags=NEWLINE_TAGS,
+                 double_newline_tags=DOUBLE_NEWLINE_TAGS):
     """
     Convert html to text, cleaning invisible content such as styles.
     Almost the same as normalize-space xpath, but this also
@@ -96,7 +185,22 @@ def extract_text(html, guess_punct_space=True):
     for punctuation. This has a slight (around 10%) performance overhead
     and is just a heuristic.

+    When guess_page_layout is True (default is False), a newline is added
+    before and after NEWLINE_TAGS and two newlines are added before and after
Review discussion:
- I think it is more precise to say ``html_text.extract_text(html, guess_page_layout=True, newline_tags=html_text.NEWLINE_TAGS | {'div'})`` ^^ maybe we should even provide this example somewhere, adding …
- I will add an example in the readme, but div is already included in NEWLINE_TAGS. I will use a different tag for clarity :)
- Or do you want me to add a test?
- Hm, that's not a bad idea - let's do both.
+    DOUBLE_NEWLINE_TAGS. This heuristic makes the extracted text more similar
+    to how it is rendered in the browser.
+
+    NEWLINE_TAGS and DOUBLE_NEWLINE_TAGS can be customized.
+
+    html should be a unicode string or an already parsed lxml.html element.
Review comment: The guess_page_layout argument should be documented.
     """
-    sel = cleaned_selector(html)
-    return selector_to_text(sel, guess_punct_space=guess_punct_space)
+    if html is None or len(html) == 0:
+        return ''
+    cleaned = _cleaned_html_tree(html)
+    return _html_to_text(
+        cleaned,
+        guess_punct_space=guess_punct_space,
+        guess_page_layout=guess_page_layout,
+        newline_tags=newline_tags,
+        double_newline_tags=double_newline_tags,
+    )
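Following the reviewer's suggestion about customizing the tag sets, here is a minimal sketch of extending the defaults. In this PR the constants live in the ``html_text.html_text`` module and are not yet re-exported at the package top level, so the import path below reflects that; the expected outputs are illustrative assumptions based on the heuristic, not doctests from the repository:

    >>> import html_text
    >>> from html_text.html_text import NEWLINE_TAGS
    >>> html = u'<span>one</span> <span>two</span>'
    >>> html_text.extract_text(html, guess_page_layout=True)
    u'one two'
    >>> html_text.extract_text(html, guess_page_layout=True,
    ...                        newline_tags=NEWLINE_TAGS | {'span'})
    u'one\ntwo'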
Review comment: These examples look off (actually, the previous example was not good as well). I think it should look like this if you try it in a Python console: …

Review comment: The examples below have the same issue: it shouldn't be ``text = html_text....``, just ``html_text....``, otherwise there is no output.