update readme and add newline personalization tests

TeamHG-Memex · Sep 20, 2018 · 4772061 · 4772061
1 parent 4300fe6
commit 4772061
Show file tree

Hide file tree

Showing 3 changed files with 39 additions and 23 deletions.
diff --git a/README.rst b/README.rst
@@ -57,26 +57,35 @@ Usage
 Extract text from HTML::
 
     >>> import html_text
-    >>> text = html_text.extract_text(u'<h1>Hello</h1> world!')
+    >>> html_text.extract_text(u'<h1>Hello</h1> world!')
     u'Hello world!'
 
-    >>> text = html_text.extract_text(u'<h1>Hello</h1> world!', guess_page_layout=True)
-    u'Hello
-    world!'
+    >>> html_text.extract_text(u'<h1>Hello</h1> world!', guess_page_layout=True)
+    'Hello\n\nworld!'
+
+It is possible to customize which tags produce newlines by passing extended
+copies of ``html_text.NEWLINE_TAGS`` and ``html_text.DOUBLE_NEWLINE_TAGS``::
+
+    >>> html_text.extract_text(
+    ...     u'<a>Hello</a> world!',
+    ...     guess_page_layout=True,
+    ...     newline_tags=html_text.NEWLINE_TAGS | {'a'})
+    'Hello\nworld!'
+
 
 You can also pass already parsed ``lxml.html.HtmlElement``:
 
     >>> import html_text
     >>> tree = html_text.parse_html(u'<h1>Hello</h1> world!')
-    >>> text = html_text.extract_text(tree)
+    >>> html_text.extract_text(tree)
     u'Hello world!'
 
 Or define a selector to extract text only from specific elements:
 
     >>> import html_text
     >>> sel = html_text.cleaned_selector(u'<h1>Hello</h1> world!')
     >>> subsel = sel.xpath('//h1')
-    >>> text = html_text.selector_to_text(subsel)
+    >>> html_text.selector_to_text(subsel)
     u'Hello'
 
 Passed html will be first cleaned from invisible non-text content such
@@ -94,21 +103,10 @@ The main functions are ``html_text.extract_text``, ``html_text.cleaned_selector`
   text.
 
 If ``guess_page_layout`` is True (False by default for backward compatibility),
-a newline is added before and after NEWLINE_TAGS and two newlines are added
-before and after DOUBLE_NEWLINE_TAGS. This heuristic makes the extracted text
-more similar to how it is rendered in the browser.
-NEWLINE_TAGS and DOUBLE_NEWLINE_TAGS can be customized, here are the lists of
-the tags that are handled by default:
-
-* NEWLINE_TAGS = frozenset([
-    'article', 'aside', 'br', 'dd', 'details', 'div', 'dt', 'fieldset',
-    'figcaption', 'footer', 'form', 'header', 'hr', 'legend', 'li', 'main',
-    'nav', 'table', 'tr'
-])
-* DOUBLE_NEWLINE_TAGS = frozenset([
-    'blockquote', 'dl', 'figure', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ol',
-    'p', 'pre', 'title', 'ul'
-])
+a newline is added before and after tags in ``newline_tags`` and two newlines
+are added before and after tags in ``double_newline_tags``. This heuristic makes
+the extracted text more similar to how it is rendered in the browser. The default
+tag sets are ``html_text.NEWLINE_TAGS`` and ``html_text.DOUBLE_NEWLINE_TAGS``.
 
 
 Credits

diff --git a/html_text/__init__.py b/html_text/__init__.py
@@ -1,4 +1,4 @@
 # -*- coding: utf-8 -*-
 
 from .html_text import (extract_text, parse_html, cleaned_selector,
-                        selector_to_text)
+                        selector_to_text, NEWLINE_TAGS, DOUBLE_NEWLINE_TAGS)
diff --git a/tests/test_html_text.py b/tests/test_html_text.py
@@ -4,7 +4,7 @@
 import glob
 
 from html_text import (extract_text, parse_html, cleaned_selector,
-                       selector_to_text)
+                       selector_to_text, NEWLINE_TAGS, DOUBLE_NEWLINE_TAGS)
 
 
 @pytest.fixture(params=[{
@@ -127,6 +127,24 @@ def test_adjust_newline():
                          guess_page_layout=True) == ('text 1\n\ntext 2'))
 
 
+def test_personalize_newlines_sets():
+    html = (u'<span><span>text<a>more</a>'
+            '</span>and more text <a> and some more</a> <a></a> </span>')
+    assert (extract_text(
+        html,
+        guess_punct_space=True,
+        guess_page_layout=True,
+        newline_tags=NEWLINE_TAGS | {'a'}
+        ) == 'text\nmore\nand more text\nand some more')
+
+    assert (extract_text(
+        html,
+        guess_punct_space=True,
+        guess_page_layout=True,
+        double_newline_tags=DOUBLE_NEWLINE_TAGS | {'a'}
+        ) == 'text\n\nmore\n\nand more text\n\nand some more')
+
+
 def test_webpages():
     webpages = sorted(glob.glob('./test_webpages/*.html'))
     extracted = sorted(glob.glob('./test_webpages/*.txt'))