-
Notifications
You must be signed in to change notification settings - Fork 23
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add guess page layout #9
Changes from all commits
0ae6d24
566dc9b
587e9a7
6c9d27e
c22f3fa
8a78fc5
a783e31
cb8dc1c
fb599bc
90e37b7
ae26d29
bb33d4b
3069a73
dd03201
0f2fb2b
e8da507
0b9d139
ba7cdc0
952d895
9dafbf0
b3229d6
695b458
03259b9
cba531f
9811349
d47138c
76f9028
05c7702
4505e24
a27e4c8
b926c8c
73f49ad
15d22e0
8f68b2c
cf02b94
7aec8d2
7653bf9
4300fe6
4772061
05b979a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
from .html_text import extract_text, parse_html, cleaned_selector, selector_to_text | ||
from .html_text import (extract_text, parse_html, cleaned_selector, | ||
selector_to_text, NEWLINE_TAGS, DOUBLE_NEWLINE_TAGS) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,6 +6,15 @@ | |
from lxml.html.clean import Cleaner | ||
import parsel | ||
|
||
NEWLINE_TAGS = frozenset([ | ||
'article', 'aside', 'br', 'dd', 'details', 'div', 'dt', 'fieldset', | ||
'figcaption', 'footer', 'form', 'header', 'hr', 'legend', 'li', 'main', | ||
'nav', 'table', 'tr' | ||
]) | ||
DOUBLE_NEWLINE_TAGS = frozenset([ | ||
'blockquote', 'dl', 'figure', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ol', | ||
'p', 'pre', 'title', 'ul' | ||
]) | ||
|
||
_clean_html = Cleaner( | ||
scripts=True, | ||
|
@@ -44,30 +53,108 @@ def parse_html(html): | |
_whitespace = re.compile(r'\s+') | ||
_has_trailing_whitespace = re.compile(r'\s$').search | ||
_has_punct_after = re.compile(r'^[,:;.!?"\)]').search | ||
_has_punct_before = re.compile(r'\($').search | ||
_has_open_bracket_before = re.compile(r'\($').search | ||
|
||
|
||
def selector_to_text(sel, guess_punct_space=True): | ||
""" Convert a cleaned selector to text. | ||
See html_text.extract_text docstring for description of the approach and options. | ||
def _html_to_text(tree, | ||
guess_punct_space=True, | ||
guess_page_layout=False, | ||
newline_tags=NEWLINE_TAGS, | ||
double_newline_tags=DOUBLE_NEWLINE_TAGS): | ||
""" | ||
Convert a cleaned html tree to text. | ||
See html_text.extract_text docstring for description of the approach | ||
and options. | ||
""" | ||
if guess_punct_space: | ||
|
||
def fragments(): | ||
prev = None | ||
for text in sel.xpath('.//text()').extract(): | ||
if prev is not None and (_has_trailing_whitespace(prev) | ||
or (not _has_punct_after(text) and | ||
not _has_punct_before(prev))): | ||
yield ' ' | ||
yield text | ||
prev = text | ||
|
||
return _whitespace.sub(' ', ''.join(fragments()).strip()) | ||
|
||
def add_space(text, prev): | ||
if prev is None: | ||
return False | ||
if prev == '\n' or prev == '\n\n': | ||
return False | ||
if not _has_trailing_whitespace(prev): | ||
if _has_punct_after(text) or _has_open_bracket_before(prev): | ||
return False | ||
return True | ||
|
||
def add_newline(tag, prev): | ||
if prev is None or prev == '\n\n': | ||
return '', '\n\n' | ||
if tag in double_newline_tags: | ||
if prev == '\n': | ||
return '\n', '\n\n' | ||
return '\n\n', '\n\n' | ||
if tag in newline_tags: | ||
if prev == '\n': | ||
return '', prev | ||
return '\n', '\n' | ||
return '', prev | ||
|
||
def traverse_text_fragments(tree, prev, depth): | ||
space = ' ' | ||
newline = '' | ||
text = '' | ||
if guess_page_layout: | ||
newline, prev[0] = add_newline(tree.tag, prev[0]) | ||
if tree.text: | ||
text = _whitespace.sub(' ', tree.text.strip()) | ||
if text and guess_punct_space and not add_space(text, prev[0]): | ||
space = '' | ||
if text: | ||
yield [newline, space, text] | ||
prev[0] = tree.text | ||
space = ' ' | ||
newline = '' | ||
elif newline: | ||
yield [newline] | ||
newline = '' | ||
|
||
for child in tree: | ||
for t in traverse_text_fragments(child, prev, depth + 1): | ||
yield t | ||
|
||
if guess_page_layout: | ||
newline, prev[0] = add_newline(tree.tag, prev[0]) | ||
|
||
tail = '' | ||
if tree.tail and depth != 0: | ||
tail = _whitespace.sub(' ', tree.tail.strip()) | ||
if tail: | ||
if guess_punct_space and not add_space(tail, prev[0]): | ||
space = '' | ||
if tail: | ||
yield [newline, space, tail] | ||
prev[0] = tree.tail | ||
elif newline: | ||
yield [newline] | ||
|
||
text = [] | ||
for fragment in traverse_text_fragments(tree, [None], 0): | ||
text.extend(fragment) | ||
return ''.join(text).strip() | ||
|
||
|
||
def selector_to_text(sel, guess_punct_space=True, guess_page_layout=False): | ||
""" Convert a cleaned selector to text. | ||
See html_text.extract_text docstring for description of the approach | ||
and options. | ||
""" | ||
if isinstance(sel, list): | ||
# if selecting a specific xpath | ||
text = [] | ||
for t in sel: | ||
extracted = _html_to_text( | ||
t.root, | ||
guess_punct_space=guess_punct_space, | ||
guess_page_layout=guess_page_layout) | ||
if extracted: | ||
text.append(extracted) | ||
return ' '.join(text) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is to have it work as the previous implementation however I think it would make more sense to have it return a list of the text extracted by each selector. This way the user can decide whether and how to join it. Maybe they need all text as separate entities and that's why they want to select specific elements. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree it is not clear which behavior is better; let's keep the current one, but have this problem in mind. |
||
else: | ||
fragments = (x.strip() for x in sel.xpath('.//text()').extract()) | ||
return _whitespace.sub(' ', ' '.join(x for x in fragments if x)) | ||
return _html_to_text( | ||
sel.root, | ||
guess_punct_space=guess_punct_space, | ||
guess_page_layout=guess_page_layout) | ||
|
||
|
||
def cleaned_selector(html): | ||
|
@@ -76,16 +163,18 @@ def cleaned_selector(html): | |
try: | ||
tree = _cleaned_html_tree(html) | ||
sel = parsel.Selector(root=tree, type='html') | ||
except (lxml.etree.XMLSyntaxError, | ||
lxml.etree.ParseError, | ||
lxml.etree.ParserError, | ||
UnicodeEncodeError): | ||
except (lxml.etree.XMLSyntaxError, lxml.etree.ParseError, | ||
lxml.etree.ParserError, UnicodeEncodeError): | ||
kmike marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# likely plain text | ||
sel = parsel.Selector(html) | ||
return sel | ||
|
||
|
||
def extract_text(html, guess_punct_space=True): | ||
def extract_text(html, | ||
guess_punct_space=True, | ||
guess_page_layout=False, | ||
newline_tags=NEWLINE_TAGS, | ||
double_newline_tags=DOUBLE_NEWLINE_TAGS): | ||
""" | ||
Convert html to text, cleaning invisible content such as styles. | ||
Almost the same as normalize-space xpath, but this also | ||
|
@@ -96,7 +185,23 @@ def extract_text(html, guess_punct_space=True): | |
for punctuation. This has a slight (around 10%) performance overhead | ||
and is just a heuristic. | ||
|
||
When guess_page_layout is True (default is False), a newline is added | ||
before and after newline_tags and two newlines are added before and after | ||
double_newline_tags. This heuristic makes the extracted text more similar | ||
to how it is rendered in the browser. | ||
|
||
NEWLINE_TAGS and DOUBLE_NEWLINE_TAGS can be extended, check readme for | ||
kmike marked this conversation as resolved.
Show resolved
Hide resolved
|
||
an example of how to do it. | ||
|
||
html should be a unicode string or an already parsed lxml.html element. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. guess_page_layout argument should be documented. |
||
""" | ||
sel = cleaned_selector(html) | ||
return selector_to_text(sel, guess_punct_space=guess_punct_space) | ||
if html is None or len(html) == 0: | ||
return '' | ||
cleaned = _cleaned_html_tree(html) | ||
return _html_to_text( | ||
cleaned, | ||
guess_punct_space=guess_punct_space, | ||
guess_page_layout=guess_page_layout, | ||
newline_tags=newline_tags, | ||
double_newline_tags=double_newline_tags, | ||
) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This function was useful - the main use case is to extract text from a part of a web page, finding this part using Scrapy or parsel.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You can get a parsed tree for a selector using
sel.root
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ok so you want to create a selector in
extract_text(html)
and then applytraverse_text_fragments
onsel.root
, correct?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
if
selector_to_text
is supported,cleaned_selector
is also nice to haveThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@Kebniss no, extract_text doesn't need to use Selector, it is an additional overhead. The idea is to be backwards compatible and provide the same feature for Selector; internally it can work the other way around - likely selector_to_text should pass sel.root to html_to_text.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can't we have a
tail_text
argument in html_to_text? When it is False, tail extraction is skipped, but only on the top level (i.e. text is still extracted from children's tails). It can be False by default - I don't see why anyone would want to extract text from the tail of the root element.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Alternatively, we can just not extract text from element tail by default, on the top level (i.e. children should have tail text processed as usual).
In a common case (root
<html>
element) there shouldn't be any text in the tail. And when user passes another element explicitly, extracting text from element tail is likely undesirable - it is the same issues as with Selectors.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You are right — tail text is outside the selected nodes and as such it should not be extracted. Not extracting it by default seems reasonable. I will add the root object as an argument so we can check when the recursive call is processing it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why do you need a root object, can't is just be a boolean flag
process_tail
in some internal function?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, the root node is unnecessary. I added a depth argument so that we know when the recursion is back at the root and does not extract the tail there.