diff --git a/.flake8 b/.flake8
index 7e5efc6..cffc2c6 100644
--- a/.flake8
+++ b/.flake8
@@ -1,5 +1,6 @@
[flake8]
-ignore = E203,W503
+max-line-length = 88
+ignore = E203,W503,E701
per-file-ignores =
docs/conftest.py:E501
parsel/csstranslator.py:E501
diff --git a/.gitignore b/.gitignore
index 9a1e3c0..20dec10 100644
--- a/.gitignore
+++ b/.gitignore
@@ -29,6 +29,7 @@ pip-log.txt
nosetests.xml
htmlcov
.pytest_cache
+coverage.xml
# Translations
*.mo
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index db43480..42a15fc 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -4,14 +4,14 @@ repos:
hooks:
- id: bandit
args: [-r, -c, .bandit.yml]
-- repo: https://github.com/PyCQA/flake8
- rev: 7.0.0
- hooks:
- - id: flake8
- repo: https://github.com/psf/black.git
rev: 24.2.0
hooks:
- id: black
+- repo: https://github.com/PyCQA/flake8
+ rev: 7.0.0
+ hooks:
+ - id: flake8
- repo: https://github.com/pycqa/isort
rev: 5.13.2
hooks:
diff --git a/docs/usage.rst b/docs/usage.rst
index e3eb91f..0b97d8b 100644
--- a/docs/usage.rst
+++ b/docs/usage.rst
@@ -140,6 +140,19 @@ pseudo-elements::
>>> selector.css('title::text').get()
'Example website'
+To extract all text of one or more elements and all their child elements,
+formatted as plain text taking into account HTML tags (e.g. ``<br>`` is
+translated as a line break), set ``text=True`` in your call to
+:meth:`~parsel.selector.Selector.get` or
+:meth:`~parsel.selector.Selector.getall` instead of including
+``::text`` (CSS) or ``/text()`` (XPath) in your query::
+
+ >>> selector.css('#images').get(text=True)
+ 'Name: My image 1\nName: My image 2\nName: My image 3\nName: My image 4\nName: My image 5'
+
+See :meth:`Selector.get` for additional parameters that you can use to change
+how the extracted plain text is formatted.
+
As you can see, ``.xpath()`` and ``.css()`` methods return a
:class:`~parsel.selector.SelectorList` instance, which is a list of new
selectors. This API can be used for quickly selecting nested data::
diff --git a/parsel/selector.py b/parsel/selector.py
index 2027599..104db9d 100644
--- a/parsel/selector.py
+++ b/parsel/selector.py
@@ -22,8 +22,10 @@
)
from warnings import warn
+import html_text # type: ignore[import-untyped]
import jmespath
from lxml import etree, html
+from lxml.html.clean import Cleaner # pylint: disable=no-name-in-module
from packaging.version import Version
from .csstranslator import GenericTranslator, HTMLTranslator
@@ -245,30 +247,68 @@ def re_first(
return typing.cast(str, el)
return default
- def getall(self) -> List[str]:
+ def getall(
+ self,
+ *,
+ text: bool = False,
+ cleaner: Union[str, None, Cleaner] = "auto",
+ guess_punct_space: bool = True,
+ guess_layout: bool = True,
+ ) -> List[str]:
"""
Call the ``.get()`` method for each element is this list and return
their results flattened, as a list of strings.
- """
- return [x.get() for x in self]
-
- extract = getall
- @typing.overload
- def get(self, default: None = None) -> Optional[str]:
- pass
+ ``text``, ``cleaner``, ``guess_punct_space`` and ``guess_layout``
+ options are passed to :meth:`~.Selector.get`; see
+ :meth:`~.Selector.get` for more details.
+
+ .. note::
+
+ When either text extraction or cleaning is requested, they're
+ performed on each element in the list individually. So, if you match
+ nested elements (i.e. both parent and descendant), cleaning or
+ text extraction could be run multiple times on the same part
+ of the tree. For example, ``selector.xpath("*").getall(text=True)``
+ has O(N^2) complexity regarding the number of nodes in the tree,
+ not O(N).
+ """
+ return [
+ x.get(
+ text=text,
+ cleaner=cleaner,
+ guess_punct_space=guess_punct_space,
+ guess_layout=guess_layout,
+ )
+ for x in self
+ ]
- @typing.overload
- def get(self, default: str) -> str:
- pass
+ extract = getall
- def get(self, default: Optional[str] = None) -> Any:
+ def get(
+ self,
+ default: Optional[str] = None,
+ *,
+ text: bool = False,
+ cleaner: Union[str, None, Cleaner] = "auto",
+ guess_punct_space: bool = True,
+ guess_layout: bool = True,
+ ) -> Any:
"""
Return the result of ``.get()`` for the first element in this list.
- If the list is empty, return the default value.
+ If the list is empty, return the ``default`` value.
+
+ ``text``, ``cleaner``, ``guess_punct_space`` and ``guess_layout``
+ options are passed to :meth:`Selector.get`; see :meth:`~.Selector.get`
+ for more details.
"""
for x in self:
- return x.get()
+ return x.get(
+ text=text,
+ cleaner=cleaner,
+ guess_punct_space=guess_punct_space,
+ guess_layout=guess_layout,
+ )
return default
extract_first = get
@@ -439,6 +479,8 @@ class Selector:
}
_lxml_smart_strings = False
selectorlist_cls = SelectorList["Selector"]
+ _text_cleaner = html_text.cleaner
+ _html_cleaner = Cleaner()
def __init__(
self,
@@ -715,40 +757,110 @@ def re_first(
default,
)
- def get(self) -> Any:
+ def get(
+ self,
+ *,
+ text: bool = False,
+ cleaner: Union[str, None, Cleaner] = "auto",
+ guess_punct_space: bool = True,
+ guess_layout: bool = True,
+ ) -> Any:
"""
Serialize and return the matched nodes.
For HTML and XML, the result is always a string, and percent-encoded
content is unquoted.
+
+ When ``text`` is False (default), HTML or XML is extracted. Pass
+ ``text=True`` to extract text content (html-text library is used).
+ Text extraction algorithm assumes that the document is an HTML
+ document, and uses HTML-specific rules.
+
+ ``cleaner`` argument allows cleaning HTML before extracting the
+ content. Allowed values:
+
+ * "auto" (default) - don't clean when text=False, clean with
+ options tuned for text extraction when text=True;
+ * "text" - clean with options tuned for text extraction: elements
+ like ``<script>`` and ``<style>`` are removed, same as when
+ ``cleaner`` is "auto" and ``text=True``;
+ * "html" - clean with the default ``Cleaner`` options;
+ * None - don't clean, even when ``text=True``.
+
+ ``guess_punct_space`` and ``guess_layout`` are only used when
+ ``text=True``. With ``guess_layout=True`` (default) the extracted
+ text follows the rendered layout: e.g. ``<p>hello</p><p>hello</p>``
+ becomes ``"hello\n\nhello"`` rather than ``"hello hello"``.
+ """
diff --git a/tests/test_selector.py b/tests/test_selector.py
--- a/tests/test_selector.py
+++ b/tests/test_selector.py
+def test_guess_punct_space() -> None:
+    sel = Selector('<span>hello</span>"Folks"')
+    assert sel.get(text=True, guess_punct_space=False) == 'hello "Folks"'
+    assert sel.get(text=True, guess_punct_space=True) == 'hello"Folks"'
+
+    assert sel.getall(text=True, guess_punct_space=False) == ['hello "Folks"']
+    assert sel.getall(text=True, guess_punct_space=True) == ['hello"Folks"']
+
+
+def test_guess_layout() -> None:
+    sel = Selector("