Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Google Code Issue 157: Add "escape invisible characters" option #38

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions html5lib/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@
import gettext
_ = gettext.gettext

from itertools import chain


EOF = None

E = {
Expand Down Expand Up @@ -3078,6 +3081,19 @@
prefixes["http://www.w3.org/1998/Math/MathML"] = "math"


# Code points (as integers) that are treated as "invisible" characters.
# Tab (0x09), LF (0x0A), CR (0x0D) and the ASCII space (0x20) are
# deliberately left out.
# NOTE(review): DEL (0x7F) and the C1 controls (0x80-0x9F) are not part of
# this set -- confirm whether that omission is intended.
invisibleChars = frozenset(chain(
    # C0 control characters below Tab
    range(0x00, 0x09),
    # VT and FF (the two controls between LF and CR)
    range(0x0B, 0x0D),
    # remaining C0 control characters, up to but excluding space
    range(0x0E, 0x20),
    # U+2000..U+200F: fixed-width spaces, zero-width marks, bidi marks
    range(0x2000, 0x2010),
    # U+2028..U+202F: LS, PS, bidi control codes
    range(0x2028, 0x2030),
    # individual code points: nbsp, mathsp, WJ, ideosp, interlinear
    (0x00A0, 0x205F, 0x2060, 0x3000, 0xFFF9, 0xFFFA, 0xFFFB)
))


class DataLossWarning(UserWarning):
    """Warning category: a plain ``UserWarning`` subclass with no added behaviour."""

Expand Down
10 changes: 9 additions & 1 deletion html5lib/serializer/htmlserializer.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ class HTMLSerializer(object):
# escaping options
escape_lt_in_attrs = False
escape_rcdata = False
escape_invisible = False
resolve_entities = True

# miscellaneous options
Expand All @@ -105,7 +106,8 @@ class HTMLSerializer(object):
"minimize_boolean_attributes", "use_trailing_solidus",
"space_before_trailing_solidus", "omit_optional_tags",
"strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs",
"escape_rcdata", "resolve_entities", "sanitize")
"escape_rcdata", "escape_invisible", "resolve_entities",
"sanitize")

def __init__(self, **kwargs):
"""Initialize HTMLSerializer.
Expand All @@ -127,6 +129,10 @@ def __init__(self, **kwargs):
escape_rcdata=False|True
Whether to escape characters that need to be escaped within normal
elements within rcdata elements such as style.
escape_invisible=False|True|'numeric'|'named'
Whether to escape invisible characters (such as nbsp, fixed-width
spaces, and control codes). Uses named HTML escapes if 'named'
is specified, otherwise uses numeric codes.
resolve_entities=True|False
Whether to resolve named character entities that appear in the
source tree. The XML predefined entities < > & " '
Expand Down Expand Up @@ -160,6 +166,8 @@ def __init__(self, **kwargs):

def encode(self, string):
assert(isinstance(string, text_type))
if self.escape_invisible:
string = utils.escapeInvisible(string, self.escape_invisible == 'named')
if self.encoding:
return string.encode(self.encoding, unicode_encode_errors)
else:
Expand Down
28 changes: 28 additions & 0 deletions html5lib/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

from types import ModuleType

from six import text_type

from .constants import invisibleChars


class MethodDispatcher(dict):
"""Dict with 2 special properties:
Expand Down Expand Up @@ -71,3 +75,27 @@ def moduleFactory(baseModule, *args, **kwargs):
return mod

return moduleFactory


def escapeInvisible(text, useNamedEntities=False):
    """Escape invisible characters other than Tab, LF, CR, and ASCII space

    Every character of ``text`` whose code point is listed in
    ``invisibleChars`` is replaced by an HTML character reference; all
    other characters are left untouched.

    text
      The string to escape. Must be an instance of ``text_type``.
    useNamedEntities=False|True
      When false, each escaped character becomes an upper-case hexadecimal
      numeric reference (e.g. ``&#xA0;``). When true, a named reference
      (e.g. ``&nbsp;``) is used for code points that have an entry in the
      standard library's ``codepoint2name`` table, and the numeric form is
      used for the rest.

    Returns the escaped string. When ``text`` contains nothing to escape
    it is returned unchanged.
    """
    assert isinstance(text, text_type)

    # Only the code points that actually occur in the text are collected,
    # so the common "nothing to escape" case returns without any copying.
    found = set(ord(c) for c in text if ord(c) in invisibleChars)
    if not found:
        return text

    names = {}
    if useNamedEntities:
        # The entity table moved between Python 2 and Python 3; importing
        # lazily keeps the numeric path free of that dependency.
        try:
            from html.entities import codepoint2name as names
        except ImportError:
            from htmlentitydefs import codepoint2name as names

    table = {}
    for codepoint in found:
        name = names.get(codepoint)
        if name:
            escape = "&%s;" % name
        else:
            escape = "&#x%X;" % codepoint
        # translate() on Python 2 requires unicode values in the mapping.
        table[codepoint] = text_type(escape)

    # A single pass over the text replaces the former per-character
    # str.replace() loop.
    return text.translate(table)