Skip to content

Commit

Permalink
Make css sanitization non-default, fix docs, fix tests (#633)
Browse files Browse the repository at this point in the history
In order to use css sanitization, you have to install the css extras
which installs tinycss2. Additionally, I reworked css sanitization to be
encapsulated in a class making it easier for developers to provide their
own if they want to.

I changed the ALLOWED_CSS_PROPERTIES (previously called styles) to match
what html5lib has.

I updated the tests and documentation accordingly.
  • Loading branch information
willkg committed Apr 1, 2022
1 parent efc224d commit cb6dca7
Show file tree
Hide file tree
Showing 11 changed files with 271 additions and 173 deletions.
11 changes: 5 additions & 6 deletions bleach/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from bleach.sanitizer import (
ALLOWED_ATTRIBUTES,
ALLOWED_PROTOCOLS,
ALLOWED_STYLES,
ALLOWED_TAGS,
Cleaner,
)
Expand All @@ -24,10 +23,10 @@ def clean(
text,
tags=ALLOWED_TAGS,
attributes=ALLOWED_ATTRIBUTES,
styles=ALLOWED_STYLES,
protocols=ALLOWED_PROTOCOLS,
strip=False,
strip_comments=True,
css_sanitizer=None,
):
"""Clean an HTML fragment of malicious content and return it
Expand Down Expand Up @@ -59,26 +58,26 @@ def clean(
:arg dict attributes: allowed attributes; can be a callable, list or dict;
defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``
:arg list styles: allowed list of css styles; defaults to
``bleach.sanitizer.ALLOWED_STYLES``
:arg list protocols: allowed list of protocols for links; defaults
to ``bleach.sanitizer.ALLOWED_PROTOCOLS``
:arg bool strip: whether or not to strip disallowed elements
:arg bool strip_comments: whether or not to strip HTML comments
:arg CSSSanitizer css_sanitizer: instance with a "sanitize_css" method for
sanitizing style attribute values and style text; defaults to None
:returns: cleaned text as unicode
"""
cleaner = Cleaner(
tags=tags,
attributes=attributes,
styles=styles,
protocols=protocols,
strip=strip,
strip_comments=strip_comments,
css_sanitizer=css_sanitizer,
)
return cleaner.clean(text)

Expand Down
108 changes: 108 additions & 0 deletions bleach/css_sanitizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
import tinycss2


ALLOWED_CSS_PROPERTIES = frozenset(
(
"azimuth",
"background-color",
"border-bottom-color",
"border-collapse",
"border-color",
"border-left-color",
"border-right-color",
"border-top-color",
"clear",
"color",
"cursor",
"direction",
"display",
"elevation",
"float",
"font",
"font-family",
"font-size",
"font-style",
"font-variant",
"font-weight",
"height",
"letter-spacing",
"line-height",
"overflow",
"pause",
"pause-after",
"pause-before",
"pitch",
"pitch-range",
"richness",
"speak",
"speak-header",
"speak-numeral",
"speak-punctuation",
"speech-rate",
"stress",
"text-align",
"text-decoration",
"text-indent",
"unicode-bidi",
"vertical-align",
"voice-family",
"volume",
"white-space",
"width",
)
)


ALLOWED_SVG_PROPERTIES = frozenset(
(
"fill",
"fill-opacity",
"fill-rule",
"stroke",
"stroke-width",
"stroke-linecap",
"stroke-linejoin",
"stroke-opacity",
)
)


class CSSSanitizer:
def __init__(
self,
allowed_css_properties=ALLOWED_CSS_PROPERTIES,
allowed_svg_properties=ALLOWED_SVG_PROPERTIES,
):
self.allowed_css_properties = allowed_css_properties
self.allowed_svg_properties = allowed_svg_properties

def sanitize_css(self, style):
"""Sanitizes css in style tags"""
parsed = tinycss2.parse_declaration_list(style)

if not parsed:
return ""

new_tokens = []
for token in parsed:
if token.type == "at-rule":
print("omg")
elif token.type == "declaration":
if (
token.lower_name in self.allowed_css_properties
or token.lower_name in self.allowed_svg_properties
):
new_tokens.append(token)
elif token.type in ("comment", "whitespace"):
if new_tokens and new_tokens[-1].type != token.type:
new_tokens.append(token)
# Declaration
# AtRule
# Comment
# WhitespaceToken
# ParseError

if not new_tokens:
return ""

return tinycss2.serialize(new_tokens).strip()
2 changes: 2 additions & 0 deletions bleach/html5lib_shim.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@
) # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.filters.sanitizer import (
allowed_protocols,
allowed_css_properties,
allowed_svg_properties,
) # noqa: E402 module level import not at top of file
from bleach._vendor.html5lib.filters.sanitizer import (
Filter as SanitizerFilter,
Expand Down
83 changes: 31 additions & 52 deletions bleach/sanitizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import warnings

from bleach._vendor.parse import urlparse
import tinycss2
from xml.sax.saxutils import unescape

from bleach import html5lib_shim
Expand Down Expand Up @@ -33,9 +32,6 @@
"acronym": ["title"],
}

#: List of allowed styles
ALLOWED_STYLES = []

#: List of allowed protocols
ALLOWED_PROTOCOLS = ["http", "https", "mailto"]

Expand Down Expand Up @@ -85,11 +81,11 @@ def __init__(
self,
tags=ALLOWED_TAGS,
attributes=ALLOWED_ATTRIBUTES,
styles=ALLOWED_STYLES,
protocols=ALLOWED_PROTOCOLS,
strip=False,
strip_comments=True,
filters=None,
css_sanitizer=None,
):
"""Initializes a Cleaner
Expand All @@ -99,9 +95,6 @@ def __init__(
:arg dict attributes: allowed attributes; can be a callable, list or dict;
defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``
:arg list styles: allowed list of css styles; defaults to
``bleach.sanitizer.ALLOWED_STYLES``
:arg list protocols: allowed list of protocols for links; defaults
to ``bleach.sanitizer.ALLOWED_PROTOCOLS``
Expand All @@ -118,14 +111,17 @@ def __init__(
Using filters changes the output of ``bleach.Cleaner.clean``.
Make sure the way the filters change the output are secure.
:arg CSSSanitizer css_sanitizer: instance with a "sanitize_css" method for
sanitizing style attribute values and style text; defaults to None
"""
self.tags = tags
self.attributes = attributes
self.styles = styles
self.protocols = protocols
self.strip = strip
self.strip_comments = strip_comments
self.filters = filters or []
self.css_sanitizer = css_sanitizer

self.parser = html5lib_shim.BleachHTMLParser(
tags=self.tags,
Expand Down Expand Up @@ -175,11 +171,10 @@ def clean(self, text):
attributes=self.attributes,
strip_disallowed_elements=self.strip,
strip_html_comments=self.strip_comments,
css_sanitizer=self.css_sanitizer,
# html5lib-sanitizer things
allowed_elements=self.tags,
allowed_css_properties=self.styles,
allowed_protocols=self.protocols,
allowed_svg_properties=[],
)

# Apply any filters after the BleachSanitizerFilter
Expand Down Expand Up @@ -242,36 +237,40 @@ class BleachSanitizerFilter(html5lib_shim.SanitizerFilter):
def __init__(
self,
source,
allowed_elements=ALLOWED_TAGS,
attributes=ALLOWED_ATTRIBUTES,
allowed_protocols=ALLOWED_PROTOCOLS,
strip_disallowed_elements=False,
strip_html_comments=True,
css_sanitizer=None,
**kwargs,
):
"""Creates a BleachSanitizerFilter instance
:arg Treewalker source: stream
:arg list tags: allowed list of tags; defaults to
:arg list allowed_elements: allowed list of tags; defaults to
``bleach.sanitizer.ALLOWED_TAGS``
:arg dict attributes: allowed attributes; can be a callable, list or dict;
defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``
:arg list styles: allowed list of css styles; defaults to
``bleach.sanitizer.ALLOWED_STYLES``
:arg list protocols: allowed list of protocols for links; defaults
:arg list allowed_protocols: allowed list of protocols for links; defaults
to ``bleach.sanitizer.ALLOWED_PROTOCOLS``
:arg bool strip_disallowed_elements: whether or not to strip disallowed
elements
:arg bool strip_html_comments: whether or not to strip HTML comments
:arg CSSSanitizer css_sanitizer: instance with a "sanitize_css" method for
sanitizing style attribute values and style text; defaults to None
"""
self.attr_filter = attribute_filter_factory(attributes)
self.strip_disallowed_elements = strip_disallowed_elements
self.strip_html_comments = strip_html_comments
self.css_sanitizer = css_sanitizer

# filter out html5lib deprecation warnings to use bleach from BleachSanitizerFilter init
warnings.filterwarnings(
Expand All @@ -280,7 +279,12 @@ def __init__(
category=DeprecationWarning,
module="bleach._vendor.html5lib",
)
return super().__init__(source, **kwargs)
return super().__init__(
source,
allowed_elements=allowed_elements,
allowed_protocols=allowed_protocols,
**kwargs,
)

def sanitize_stream(self, token_iterator):
for token in token_iterator:
Expand Down Expand Up @@ -542,7 +546,16 @@ def allow_token(self, token):

# If it's a style attribute, sanitize it
if namespaced_name == (None, "style"):
val = self.sanitize_css(val)
if self.css_sanitizer:
val = self.css_sanitizer.sanitize_css(val)
else:
# FIXME(willkg): if style is allowed, but no
# css_sanitizer was set up, then this is probably a
# mistake and we should raise an error here
#
# For now, we're going to set the value to "" because
# there was no sanitizer set
val = ""

# At this point, we want to keep the attribute, so add it in
attrs[namespaced_name] = val
Expand Down Expand Up @@ -594,37 +607,3 @@ def disallowed_token(self, token):

del token["name"]
return token

def sanitize_css(self, style):
"""Sanitizes css in style tags"""
parsed = tinycss2.parse_declaration_list(style)

if not parsed:
return ""

# decl.name.lower() in self.allowed_css_properties
# or decl.name.lower() in self.allowed_svg_properties

new_tokens = []
for token in parsed:
if token.type == "at-rule":
print("omg")
elif token.type == "declaration":
if (
token.lower_name in self.allowed_css_properties
or token.lower_name in self.allowed_svg_properties
):
new_tokens.append(token)
elif token.type in ("comment", "whitespace"):
if new_tokens and new_tokens[-1].type != token.type:
new_tokens.append(token)
# Declaration
# AtRule
# Comment
# WhitespaceToken
# ParseError

if not new_tokens:
return ""

return tinycss2.serialize(new_tokens).strip()
Loading

0 comments on commit cb6dca7

Please sign in to comment.