From 92f219a81fd9bd36169708d1d4e81f0037330315 Mon Sep 17 00:00:00 2001 From: Alper Kokmen Date: Fri, 31 Jan 2014 16:22:56 -0800 Subject: [PATCH] Convert :whitespace_elements into a Hash. This will let developers to configure Sanitize to specify the replacements for :whitespace_elements found in the input. Previously, developers couldn't specify the text to be inserted before and after the :whitespace_elements. - updated :whitespace_elements to be a Hash rather than an Array. - updated CleanElement#call to use appropriate replacements for the element found instead of hardcoded space (' '). - made sure existing tests were not broken and added new tests for the functionality added by overriding replacement values for tags. - updated README to reflect these changes. NOTE: Changes are backwards compatible. Code will detect Array types and will convert them into the new Hash format. --- README.md | 13 ++++---- lib/sanitize/config.rb | 39 +++++++++++++++++----- lib/sanitize/transformers/clean_element.rb | 15 +++++++-- test/test_sanitize.rb | 28 ++++++++++++++++ 4 files changed, 77 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 722c462..29c69ea 100644 --- a/README.md +++ b/README.md @@ -209,15 +209,14 @@ traversal. See the Transformers section below for details. Custom transformer or array of custom transformers to run using breadth-first traversal. See the Transformers section below for details. -#### :whitespace_elements (Array) +#### :whitespace_elements (Hash) -Array of lowercase element names that should be replaced with whitespace when -removed in order to preserve readability. For example, -`foo
<div>bar</div>
baz` will become -`foo bar baz` when the `
<div>` is removed. +Hash of lowercase element names that should be replaced and replacement values +in order to preserve readability. For example, `foo
<div>bar</div>
baz` will +become `foo bar baz` when the `
<div>` is removed. -By default, the following elements are included in the -`:whitespace_elements` array: +By default, the following elements (as keys) are included in the +`:whitespace_elements` hash: ``` address article aside blockquote br dd div dl dt footer h1 h2 h3 h4 h5 diff --git a/lib/sanitize/config.rb b/lib/sanitize/config.rb index d7224c2..e4959e1 100644 --- a/lib/sanitize/config.rb +++ b/lib/sanitize/config.rb @@ -73,14 +73,37 @@ module Config :transformers_breadth => [], # Elements which, when removed, should have their contents surrounded by - # space characters to preserve readability. For example, - # `foo
<div>bar</div>
baz` will become 'foo bar baz' when the
<div> is - # removed. - :whitespace_elements => %w[ - address article aside blockquote br dd div dl dt footer h1 h2 h3 h4 h5 - h6 header hgroup hr li nav ol p pre section ul - ] - + # values specified with `before` and `after` keys to preserve readability. + # For example, `foo
<div>bar</div>
baz` will become 'foo bar baz' when the + #
<div> is removed. + :whitespace_elements => { + 'address' => { :before => ' ', :after => ' ' }, + 'article' => { :before => ' ', :after => ' ' }, + 'aside' => { :before => ' ', :after => ' ' }, + 'blockquote' => { :before => ' ', :after => ' ' }, + 'br' => { :before => ' ', :after => ' ' }, + 'dd' => { :before => ' ', :after => ' ' }, + 'div' => { :before => ' ', :after => ' ' }, + 'dl' => { :before => ' ', :after => ' ' }, + 'dt' => { :before => ' ', :after => ' ' }, + 'footer' => { :before => ' ', :after => ' ' }, + 'h1' => { :before => ' ', :after => ' ' }, + 'h2' => { :before => ' ', :after => ' ' }, + 'h3' => { :before => ' ', :after => ' ' }, + 'h4' => { :before => ' ', :after => ' ' }, + 'h5' => { :before => ' ', :after => ' ' }, + 'h6' => { :before => ' ', :after => ' ' }, + 'header' => { :before => ' ', :after => ' ' }, + 'hgroup' => { :before => ' ', :after => ' ' }, + 'hr' => { :before => ' ', :after => ' ' }, + 'li' => { :before => ' ', :after => ' ' }, + 'nav' => { :before => ' ', :after => ' ' }, + 'ol' => { :before => ' ', :after => ' ' }, + 'p' => { :before => ' ', :after => ' ' }, + 'pre' => { :before => ' ', :after => ' ' }, + 'section' => { :before => ' ', :after => ' ' }, + 'ul' => { :before => ' ', :after => ' ' } + } } end end diff --git a/lib/sanitize/transformers/clean_element.rb b/lib/sanitize/transformers/clean_element.rb index 7840bfc..bfff87f 100644 --- a/lib/sanitize/transformers/clean_element.rb +++ b/lib/sanitize/transformers/clean_element.rb @@ -11,7 +11,16 @@ def initialize(config) @protocols = config[:protocols] @remove_all_contents = false @remove_element_contents = Set.new - @whitespace_elements = Set.new(config[:whitespace_elements]) + @whitespace_elements = Hash.new + + # Converting :whitespace_elements into a Hash for backwards compatibility. 
+ if config[:whitespace_elements].is_a?(Array) + config[:whitespace_elements].each do |element| + @whitespace_elements[element] = { :before => ' ', :after => ' ' } + end + else + @whitespace_elements = config[:whitespace_elements] + end if config[:remove_contents].is_a?(Array) @remove_element_contents.merge(config[:remove_contents].map(&:to_s)) @@ -31,10 +40,10 @@ def call(env) # Elements like br, div, p, etc. need to be replaced with whitespace in # order to preserve readability. if @whitespace_elements.include?(name) - node.add_previous_sibling(Nokogiri::XML::Text.new(' ', node.document)) + node.add_previous_sibling(Nokogiri::XML::Text.new(@whitespace_elements[name][:before].to_s, node.document)) unless node.children.empty? - node.add_next_sibling(Nokogiri::XML::Text.new(' ', node.document)) + node.add_next_sibling(Nokogiri::XML::Text.new(@whitespace_elements[name][:after].to_s, node.document)) end end diff --git a/test/test_sanitize.rb b/test/test_sanitize.rb index 9e2b565..7bf38a2 100644 --- a/test/test_sanitize.rb +++ b/test/test_sanitize.rb @@ -421,6 +421,21 @@ Sanitize.clean('', config) .must_equal('') # Another annoying Nokogiri quirk. end + + it 'should replace whitespace_elements with configured :before and :after values' do + config = { + :whitespace_elements => { + 'p' => { :before => "\n", :after => "\n" }, + 'div' => { :before => "\n", :after => "\n" }, + 'br' => { :before => "\n", :after => "\n" }, + } + } + + Sanitize.clean('

foo

', config).must_equal("\nfoo\n") + Sanitize.clean('

foo

bar

', config).must_equal("\nfoo\n\nbar\n") + Sanitize.clean('foo
bar
baz', config).must_equal("foo\nbar\nbaz") + Sanitize.clean('foo
bar
baz', config).must_equal("foo\nbar\nbaz") + end end describe 'Sanitize.clean' do @@ -645,3 +660,16 @@ Sanitize.clean!('foo