diff --git a/lib/coradoc.rb b/lib/coradoc.rb index fecd74a..ee0f56d 100644 --- a/lib/coradoc.rb +++ b/lib/coradoc.rb @@ -4,6 +4,7 @@ require "parslet" require_relative "coradoc/version" +require_relative "coradoc/util" require_relative "coradoc/parser" require_relative "coradoc/transformer" require_relative "coradoc/generator" diff --git a/lib/coradoc/element/list_item.rb b/lib/coradoc/element/list_item.rb index 91383d2..a9748d1 100644 --- a/lib/coradoc/element/list_item.rb +++ b/lib/coradoc/element/list_item.rb @@ -14,7 +14,13 @@ def initialize(content, options = {}) def to_adoc anchor = @anchor.nil? ? "" : @anchor.to_adoc.to_s content = Array(@content).map do |subitem| - Coradoc::Generator.gen_adoc(subitem).chomp + subcontent = Coradoc::Generator.gen_adoc(subitem) + # Only try to postprocess elements that are text, + # otherwise we could strip markup. + if Coradoc.is_a_single?(subitem, Coradoc::Element::TextElement) + subcontent = Coradoc.strip_unicode(subcontent) + end + subcontent.chomp end.join("\n+\n") " #{anchor}#{content.chomp}\n" diff --git a/lib/coradoc/element/paragraph.rb b/lib/coradoc/element/paragraph.rb index e494170..01c3b33 100644 --- a/lib/coradoc/element/paragraph.rb +++ b/lib/coradoc/element/paragraph.rb @@ -24,9 +24,9 @@ def texts def to_adoc anchor = @anchor.nil? ? "" : "#{@anchor.to_adoc}\n" if @tdsinglepara - anchor.to_s << Coradoc::Generator.gen_adoc(@content).strip + anchor.to_s << Coradoc.strip_unicode(Coradoc::Generator.gen_adoc(@content)) else - "\n\n#{anchor}" << Coradoc::Generator.gen_adoc(@content).strip << "\n\n" + "\n\n#{anchor}" << Coradoc.strip_unicode(Coradoc::Generator.gen_adoc(@content)) << "\n\n" end end end diff --git a/lib/coradoc/element/section.rb b/lib/coradoc/element/section.rb index 40a1852..21918de 100644 --- a/lib/coradoc/element/section.rb +++ b/lib/coradoc/element/section.rb @@ -34,6 +34,12 @@ def to_adoc # with something. content = " #{content}" if content.start_with?(" +\n") + # Only try to postprocess elements that are text, + # otherwise we could strip markup. + if Coradoc.is_a_single?(@contents, Coradoc::Element::TextElement) + content = Coradoc.strip_unicode(content) + end + "\n#{anchor}" << title << content << sections << "\n" end diff --git a/lib/coradoc/element/table.rb b/lib/coradoc/element/table.rb index 1be5146..a4cb3f8 100644 --- a/lib/coradoc/element/table.rb +++ b/lib/coradoc/element/table.rb @@ -76,6 +76,11 @@ def to_adoc anchor = @anchor.nil? ? "" : @anchor.to_adoc.to_s content = simplify_block_content(@content) content = Coradoc::Generator.gen_adoc(content) + # Only try to postprocess elements that are text, + # otherwise we could strip markup. + if Coradoc.is_a_single?(@content, Coradoc::Element::TextElement) + content = Coradoc.strip_unicode(content) + end "#{@colrowattr}#{@alignattr}#{@style}| #{anchor}#{content}" end end diff --git a/lib/coradoc/element/title.rb b/lib/coradoc/element/title.rb index ef18977..8e64e9f 100644 --- a/lib/coradoc/element/title.rb +++ b/lib/coradoc/element/title.rb @@ -21,7 +21,7 @@ def level def to_adoc anchor = @anchor.nil? ? "" : "#{@anchor.to_adoc}\n" - content = Coradoc::Generator.gen_adoc(@content) + content = Coradoc.strip_unicode(Coradoc::Generator.gen_adoc(@content)) <<~HERE #{anchor}#{style_str}#{level_str} #{content} diff --git a/lib/coradoc/reverse_adoc/cleaner.rb b/lib/coradoc/reverse_adoc/cleaner.rb index 7aa6d32..96a731b 100644 --- a/lib/coradoc/reverse_adoc/cleaner.rb +++ b/lib/coradoc/reverse_adoc/cleaner.rb @@ -83,8 +83,7 @@ def preprocess_word_html(string) def scrub_whitespace(string) string.gsub!(/ | |\u00a0/i, " ") # HTML encoded spaces - string.sub!(/^\A[[:space:]]+/m, "") # document leading whitespace - string.sub!(/[[:space:]]+\z$/m, "") # document trailing whitespace + string = Coradoc.strip_unicode(string) # Strip document-level leading and trailing whitespace string.gsub!(/( +)$/, " ") # line trailing whitespace string.gsub!(/\n\n\n\n/, "\n\n") # Quadruple line breaks # string.delete!('?| ') # Unicode non-breaking spaces, injected as tabs diff --git a/lib/coradoc/reverse_adoc/plugins/plateau.rb b/lib/coradoc/reverse_adoc/plugins/plateau.rb index abcdd3a..5146147 100644 --- a/lib/coradoc/reverse_adoc/plugins/plateau.rb +++ b/lib/coradoc/reverse_adoc/plugins/plateau.rb @@ -144,7 +144,12 @@ def handle_headers_h4(node, coradoc, state) coradoc.content.first.content = $1.strip coradoc else - ["// FIXME\n", coradoc] + if Coradoc.strip_unicode(coradoc.content.first.content).empty? + # Strip instances of faulty empty paragraphs + nil + else + ["// FIXME\n", coradoc] + end end end diff --git a/lib/coradoc/util.rb b/lib/coradoc/util.rb new file mode 100644 index 0000000..e9ede72 --- /dev/null +++ b/lib/coradoc/util.rb @@ -0,0 +1,10 @@ +module Coradoc + def self.strip_unicode(str) + str.gsub(/\A[[:space:]]+|[[:space:]]+\z/, "") + end + + def self.is_a_single?(obj, klass) + obj.is_a?(klass) || + (obj.is_a?(Array) && obj.length == 1 && obj.first.is_a?(klass)) + end +end diff --git a/spec/reverse_adoc/assets/unicode_space.html b/spec/reverse_adoc/assets/unicode_space.html new file mode 100644 index 0000000..50584b8 --- /dev/null +++ b/spec/reverse_adoc/assets/unicode_space.html @@ -0,0 +1,24 @@ + + + + + + + + + + +
 test1   test2  
+ +

 test3 

+ + + +
 test6 
+ +

 test7 

diff --git a/spec/reverse_adoc/components/lists_spec.rb b/spec/reverse_adoc/components/lists_spec.rb index efe1b3a..643e265 100644 --- a/spec/reverse_adoc/components/lists_spec.rb +++ b/spec/reverse_adoc/components/lists_spec.rb @@ -42,10 +42,10 @@ end context "nested list with lots of whitespace" do - it { is_expected.to match /\n\* item wa \n/ } - it { is_expected.to match /\n\* item wb \n/ } - it { is_expected.to match /\n\*\* item wbb \n/ } - it { is_expected.to match /\n\*\* item wbc \n/ } + it { is_expected.to match /\n\* item wa\n/ } + it { is_expected.to match /\n\* item wb\n/ } + it { is_expected.to match /\n\*\* item wbb\n/ } + it { is_expected.to match /\n\*\* item wbc\n/ } end context "lists containing links" do diff --git a/spec/reverse_adoc/components/unicode_space_spec.rb b/spec/reverse_adoc/components/unicode_space_spec.rb new file mode 100644 index 0000000..f4eb748 --- /dev/null +++ b/spec/reverse_adoc/components/unicode_space_spec.rb @@ -0,0 +1,14 @@ +require "spec_helper" + +describe Coradoc::ReverseAdoc do + let(:input) { File.read("spec/reverse_adoc/assets/unicode_space.html") } + let(:document) { Nokogiri::HTML(input) } + subject { Coradoc::ReverseAdoc.convert(input) } + + it { should include "\n| test1 | | test2 | \n" } + it { should include "\ntest3\n" } + it { should include "\n* test4\n" } + it { should include "\n.. test5\n" } + it { should include "\ntest6\n" } + it { should include "\n==== test7\n" } +end