metanorma · ronaldtse · Jun 4, 2024 · Jun 1, 2024 · Jun 3, 2024 · Jun 4, 2024
diff --git a/lib/coradoc.rb b/lib/coradoc.rb
@@ -4,6 +4,7 @@
 
 require "parslet"
 require_relative "coradoc/version"
+require_relative "coradoc/util"
 require_relative "coradoc/parser"
 require_relative "coradoc/transformer"
 require_relative "coradoc/generator"

diff --git a/lib/coradoc/element/list_item.rb b/lib/coradoc/element/list_item.rb
@@ -14,7 +14,13 @@ def initialize(content, options = {})
       def to_adoc
         anchor = @anchor.nil? ? "" : @anchor.to_adoc.to_s
         content = Array(@content).map do |subitem|
-          Coradoc::Generator.gen_adoc(subitem).chomp
+          subcontent = Coradoc::Generator.gen_adoc(subitem)
+          # Only try to postprocess elements that are text,
+          # otherwise we could strip markup.
+          if Coradoc.is_a_single?(subitem, Coradoc::Element::TextElement)
+            subcontent = Coradoc.strip_unicode(subcontent)
+          end
+          subcontent.chomp
         end.join("\n+\n")
 
         " #{anchor}#{content.chomp}\n"

diff --git a/lib/coradoc/element/paragraph.rb b/lib/coradoc/element/paragraph.rb
@@ -24,9 +24,9 @@ def texts
       def to_adoc
         anchor = @anchor.nil? ? "" : "#{@anchor.to_adoc}\n"
         if @tdsinglepara
-          anchor.to_s << Coradoc::Generator.gen_adoc(@content).strip
+          anchor.to_s << Coradoc.strip_unicode(Coradoc::Generator.gen_adoc(@content))
         else
-          "\n\n#{anchor}" << Coradoc::Generator.gen_adoc(@content).strip << "\n\n"
+          "\n\n#{anchor}" << Coradoc.strip_unicode(Coradoc::Generator.gen_adoc(@content)) << "\n\n"
         end
       end
     end

diff --git a/lib/coradoc/element/section.rb b/lib/coradoc/element/section.rb
@@ -34,6 +34,12 @@ def to_adoc
         # with something.
         content = "&nbsp;#{content}" if content.start_with?(" +\n")
 
+        # Only try to postprocess elements that are text,
+        # otherwise we could strip markup.
+        if Coradoc.is_a_single?(@contents, Coradoc::Element::TextElement)
+          content = Coradoc.strip_unicode(content)
+        end
+
         "\n#{anchor}" << title << content << sections << "\n"
       end
 

diff --git a/lib/coradoc/element/table.rb b/lib/coradoc/element/table.rb
@@ -76,6 +76,11 @@ def to_adoc
           anchor = @anchor.nil? ? "" : @anchor.to_adoc.to_s
           content = simplify_block_content(@content)
           content = Coradoc::Generator.gen_adoc(content)
+          # Only try to postprocess elements that are text,
+          # otherwise we could strip markup.
+          if Coradoc.is_a_single?(@content, Coradoc::Element::TextElement)
+            content = Coradoc.strip_unicode(content)
+          end
           "#{@colrowattr}#{@alignattr}#{@style}| #{anchor}#{content}"
         end
       end

diff --git a/lib/coradoc/element/title.rb b/lib/coradoc/element/title.rb
@@ -21,7 +21,7 @@ def level
 
       def to_adoc
         anchor = @anchor.nil? ? "" : "#{@anchor.to_adoc}\n"
-        content = Coradoc::Generator.gen_adoc(@content)
+        content = Coradoc.strip_unicode(Coradoc::Generator.gen_adoc(@content))
         <<~HERE
 
         #{anchor}#{style_str}#{level_str} #{content}

diff --git a/lib/coradoc/reverse_adoc/cleaner.rb b/lib/coradoc/reverse_adoc/cleaner.rb
@@ -83,8 +83,7 @@ def preprocess_word_html(string)
 
     def scrub_whitespace(string)
       string.gsub!(/&nbsp;|&#xA0;|\u00a0/i, "&#xA0;") # HTML encoded spaces
-      string.sub!(/^\A[[:space:]]+/m, "") # document leading whitespace
-      string.sub!(/[[:space:]]+\z$/m, "") # document trailing whitespace
+      string = Coradoc.strip_unicode(string) # Strip document-level leading and trailing whitespace
       string.gsub!(/( +)$/, " ") # line trailing whitespace
       string.gsub!(/\n\n\n\n/, "\n\n") # Quadruple line breaks
       # string.delete!('?| ')               # Unicode non-breaking spaces, injected as tabs

diff --git a/lib/coradoc/reverse_adoc/plugins/plateau.rb b/lib/coradoc/reverse_adoc/plugins/plateau.rb
@@ -144,7 +144,12 @@ def handle_headers_h4(node, coradoc, state)
           coradoc.content.first.content = $1.strip
           coradoc
         else
-          ["// FIXME\n", coradoc]
+          if Coradoc.strip_unicode(coradoc.content.first.content).empty?
+            # Strip instances of faulty empty paragraphs
+            nil
+          else
+            ["// FIXME\n", coradoc]
+          end
         end
       end
 

diff --git a/lib/coradoc/util.rb b/lib/coradoc/util.rb
@@ -0,0 +1,27 @@
+module Coradoc
+  # Removes leading and trailing whitespace from +str+.
+  #
+  # Unlike String#strip, this uses the POSIX +[[:space:]]+ class, so for
+  # Unicode strings it also removes non-ASCII whitespace such as the
+  # full-width space (U+3000) common in CJK documents.
+  #
+  # @param str [String] text to trim
+  # @return [String] a new string; +str+ itself is not modified
+  def self.strip_unicode(str)
+    leading_trimmed = str.sub(/\A[[:space:]]+/, "")
+    leading_trimmed.sub(/[[:space:]]+\z/, "")
+  end
+
+  # Tells whether +obj+ is one +klass+ instance, either bare or wrapped
+  # as the sole member of an Array.
+  #
+  # @param obj [Object] value to inspect
+  # @param klass [Module] class or module to test against
+  # @return [Boolean]
+  def self.is_a_single?(obj, klass)
+    return true if obj.is_a?(klass)
+    return false unless obj.is_a?(Array)
+
+    obj.length == 1 && obj.first.is_a?(klass)
+  end
+end
diff --git a/spec/reverse_adoc/assets/unicode_space.html b/spec/reverse_adoc/assets/unicode_space.html
@@ -0,0 +1,24 @@
+<!-- This document contains full-width (ideographic, U+3000) spaces. -->
+<!-- Such spaces are common in CJK documents. -->
+
+<table>
+  <tr>
+    <th>　test1　</th>
+    <td>　</td>
+    <td>　test2　</td>
+    <td>　</td>
+  </tr>
+</table>
+
+<p>　test3　</p>
+
+<ul>
+  <li>　test4　</li>
+  <ol>
+    <li>　test5　</li>
+  </ol>
+</ul>
+
+<div id="test">　test6　</div>
+
+<h3>　test7　</h3>
diff --git a/spec/reverse_adoc/components/lists_spec.rb b/spec/reverse_adoc/components/lists_spec.rb
@@ -42,10 +42,10 @@
   end
 
   context "nested list with lots of whitespace" do
-    it { is_expected.to match /\n\* item wa \n/ }
-    it { is_expected.to match /\n\* item wb \n/ }
-    it { is_expected.to match /\n\*\* item wbb \n/ }
-    it { is_expected.to match /\n\*\* item wbc \n/ }
+    it { is_expected.to match /\n\* item wa\n/ }
+    it { is_expected.to match /\n\* item wb\n/ }
+    it { is_expected.to match /\n\*\* item wbb\n/ }
+    it { is_expected.to match /\n\*\* item wbc\n/ }
   end
 
   context "lists containing links" do

diff --git a/spec/reverse_adoc/components/unicode_space_spec.rb b/spec/reverse_adoc/components/unicode_space_spec.rb
@@ -0,0 +1,14 @@
+require "spec_helper"
+
+# Full-width spaces around text in the HTML fixture must not leak into the converted output.
+describe Coradoc::ReverseAdoc do
+  let(:input) { File.read("spec/reverse_adoc/assets/unicode_space.html") }
+  subject { Coradoc::ReverseAdoc.convert(input) }
+
+  it { is_expected.to include "\n| test1 | | test2 | \n" }
+  it { is_expected.to include "\ntest3\n" }
+  it { is_expected.to include "\n* test4\n" }
+  it { is_expected.to include "\n.. test5\n" }
+  it { is_expected.to include "\ntest6\n" }
+  it { is_expected.to include "\n==== test7\n" }
+end