Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

reverse_adoc: Clean Unicode whitespace in headers and paragraphs #80

Merged
merged 3 commits into from
Jun 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions lib/coradoc.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

require "parslet"
require_relative "coradoc/version"
require_relative "coradoc/util"
require_relative "coradoc/parser"
require_relative "coradoc/transformer"
require_relative "coradoc/generator"
Expand Down
8 changes: 7 additions & 1 deletion lib/coradoc/element/list_item.rb
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,13 @@ def initialize(content, options = {})
def to_adoc
anchor = @anchor.nil? ? "" : @anchor.to_adoc.to_s
content = Array(@content).map do |subitem|
Coradoc::Generator.gen_adoc(subitem).chomp
subcontent = Coradoc::Generator.gen_adoc(subitem)
# Only try to postprocess elements that are text,
# otherwise we could strip markup.
if Coradoc.is_a_single?(subitem, Coradoc::Element::TextElement)
subcontent = Coradoc.strip_unicode(subcontent)
end
subcontent.chomp
end.join("\n+\n")

" #{anchor}#{content.chomp}\n"
Expand Down
4 changes: 2 additions & 2 deletions lib/coradoc/element/paragraph.rb
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,9 @@ def texts
def to_adoc
anchor = @anchor.nil? ? "" : "#{@anchor.to_adoc}\n"
if @tdsinglepara
anchor.to_s << Coradoc::Generator.gen_adoc(@content).strip
anchor.to_s << Coradoc.strip_unicode(Coradoc::Generator.gen_adoc(@content))
else
"\n\n#{anchor}" << Coradoc::Generator.gen_adoc(@content).strip << "\n\n"
"\n\n#{anchor}" << Coradoc.strip_unicode(Coradoc::Generator.gen_adoc(@content)) << "\n\n"
end
end
end
Expand Down
6 changes: 6 additions & 0 deletions lib/coradoc/element/section.rb
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,12 @@ def to_adoc
# with something.
content = "&nbsp;#{content}" if content.start_with?(" +\n")

# Only try to postprocess elements that are text,
# otherwise we could strip markup.
if Coradoc.is_a_single?(@contents, Coradoc::Element::TextElement)
content = Coradoc.strip_unicode(content)
end

"\n#{anchor}" << title << content << sections << "\n"
end

Expand Down
5 changes: 5 additions & 0 deletions lib/coradoc/element/table.rb
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,11 @@ def to_adoc
anchor = @anchor.nil? ? "" : @anchor.to_adoc.to_s
content = simplify_block_content(@content)
content = Coradoc::Generator.gen_adoc(content)
# Only try to postprocess elements that are text,
# otherwise we could strip markup.
if Coradoc.is_a_single?(@content, Coradoc::Element::TextElement)
content = Coradoc.strip_unicode(content)
end
"#{@colrowattr}#{@alignattr}#{@style}| #{anchor}#{content}"
end
end
Expand Down
2 changes: 1 addition & 1 deletion lib/coradoc/element/title.rb
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def level

def to_adoc
anchor = @anchor.nil? ? "" : "#{@anchor.to_adoc}\n"
content = Coradoc::Generator.gen_adoc(@content)
content = Coradoc.strip_unicode(Coradoc::Generator.gen_adoc(@content))
<<~HERE

#{anchor}#{style_str}#{level_str} #{content}
Expand Down
3 changes: 1 addition & 2 deletions lib/coradoc/reverse_adoc/cleaner.rb
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,7 @@ def preprocess_word_html(string)

def scrub_whitespace(string)
string.gsub!(/&nbsp;|&#xA0;|\u00a0/i, "&#xA0;") # HTML encoded spaces
string.sub!(/^\A[[:space:]]+/m, "") # document leading whitespace
string.sub!(/[[:space:]]+\z$/m, "") # document trailing whitespace
string = Coradoc.strip_unicode(string) # Strip document-level leading and trailing whitespace
string.gsub!(/( +)$/, " ") # line trailing whitespace
string.gsub!(/\n\n\n\n/, "\n\n") # Quadruple line breaks
# string.delete!('?| ') # Unicode non-breaking spaces, injected as tabs
Expand Down
7 changes: 6 additions & 1 deletion lib/coradoc/reverse_adoc/plugins/plateau.rb
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,12 @@ def handle_headers_h4(node, coradoc, state)
coradoc.content.first.content = $1.strip
coradoc
else
["// FIXME\n", coradoc]
if Coradoc.strip_unicode(coradoc.content.first.content).empty?
# Strip instances of faulty empty paragraphs
nil
else
["// FIXME\n", coradoc]
end
end
end

Expand Down
10 changes: 10 additions & 0 deletions lib/coradoc/util.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
module Coradoc
  # Returns a copy of +str+ with every leading and trailing whitespace
  # character removed. The match uses the POSIX bracket class [[:space:]],
  # which also covers non-ASCII whitespace such as U+00A0 and U+3000.
  # Whitespace in the interior of the string is left alone.
  def self.strip_unicode(str)
    without_leading = str.sub(/\A[[:space:]]+/, "")
    without_leading.sub(/[[:space:]]+\z/, "")
  end

  # True when +obj+ is itself an instance of +klass+, or when it is an
  # Array holding exactly one element and that element is an instance
  # of +klass+. False in every other case.
  def self.is_a_single?(obj, klass)
    return true if obj.is_a?(klass)
    return false unless obj.is_a?(Array)

    obj.length == 1 && obj.first.is_a?(klass)
  end
end
24 changes: 24 additions & 0 deletions spec/reverse_adoc/assets/unicode_space.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
<!-- This document includes spaces that are double-width spaces. -->
<!-- This is for CJK documents. -->

<table>
<tr>
<th> test1 </th>
<td> </td>
<td> test2 </td>
<td> </td>
</tr>
</table>

<p> test3 </p>

<ul>
<li> test4 </li>
<ol>
<li> test5 </li>
</ol>
</ul>

<div id="test"> test6 </div>

<h3> test7 </h3>
8 changes: 4 additions & 4 deletions spec/reverse_adoc/components/lists_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,10 @@
end

context "nested list with lots of whitespace" do
it { is_expected.to match /\n\* item wa \n/ }
it { is_expected.to match /\n\* item wb \n/ }
it { is_expected.to match /\n\*\* item wbb \n/ }
it { is_expected.to match /\n\*\* item wbc \n/ }
it { is_expected.to match /\n\* item wa\n/ }
it { is_expected.to match /\n\* item wb\n/ }
it { is_expected.to match /\n\*\* item wbb\n/ }
it { is_expected.to match /\n\*\* item wbc\n/ }
end

context "lists containing links" do
Expand Down
14 changes: 14 additions & 0 deletions spec/reverse_adoc/components/unicode_space_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
require "spec_helper"

# Converts the unicode_space.html fixture and checks the emitted AsciiDoc
# for each element kind the fixture contains (table cells, paragraph,
# unordered and ordered list items, div, heading).
describe Coradoc::ReverseAdoc do
  subject { Coradoc::ReverseAdoc.convert(input) }

  let(:input) { File.read("spec/reverse_adoc/assets/unicode_space.html") }

  # <table> row: header cell, empty cell, data cell, empty cell
  it { is_expected.to include("\n| test1 | | test2 | \n") }
  # <p>
  it { is_expected.to include("\ntest3\n") }
  # <ul><li>
  it { is_expected.to include("\n* test4\n") }
  # <ol><li> nested inside the <ul>
  it { is_expected.to include("\n.. test5\n") }
  # <div>
  it { is_expected.to include("\ntest6\n") }
  # <h3>
  it { is_expected.to include("\n==== test7\n") }
end
Loading