From 34ea77864a5f7e50de37c82c4f082d21f3fd84a7 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sat, 6 Jul 2024 09:11:08 +0900 Subject: [PATCH] Add position check for XML declaration ## Why? XML declaration must be the first item. https://www.w3.org/TR/2006/REC-xml11-20060816/#document ``` [1] document ::= ( prolog element Misc* ) - ( Char* RestrictedChar Char* ) ``` https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-prolog ``` [22] prolog ::= XMLDecl Misc* (doctypedecl Misc*)? ``` https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-XMLDecl ``` [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>' ``` See: https://github.com/ruby/rexml/pull/161#discussion_r1666118193 --- lib/rexml/parsers/baseparser.rb | 32 +++++++++++++---------- test/parse/test_processing_instruction.rb | 17 ++++++++++++ 2 files changed, 35 insertions(+), 14 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 02759e70..bfa10358 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -635,21 +635,25 @@ def process_instruction(start_position) @source.position = start_position raise REXML::ParseException.new(message, @source) end - if @document_status.nil? and match_data[1] == "xml" - content = match_data[2] - version = VERSION.match(content) - version = version[1] unless version.nil? - encoding = ENCODING.match(content) - encoding = encoding[1] unless encoding.nil? - if need_source_encoding_update?(encoding) - @source.encoding = encoding - end - if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding - encoding = "UTF-16" + if match_data[1] == "xml" + if @document_status + raise ParseException.new("Malformed XML: XML declaration other than at the top of the document.", @source) + else + content = match_data[2] + version = VERSION.match(content) + version = version[1] unless version.nil? + encoding = ENCODING.match(content) + encoding = encoding[1] unless encoding.nil? 
+ if need_source_encoding_update?(encoding) + @source.encoding = encoding + end + if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding + encoding = "UTF-16" + end + standalone = STANDALONE.match(content) + standalone = standalone[1] unless standalone.nil? + return [ :xmldecl, version, encoding, standalone ] end - standalone = STANDALONE.match(content) - standalone = standalone[1] unless standalone.nil? - return [ :xmldecl, version, encoding, standalone ] end [:processing_instruction, match_data[1], match_data[2]] end diff --git a/test/parse/test_processing_instruction.rb b/test/parse/test_processing_instruction.rb index f0c0c24e..54959f22 100644 --- a/test/parse/test_processing_instruction.rb +++ b/test/parse/test_processing_instruction.rb @@ -39,6 +39,23 @@ def test_garbage_text pi.content, ]) end + + def test_xml_declaration_not_at_document_start + exception = assert_raise(REXML::ParseException) do + parser = REXML::Parsers::BaseParser.new('<a><?xml version="1.0" ?></a>') + while parser.has_next? + parser.pull + end + end + + assert_equal(<<~DETAIL.chomp, exception.to_s) + Malformed XML: XML declaration other than at the top of the document. + Line: 1 + Position: 25 + Last 80 unconsumed characters: + </a> + DETAIL + end end end end