diff --git a/packages/playground/data-liberation/bootstrap.php b/packages/playground/data-liberation/bootstrap.php index 1184403f7b..b08856e838 100644 --- a/packages/playground/data-liberation/bootstrap.php +++ b/packages/playground/data-liberation/bootstrap.php @@ -61,6 +61,7 @@ require_once __DIR__ . '/src/entity-readers/WP_Entity_Reader.php'; require_once __DIR__ . '/src/entity-readers/WP_HTML_Entity_Reader.php'; +require_once __DIR__ . '/src/entity-readers/WP_EPub_Entity_Reader.php'; require_once __DIR__ . '/src/entity-readers/WP_WXR_Entity_Reader.php'; require_once __DIR__ . '/src/entity-readers/WP_Directory_Tree_Entity_Reader.php'; diff --git a/packages/playground/data-liberation/src/block-markup/WP_HTML_To_Blocks.php b/packages/playground/data-liberation/src/block-markup/WP_HTML_To_Blocks.php index 329e75bc39..6aceeb9c53 100644 --- a/packages/playground/data-liberation/src/block-markup/WP_HTML_To_Blocks.php +++ b/packages/playground/data-liberation/src/block-markup/WP_HTML_To_Blocks.php @@ -28,31 +28,29 @@ class WP_HTML_To_Blocks implements WP_Block_Markup_Converter { private $state = self::STATE_READY; private $block_stack = array(); - private $html; + private $markup_processor; private $ignore_text = false; private $in_ephemeral_paragraph = false; private $block_markup = ''; private $metadata = array(); + private $last_error = null; - public function __construct( $html ) { - $this->html = WP_HTML_Processor::create_fragment( $html ); + public function __construct( $markup_processor ) { + $this->markup_processor = $markup_processor; } - /** - * @inheritDoc - */ public function convert() { if ( self::STATE_READY !== $this->state ) { return false; } - while ( $this->html->next_token() ) { - switch ( $this->html->get_token_type() ) { + while ( $this->markup_processor->next_token() ) { + switch ( $this->markup_processor->get_token_type() ) { case '#text': if ( $this->ignore_text ) { break; } - $this->append_html( htmlspecialchars( $this->html->get_modifiable_text() ) ); + $this->append_rich_text( htmlspecialchars( $this->markup_processor->get_modifiable_text() ) ); break; case '#tag': $this->handle_tag(); @@ -60,13 +58,16 @@ public function convert() { } } + if ( $this->markup_processor->get_last_error() ) { + $this->last_error = $this->markup_processor->get_last_error(); + return false; + } + $this->close_ephemeral_paragraph(); + return true; } - /** - * @inheritDoc - */ public function get_first_meta_value( $key ) { if ( ! array_key_exists( $key, $this->metadata ) ) { return null; @@ -74,231 +75,204 @@ public function get_first_meta_value( $key ) { return $this->metadata[ $key ][0]; } - /** - * @inheritDoc - */ public function get_all_metadata() { return $this->metadata; } - /** - * @inheritDoc - */ public function get_block_markup() { return $this->block_markup; } - /** - * Converts the currently matched HTML tag to block markup - * or metadata. - */ private function handle_tag() { - $html = $this->html; - $tag = $html->get_tag(); + $html = $this->markup_processor; + $tag = strtoupper( $html->get_tag() ); $tag_lowercase = strtolower( $tag ); - $is_opener = ! $html->is_tag_closer() && $html->expects_closer(); - $is_closer = $html->is_tag_closer(); - $is_void_tag = ! $html->expects_closer(); - $prefix = ( - $is_void_tag ? '' : ( - $is_closer ? '-' : '+' - ) - ); - $event = $prefix . $tag; - switch ( $event ) { - case 'META': - $key = $html->get_attribute( 'name' ); - $value = $html->get_attribute( 'content' ); - if ( ! array_key_exists( $key, $this->metadata ) ) { - $this->metadata[ $key ] = array(); - } - $this->metadata[ $key ][] = $value; - break; - case 'IMG': - $template = new \WP_HTML_Tag_Processor( '' ); - $template->next_tag(); - foreach ( array( 'alt', 'title', 'src' ) as $attr ) { - if ( $html->get_attribute( $attr ) ) { - $template->set_attribute( $attr, $html->get_attribute( $attr ) ); + $is_void_tag = ! $html->expects_closer() && ! $html->is_tag_closer(); + if ( $is_void_tag ) { + switch ( $tag ) { + case 'META': + $key = $html->get_attribute( 'name' ); + $value = $html->get_attribute( 'content' ); + if ( ! array_key_exists( $key, $this->metadata ) ) { + $this->metadata[ $key ] = array(); } - } - $this->append_html( $template->get_updated_html() ); - break; - case 'INPUT': - // Insert the input tag as HTML blocks. - $this->push_block( 'html' ); - $template = new \WP_HTML_Tag_Processor( '' ); - $template->next_tag(); - $attrs = $this->html->get_attribute_names_with_prefix( '' ); - foreach ( $attrs as $attr ) { - $template->set_attribute( $attr, $this->html->get_attribute( $attr ) ); - } - $this->append_html( htmlspecialchars( $template->get_updated_html() ) ); - $this->pop_block(); - break; - case 'HR': - $this->push_block( 'separator' ); - $this->block_markup .= '
'; - $this->pop_block(); - break; - - // Block elements - case '+SCRIPT': - $this->ignore_text = true; - break; - case '-SCRIPT': - $this->ignore_text = false; - break; - - case '+UL': - case '+OL': - $this->push_block( 'list', array( 'ordered' => $tag === 'ol' ) ); - $this->block_markup .= ''; - $this->pop_block(); - break; + $this->metadata[ $key ][] = $value; + break; + case 'IMG': + $template = new \WP_HTML_Tag_Processor( '' ); + $template->next_tag(); + foreach ( array( 'alt', 'title', 'src' ) as $attr ) { + if ( $html->get_attribute( $attr ) ) { + $template->set_attribute( $attr, $html->get_attribute( $attr ) ); + } + } + /** + * + */ + $this->append_rich_text( $template->get_updated_html() ); + break; + default: + // @TODO: What to do with other void tags, e.g. ? + // Just insert an HTML block or what? + break; + } + } elseif ( ! $html->is_tag_closer() ) { + switch ( $tag ) { + // Block elements + case 'SCRIPT': + $this->ignore_text = true; + break; + case 'UL': + case 'OL': + $this->push_block( 'list', array( 'ordered' => $tag === 'ol' ) ); + $this->block_markup .= ''; $this->pop_block(); - } - break; - - case '+P': - $this->push_block( 'paragraph' ); - $this->block_markup .= '

'; - break; - case '-P': - $this->block_markup .= '

'; - $this->pop_block(); - break; + break; - case '+H1': - case '+H2': - case '+H3': - case '+H4': - case '+H5': - case '+H6': - $this->push_block( - 'heading', - array( - 'level' => (int) $tag[1] ? (int) $tag[1] : 1, - ) - ); - $this->block_markup .= ''; - break; - case '-H1': - case '-H2': - case '-H3': - case '-H4': - case '-H5': - case '-H6': - $this->block_markup .= ''; - $this->pop_block(); - break; + case 'LI': + case 'BLOCKQUOTE': + case 'PRE': + case 'HR': + case 'P': + case 'H1': + case 'H2': + case 'H3': + case 'H4': + case 'H5': + case 'H6': + $this->block_markup .= ''; + $this->pop_block(); + break; - // Inline elements - case '+A': - $template = new \WP_HTML_Tag_Processor( '
' ); - $template->next_tag(); - if ( $html->get_attribute( 'href' ) ) { - $template->set_attribute( 'href', $html->get_attribute( 'href' ) ); - } - $this->append_html( $template->get_updated_html() ); - break; - case '-A': - $this->block_markup .= ''; - break; + case 'A': + $this->block_markup .= ''; + break; - // Formats – just pass through (minus the HTML attributes) - default: - if ( $this->should_preserve_tag_in_rich_text( $tag ) ) { - if ( $is_opener ) { - $this->append_html( '<' . $tag_lowercase . '>' ); - } elseif ( $is_closer ) { - $this->append_html( '' ); + // Formats + default: + if ( $this->should_preserve_tag_in_rich_text( $tag ) ) { + $this->block_markup .= ''; } - } else { - /* - * Ignore all the other tags. We've included all the meaningful - * handlers in the switch statement above and there's not much - * we can do with generic tags such as
, ,
, etc. - */ - } - break; + break; + } } } @@ -337,7 +311,7 @@ private function should_preserve_tag_in_rich_text( $tag ) { } private function is_at_inline_code_element() { - $breadcrumbs = $this->html->get_breadcrumbs(); + $breadcrumbs = $this->markup_processor->get_breadcrumbs(); foreach ( $breadcrumbs as $tag ) { switch ( $tag ) { case 'A': @@ -363,12 +337,12 @@ private function is_at_inline_code_element() { * * @param string $html The HTML snippet to append. */ - private function append_html( $html ) { + private function append_rich_text( $html ) { $html = trim( $html ); if ( empty( $html ) ) { return; } - // Make sure two subsequent append_html() calls don't merge the text. + // Make sure two subsequent append_text() calls don't merge the text. $html .= ' '; $this->ensure_open_block(); $this->block_markup .= $html; @@ -425,4 +399,8 @@ private function close_ephemeral_paragraph() { $this->in_ephemeral_paragraph = false; } } + + public function get_last_error() { + return $this->last_error; + } } diff --git a/packages/playground/data-liberation/src/entity-readers/WP_EPub_Entity_Reader.php b/packages/playground/data-liberation/src/entity-readers/WP_EPub_Entity_Reader.php new file mode 100644 index 0000000000..db7b8b9df3 --- /dev/null +++ b/packages/playground/data-liberation/src/entity-readers/WP_EPub_Entity_Reader.php @@ -0,0 +1,123 @@ +zip = $zip; + $this->current_post_id = $first_post_id; + } + + public function next_entity() { + if ( $this->last_error ) { + return false; + } + + if ( $this->finished ) { + return false; + } + + if ( null === $this->remaining_html_files ) { + $path = false; + foreach ( array( '/OEBPS', '/EPUB' ) as $path_candidate ) { + if ( $this->zip->is_dir( $path_candidate ) ) { + $path = $path_candidate; + break; + } + } + if ( false === $path ) { + _doing_it_wrong( __METHOD__, 'The EPUB file did not contain any HTML files.', '1.0.0' ); + $this->finished = true; + return false; + } + + $files = $this->zip->ls( $path ); + if ( false === $files ) { + _doing_it_wrong( __METHOD__, 'The EPUB file did not contain any HTML files.', '1.0.0' ); + $this->finished = true; + return false; + } + $this->remaining_html_files = array(); + foreach ( $files as $file ) { + if ( str_ends_with( $file, '.xhtml' ) || str_ends_with( $file, '.html' ) ) { + $this->remaining_html_files[] = $path . '/' . $file; + } + } + } + + while ( true ) { + if ( null !== $this->current_html_reader ) { + if ( + ! $this->current_html_reader->is_finished() && + $this->current_html_reader->next_entity() + ) { + return true; + } + if ( $this->current_html_reader->get_last_error() ) { + _doing_it_wrong( + __METHOD__, + 'The EPUB file did not contain any HTML files.', + '1.0.0' + ); + $this->finished = true; + return false; + } + } + + if ( count( $this->remaining_html_files ) === 0 ) { + $this->finished = true; + return false; + } + + $html_file = array_shift( $this->remaining_html_files ); + $html = $this->zip->read_file( $html_file ); + $this->current_html_reader = new WP_HTML_Entity_Reader( + WP_XML_Processor::create_from_string( $html ), + $this->current_post_id + ); + if ( $this->current_html_reader->get_last_error() ) { + $this->last_error = $this->current_html_reader->get_last_error(); + return false; + } + ++$this->current_post_id; + } + + return false; + } + + public function get_entity() { + return $this->current_html_reader->get_entity(); + } + + public function is_finished(): bool { + return $this->finished; + } + + public function get_last_error(): ?string { + return $this->last_error; + } +} diff --git a/packages/playground/data-liberation/src/entity-readers/WP_HTML_Entity_Reader.php b/packages/playground/data-liberation/src/entity-readers/WP_HTML_Entity_Reader.php index 95923ef390..f500d698de 100644 --- a/packages/playground/data-liberation/src/entity-readers/WP_HTML_Entity_Reader.php +++ b/packages/playground/data-liberation/src/entity-readers/WP_HTML_Entity_Reader.php @@ -5,18 +5,7 @@ */ class WP_HTML_Entity_Reader extends WP_Entity_Reader { - /** - * The HTML document to convert. - * - * @var string - */ - protected $html; - - /** - * The emitted entities. - * - * @var array - */ + protected $html_processor; protected $entities; /** @@ -32,16 +21,11 @@ class WP_HTML_Entity_Reader extends WP_Entity_Reader { * @var int */ protected $post_id; + protected $last_error; - /** - * Constructs the reader. - * - * @param string $html The HTML document to convert. - * @param int $post_id The ID to use as `post_id` of the emitted post entity. - */ - public function __construct( $html, $post_id ) { - $this->html = $html; - $this->post_id = $post_id; + public function __construct( $html_processor, $post_id ) { + $this->html_processor = $html_processor; + $this->post_id = $post_id; } /** @@ -66,8 +50,9 @@ public function next_entity() { } // We did not read any entities yet. Let's convert the HTML document into entities. - $converter = new WP_HTML_To_Blocks( $this->html ); + $converter = new WP_HTML_To_Blocks( $this->html_processor ); if ( false === $converter->convert() ) { + $this->last_error = $converter->get_last_error(); return false; } @@ -135,6 +120,6 @@ public function is_finished(): bool { * @return string|null The last error, or null if there was no error. */ public function get_last_error(): ?string { - return null; + return $this->last_error; } } diff --git a/packages/playground/data-liberation/src/xml-api/WP_XML_Processor.php b/packages/playground/data-liberation/src/xml-api/WP_XML_Processor.php index 881e689020..b6b2a7669e 100644 --- a/packages/playground/data-liberation/src/xml-api/WP_XML_Processor.php +++ b/packages/playground/data-liberation/src/xml-api/WP_XML_Processor.php @@ -1558,7 +1558,6 @@ private function parse_next_tag() { * See https://www.w3.org/TR/xml11.xml/#sec-cdata-sect */ if ( - ! $this->is_closing_tag && $doc_length > $this->token_starts_at + 8 && '[' === $xml[ $this->token_starts_at + 2 ] && 'C' === $xml[ $this->token_starts_at + 3 ] && @@ -1583,6 +1582,59 @@ private function parse_next_tag() { return true; } + /* + * Identify DOCTYPE nodes. + * + * See https://www.w3.org/TR/xml11.html/#dtd + */ + if ( + $doc_length > $this->token_starts_at + 8 && + 'D' === $xml[ $at + 2 ] && + 'O' === $xml[ $at + 3 ] && + 'C' === $xml[ $at + 4 ] && + 'T' === $xml[ $at + 5 ] && + 'Y' === $xml[ $at + 6 ] && + 'P' === $xml[ $at + 7 ] && + 'E' === $xml[ $at + 8 ] + ) { + $at += 9; + // Skip whitespace. + $at += strspn( $this->xml, " \t\f\r\n", $at ); + + if ( $doc_length <= $at ) { + $this->mark_incomplete_input( 'Unclosed DOCTYPE declaration.' ); + + return false; + } + + // @TODO: Expose the "name" value instead of skipping it like that + $at += $this->parse_name( $at ); + + // Skip whitespace. + $at += strspn( $this->xml, " \t\f\r\n", $at ); + + if ( $doc_length <= $at ) { + $this->mark_incomplete_input( 'Unclosed DOCTYPE declaration.' ); + return false; + } + + if ( $this->xml[ $at ] !== '>' ) { + $this->last_error = self::ERROR_SYNTAX; + _doing_it_wrong( + __METHOD__, + __( 'Unsupported DOCTYPE syntax. Only a simple is supported.' ), + 'WP_VERSION' + ); + return false; + } + + $closer_at = $at; + $this->parser_state = self::STATE_DOCTYPE_NODE; + $this->token_length = $closer_at + 1 - $this->token_starts_at; + $this->bytes_already_parsed = $closer_at + 1; + return true; + } + /* * Anything else here is either unsupported at this point or invalid * syntax. See the class-level @TODO annotations for more information. @@ -2471,6 +2523,22 @@ public function get_tag() { return null; } + /** + * Indicates if the currently matched tag is expected to be closed. + * Returns true for tag openers (
) and false for empty elements () and tag closers (
). + * + * This method exists to provide a consistent interface with WP_HTML_Processor. + * + * @return bool Whether the tag is expected to be closed. + */ + public function expects_closer() { + if ( self::STATE_MATCHED_TAG !== $this->parser_state ) { + return false; + } + + return $this->is_tag_opener() && ! $this->is_empty_element(); + } + /** * Indicates if the currently matched tag is an empty element tag. * @@ -2604,6 +2672,9 @@ public function get_token_name() { case self::STATE_CDATA_NODE: return '#cdata-section'; + case self::STATE_DOCTYPE_NODE: + return '#doctype'; + case self::STATE_XML_DECLARATION: return '#xml-declaration'; @@ -3030,10 +3101,11 @@ private function step_in_prolog( $node_to_process = self::PROCESS_NEXT_NODE ) { $this->last_error = self::ERROR_SYNTAX; _doing_it_wrong( __METHOD__, 'Unexpected token type in prolog stage.', 'WP_VERSION' ); } - return $this->step(); - case '#xml-declaration': + // @TODO: Fail if there's more than one or if was found before the XML declaration token. + case '#doctype': case '#comment': + case '#xml-declaration': case '#processing-instructions': return true; case '#tag': @@ -3393,6 +3465,18 @@ private function mark_incomplete_input( */ const STATE_CDATA_NODE = 'STATE_CDATA_NODE'; + /** + * Parser DOCTYPE Node State. + * + * Indicates that the parser has found a DOCTYPE declaration and it's possible + * to read and modify its modifiable text. + * + * @since WP_VERSION + * + * @access private + */ + const STATE_DOCTYPE_NODE = 'STATE_DOCTYPE_NODE'; + /** * Indicates that the parser has found an XML processing instruction. * diff --git a/packages/playground/data-liberation/tests/WPEPubEntityReaderTests.php b/packages/playground/data-liberation/tests/WPEPubEntityReaderTests.php new file mode 100644 index 0000000000..c6bf17248c --- /dev/null +++ b/packages/playground/data-liberation/tests/WPEPubEntityReaderTests.php @@ -0,0 +1,40 @@ +next_entity() ) { + $data = $reader->get_entity()->get_data(); + $entities[] = [ + 'type' => $reader->get_entity()->get_type(), + 'data' => $data, + ]; + } + $this->assertNull( $reader->get_last_error() ); + $this->assertEquals( 3, count($entities) ); + $this->assertGreaterThan( 100, strlen($entities[0]['data']['content']) ); + $this->assertGreaterThan( 1000, strlen($entities[1]['data']['content']) ); + $this->assertGreaterThan( 1000, strlen($entities[2]['data']['content']) ); + echo $entities[2]['data']['content']; + } + + public function epub_byte_reader_data_provider() { + return [ + 'Local file' => [ + \WordPress\ByteReader\WP_File_Reader::create( __DIR__ . '/fixtures/epub-entity-reader/childrens-literature.epub' ) + ], + 'Remote file' => [ + \WordPress\ByteReader\WP_Remote_File_Ranged_Reader::create( 'https://github.com/IDPF/epub3-samples/releases/download/20230704/childrens-literature.epub' ) + ], + ]; + } + +} diff --git a/packages/playground/data-liberation/tests/WPHTMLEntityReaderTests.php b/packages/playground/data-liberation/tests/WPHTMLEntityReaderTests.php index 0385dc8358..c8e32fc573 100644 --- a/packages/playground/data-liberation/tests/WPHTMLEntityReaderTests.php +++ b/packages/playground/data-liberation/tests/WPHTMLEntityReaderTests.php @@ -13,7 +13,7 @@ public function test_entity_reader() {

It is our pleasure to announce that WordPress 6.8 was released

Last week, WordPress 6.8 was released.

HTML; - $reader = new WP_HTML_Entity_Reader( $html, 1 ); + $reader = new WP_HTML_Entity_Reader( new WP_HTML_Processor( $html ), 1 ); $entities = []; while ( $reader->next_entity() ) { $data = $reader->get_entity()->get_data(); diff --git a/packages/playground/data-liberation/tests/WPHTMLToBlocksTests.php b/packages/playground/data-liberation/tests/WPHTMLToBlocksTests.php index 41d6ba8ae8..d3daef742e 100644 --- a/packages/playground/data-liberation/tests/WPHTMLToBlocksTests.php +++ b/packages/playground/data-liberation/tests/WPHTMLToBlocksTests.php @@ -16,7 +16,7 @@ public function test_metadata_extraction() {

WordPress 6.8 was released

Last week, WordPress 6.8 was released. This release includes a new default theme, a new block editor experience, and a new block library. It also includes a new block editor experience, and a new block library.

HTML; - $converter = new WP_HTML_To_Blocks( $html ); + $converter = new WP_HTML_To_Blocks( new WP_HTML_Processor( $html ) ); $converter->convert( $html ); $metadata = $converter->get_all_metadata(); $expected_metadata = [ @@ -35,7 +35,7 @@ public function test_metadata_extraction() { * @dataProvider provider_test_conversion */ public function test_html_to_blocks_conversion( $html, $expected ) { - $converter = new WP_HTML_To_Blocks( $html ); + $converter = new WP_HTML_To_Blocks( new WP_HTML_Processor( $html ) ); $converter->convert( $html ); $blocks = $converter->get_block_markup(); @@ -89,16 +89,12 @@ public function provider_test_conversion() { ], 'Formatted text' => [ 'html' => '

Bold and Italic

', - 'expected' => "

Bold and Italic

" + 'expected' => "

Bold and Italic

" ], 'A blockquote' => [ 'html' => '
A simple blockquote
', 'expected' => "
A simple blockquote
" ], - 'A an tag' => [ - 'html' => '', - 'expected' => "<input type="text" value="A simple input"> " - ], 'A table' => [ 'html' => << @@ -135,7 +131,7 @@ public function provider_test_conversion() { public function test_html_to_blocks_excerpt() { $input = file_get_contents( __DIR__ . '/fixtures/html-to-blocks/excerpt.input.html' ); - $converter = new WP_HTML_To_Blocks( $input ); + $converter = new WP_HTML_To_Blocks( new WP_HTML_Processor( $input ) ); $converter->convert( $input ); $blocks = $converter->get_block_markup(); @@ -145,7 +141,29 @@ public function test_html_to_blocks_excerpt() { } $this->assertEquals( file_get_contents( $output_file ), $blocks ); - + } + + public function test_xhtml_to_blocks_conversion() { + $input = << + + + +

Hello, world!

+

And some content

+ + +XML; + $converter = new WP_HTML_To_Blocks( WP_XML_Processor::create_from_string( $input ) ); + $converter->convert( $input ); + $blocks = $converter->get_block_markup(); + $expected = <<

Hello, world!

And some content

+HTML; + $this->assertEquals( + $this->normalize_markup( $expected ), + $this->normalize_markup( $blocks ) + ); } } diff --git a/packages/playground/data-liberation/tests/WPXMLProcessorTests.php b/packages/playground/data-liberation/tests/WPXMLProcessorTests.php index 2c3646dada..0e1dbf1ec4 100644 --- a/packages/playground/data-liberation/tests/WPXMLProcessorTests.php +++ b/packages/playground/data-liberation/tests/WPXMLProcessorTests.php @@ -1749,4 +1749,46 @@ public function test_pause_and_resume() { $this->assertEquals( 'Hello there', $resumed->get_modifiable_text(), 'Did not find the expected text.' ); } -} \ No newline at end of file + /** + * @ticket 61365 + * + * @covers WP_XML_Processor::next_token + */ + public function test_doctype_parsing() { + $processor = WP_XML_Processor::create_from_string( + 'Content' + ); + + $this->assertTrue( $processor->next_token(), 'Did not find DOCTYPE node' ); + $this->assertEquals( '#doctype', $processor->get_token_type(), 'Did not find DOCTYPE node' ); + $this->assertTrue( $processor->next_token(), 'Did not find root tag' ); + $this->assertEquals( 'root', $processor->get_tag(), 'Did not find root tag' ); + } + + /** + * @ticket 61365 + * + * @covers WP_XML_Processor::next_token + */ + public function test_unsupported_doctype_parsing() { + $processor = WP_XML_Processor::create_from_string( + 'Content' + ); + + $this->assertFalse( $processor->next_token(), 'Did not reject complex DOCTYPE' ); + $this->assertEquals( 'syntax', $processor->get_last_error(), 'Did not set syntax error' ); + } + + public function test_doctype_in_tag_content_is_syntax_error() { + $processor = WP_XML_Processor::create_from_string( + 'Content' + ); + + $processor->next_token(); + $processor->next_token(); + + $this->assertFalse( $processor->next_token(), 'Did not reject DOCTYPE in tag content' ); + $this->assertEquals( 'syntax', $processor->get_last_error(), 'Did not set syntax error' ); + } + +} diff --git a/packages/playground/data-liberation/tests/fixtures/epub-entity-reader/childrens-literature.epub b/packages/playground/data-liberation/tests/fixtures/epub-entity-reader/childrens-literature.epub new file mode 100644 index 0000000000..ba84a64399 Binary files /dev/null and b/packages/playground/data-liberation/tests/fixtures/epub-entity-reader/childrens-literature.epub differ diff --git a/packages/playground/data-liberation/tests/fixtures/html-to-blocks/excerpt.output.html b/packages/playground/data-liberation/tests/fixtures/html-to-blocks/excerpt.output.html index 2f4d9be782..e69de29bb2 100644 --- a/packages/playground/data-liberation/tests/fixtures/html-to-blocks/excerpt.output.html +++ b/packages/playground/data-liberation/tests/fixtures/html-to-blocks/excerpt.output.html @@ -1,2887 +0,0 @@ - -

WHATWG

-

HTML

- -

Living Standard — Last Updated 12 December 2024

- -

One-Page Version html.spec.whatwg.org Multipage Version /multipage Version for Web Devs /dev PDF Version /print.pdf Translations 日本語 • 简体中文 FAQ on GitHub Chat on Matrix Contribute on GitHub whatwg/html repository Commits on GitHub Snapshot as of this commit Twitter Updates @htmlstandard Open Issues filed on GitHub Open an Issue whatwg.org/newbug Tests web-platform-tests html/ Issues for Tests ongoing work

-
- -

Table of contents

- - - -

Full table of contents

- - - -

1 Introduction

- -

1.1 Where does this specification fit?

- -

This specification defines a big part of the web platform, in lots of detail. Its place in the - web platform specification stack relative to other specifications can be best summed up as - follows:

- -

1.2 Is this HTML5?

- -

This section is non-normative.

- -

In short: Yes.

- -

In more length: the term "HTML5" is widely used as a buzzword to refer to modern web - technologies, many of which (though by no means all) are developed at the WHATWG. This document is - one such; others are available from the WHATWG Standards - overview .

- -

1.3 Background

- -

This section is non-normative.

- -

HTML is the World Wide Web's core markup language. Originally, HTML was primarily designed as a - language for semantically describing scientific documents. Its general design, however, has - enabled it to be adapted, over the subsequent years, to describe a number of other types of - documents and even applications.

- -

1.4 Audience

- -

This section is non-normative.

- -

This specification is intended for authors of documents and scripts that use the features - defined in this specification, implementers of tools that operate on pages that - use the features defined in this specification, and individuals wishing to establish the - correctness of documents or implementations with respect to the requirements of this - specification.

- -

This document is probably not suited to readers who do not already have at least a passing - familiarity with web technologies, as in places it sacrifices clarity for precision, and brevity - for completeness. More approachable tutorials and authoring guides can provide a gentler - introduction to the topic.

- -

In particular, familiarity with the basics of DOM is necessary for a complete understanding of - some of the more technical parts of this specification. An understanding of Web IDL, HTTP, XML, - Unicode, character encodings, JavaScript, and CSS will also be helpful in places but is not - essential.

- -

1.5 Scope

- -

This section is non-normative.

- -

This specification is limited to providing a semantic-level markup language and associated - semantic-level scripting APIs for authoring accessible pages on the web ranging from static - documents to dynamic applications.

- -

The scope of this specification does not include providing mechanisms for media-specific - customization of presentation (although default rendering rules for web browsers are included at - the end of this specification, and several mechanisms for hooking into CSS are provided as part of - the language).

- -

The scope of this specification is not to describe an entire operating system. In particular, - hardware configuration software, image manipulation tools, and applications that users would be - expected to use with high-end workstations on a daily basis are out of scope. In terms of - applications, this specification is targeted specifically at applications that would be expected - to be used by users on an occasional basis, or regularly but from disparate locations, with low - CPU requirements. Examples of such applications include online purchasing systems, searching - systems, games (especially multiplayer online games), public telephone books or address books, - communications software (email clients, instant messaging clients, discussion software), document - editing software, etc.

- -

1.6 History

- -

This section is non-normative.

- -

For its first five years (1990-1995), HTML went through a number of revisions and experienced a