Skip to content

Commit

Permalink
[DomCrawler] ignore bad charsets
Browse files Browse the repository at this point in the history
  • Loading branch information
nicolas-grekas committed Jan 31, 2022
1 parent 60d3640 commit 46e7bd2
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 14 deletions.
21 changes: 7 additions & 14 deletions Crawler.php
Original file line number Diff line number Diff line change
Expand Up @@ -156,24 +156,17 @@ public function addContent($content, $type = null)
     return;
  }

- $charset = null;
- if (false !== $pos = stripos($type, 'charset=')) {
-     $charset = substr($type, $pos + 8);
-     if (false !== $pos = strpos($charset, ';')) {
-         $charset = substr($charset, 0, $pos);
-     }
- }
+ $charset = preg_match('//u', $content) ? 'UTF-8' : 'ISO-8859-1';

  // http://www.w3.org/TR/encoding/#encodings
  // http://www.w3.org/TR/REC-xml/#NT-EncName
- if (null === $charset &&
-     preg_match('/\<meta[^\>]+charset *= *["\']?([a-zA-Z\-0-9_:.]+)/i', $content, $matches)) {
-     $charset = $matches[1];
- }
+ $content = preg_replace_callback('/(charset *= *["\']?)([a-zA-Z\-0-9_:.]+)/i', function ($m) use (&$charset) {
+     if ('charset=' === $this->convertToHtmlEntities('charset=', $m[2])) {
+         $charset = $m[2];
+     }

- if (null === $charset) {
-     $charset = preg_match('//u', $content) ? 'UTF-8' : 'ISO-8859-1';
- }
+     return $m[1].$charset;
+ }, $content, 1);

  if ('x' === $xmlMatches[1]) {
      $this->addXmlContent($content, $charset);
Expand Down
4 changes: 4 additions & 0 deletions Tests/AbstractCrawlerTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,10 @@ public function testAddContent()
      $crawler = $this->createCrawler();
      $crawler->addContent($this->getDoctype().'<html><meta http-equiv="Content-Type" content="text/html; charset=utf-8" /><span>中文</span></html>');
      $this->assertEquals('中文', $crawler->filterXPath('//span')->text(), '->addContent() guess wrong charset');
+
+     $crawler = $this->createCrawler();
+     $crawler->addContent($this->getDoctype().'<html><meta http-equiv="Content-Type" content="text/html; charset=unicode" /><div class="foo"></html></html>');
+     $this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addContent() ignores bad charset');
  }

  /**
Expand Down

0 comments on commit 46e7bd2

Please sign in to comment.