Skip to content

Commit

Permalink
Merge pull request #544 from osapon/master
Browse files: browse the repository at this point in the history
(fix): Improved recognition of text encoding
  • Loading branch information
oscarotero authored Nov 4, 2024
2 parents 85dccc6 + d6ac21f commit 1d6aa33
Show file tree
Hide file tree
Showing 7 changed files with 2,607 additions and 1 deletion.
2 changes: 2 additions & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,5 @@
.php_cs.dist export-ignore
.travis.yml export-ignore
phpunit.xml.dist export-ignore
/tests/cache/4pda.to.2022-12-04-406834-sostoyalsya_reliz_clown_of_duty_parodii_na_call_of_duty.php working-tree-encoding=windows-1251 diff=windows-1251
/tests/cache/www.itmedia.co.jp.news-articles-2410-28-news159.html.php working-tree-encoding=sjis diff=sjis
13 changes: 12 additions & 1 deletion src/Document.php
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,18 @@ public function __construct(Extractor $extractor)
$html = str_replace('<br>', "\n<br>", $html);
$html = str_replace('<br ', "\n<br ", $html);

$this->document = !empty($html) ? Parser::parse($html) : new DOMDocument();
$encoding = null;
$contentType = $extractor->getResponse()->getHeaderLine('content-type');
preg_match('/charset="?(.*?)(?=$|\s|;|")/i', $contentType, $match);
if (!empty($match[1])) {
$encoding = $match[1];
} elseif (!empty($html)) {
preg_match('/charset="?(.*?)(?=$|\s|;|")/i', $html, $match);
if (!empty($match[1])) {
$encoding = $match[1];
}
}
$this->document = !empty($html) ? Parser::parse($html, $encoding) : new DOMDocument();
$this->initXPath();
}

Expand Down
2 changes: 2 additions & 0 deletions tests/PagesTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ public function urlDataProvider(): array
['http://www.ustream.tv/channel/red-shoes-billiards-60803-camera-1'],
['http://www.viddler.com/v/bdce8c7'],
['http://www.wired.com/?p=2064839'],
['https://www.itmedia.co.jp/news/articles/2410/28/news159.html'],
['https://4pda.to/2022/12/04/406834/sostoyalsya_reliz_clown_of_duty_parodii_na_call_of_duty/'],
];
}

Expand Down

Large diffs are not rendered by default.

Loading

0 comments on commit 1d6aa33

Please sign in to comment.