Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Accept DOMDocument input #81

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
103 changes: 78 additions & 25 deletions src/Html2Text.php
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ public static function defaultOptions() {
return array(
'ignore_errors' => false,
'drop_links' => false,
'is_office_document' => null, // auto-detect
);
}

Expand All @@ -22,29 +23,62 @@ public static function defaultOptions() {
* </ul>
*
* @param string $html the input HTML
* @param boolean $ignore_error Ignore xml parsing errors
* @param array $options Parsing options
* @return string the HTML converted, as best as possible, to text
* @throws Html2TextException if the HTML could not be loaded as a {@link \DOMDocument}
*/
public static function convert($html, $options = array()) {

if ($options === false || $options === true) {
// Using old style (< 1.0) of passing in options
$options = array('ignore_errors' => $options);
$options = static::processOptions($options);

if (!isset($options['is_office_document'])) {
$options['is_office_document'] = static::isOfficeDocument($html);
}

$options = array_merge(static::defaultOptions(), $options);
$html = static::cleanHtml($html, $options);
$doc = static::getDocument($html, $options);
return static::convertDocument($doc, $options);
}

// check all options are valid
foreach ($options as $key => $value) {
if (!in_array($key, array_keys(static::defaultOptions()))) {
throw new \InvalidArgumentException("Unknown html2text option '$key'");
}
}
/**
 * Converts an already-parsed DOMDocument into a plain text format - best suited for
 * e-mail display, etc.
 *
 * <p>The document tree is walked node by node and the collected text then has its
 * whitespace and newlines normalised. In particular, it tries to maintain the
 * following features:
 * <ul>
 * <li>Links are maintained, with the 'href' copied over
 * <li>Information in the &lt;head&gt; is lost
 * </ul>
 *
 * @param \DOMDocument $doc the input DOMDocument
 * @param array $options Parsing options (a legacy boolean is treated as 'ignore_errors')
 * @return string the document converted, as best as possible, to text
 * @throws \InvalidArgumentException if $options contains an unknown option key
 */
public static function convertDocument($doc, $options = array()) {
    // apply the defaults and validate the option keys
    $settings = static::processOptions($options);

    // walk the tree, then process the collected text for whitespace/newlines
    return static::processWhitespaceNewlines(
        static::iterateOverNode($doc, null, false, $settings)
    );
}

/**
* HTML newline, entity, MSOffice namespace cleaning
*
* @param string $html
* @param array $options
* @return string cleaned HTML
*/
static function cleanHtml($html, $options = array()) {

$is_office_document = static::isOfficeDocument($html);
$options = static::processOptions($options);

if ($is_office_document) {
if ($options['is_office_document']) {
// remove office namespace
$html = str_replace(array("<o:p>", "</o:p>"), "", $html);
}
Expand All @@ -54,14 +88,31 @@ public static function convert($html, $options = array()) {
$html = mb_convert_encoding($html, "HTML-ENTITIES", "UTF-8");
}

$doc = static::getDocument($html, $options['ignore_errors']);
return $html;
}

/**
* Options pre-processing
*
* @param array $options
* @return array processed options array with defaults applied
*/
static function processOptions($options) {
if ($options === false || $options === true) {
// Using old style (< 1.0) of passing in options
$options = array('ignore_errors' => $options);
}

$output = static::iterateOverNode($doc, null, false, $is_office_document, $options);
$options = array_merge(static::defaultOptions(), $options);

// process output for whitespace/newlines
$output = static::processWhitespaceNewlines($output);
// check all options are valid
foreach ($options as $key => $value) {
if (!in_array($key, array_keys(static::defaultOptions()))) {
throw new \InvalidArgumentException("Unknown html2text option '$key'");
}
}

return $output;
return $options;
}

/**
Expand Down Expand Up @@ -124,7 +175,7 @@ static function processWhitespaceNewlines($text) {
$text = preg_replace("/[ \t]*\n/im", "\n", $text);

// unarmor pre blocks
$text = static::fixNewLines($text);
$text = static::fixNewlines($text);

// remove unnecessary empty lines
$text = preg_replace("/\n\n\n*/im", "\n\n", $text);
Expand All @@ -136,10 +187,12 @@ static function processWhitespaceNewlines($text) {
* Parse HTML into a DOMDocument
*
* @param string $html the input HTML
* @param boolean $ignore_error Ignore xml parsing errors
* @param array $options Parsing options
* @return \DOMDocument the parsed document tree
*/
static function getDocument($html, $ignore_error = false) {
static function getDocument($html, $options = array()) {

$options = static::processOptions($options);

$doc = new \DOMDocument();

Expand All @@ -159,7 +212,7 @@ static function getDocument($html, $ignore_error = false) {
$html = '<body>' . $html . '</body>';
}

if ($ignore_error) {
if ($options['ignore_errors']) {
$doc->strictErrorChecking = false;
$doc->recover = true;
$doc->xmlStandalone = true;
Expand Down Expand Up @@ -228,7 +281,7 @@ static function nextChildName($node) {
return $nextName;
}

static function iterateOverNode($node, $prevName = null, $in_pre = false, $is_office_document = false, $options) {
static function iterateOverNode($node, $prevName = null, $in_pre = false, $options) {
if ($node instanceof \DOMText) {
// Replace whitespace characters with a space (equivalent to \s)
if ($in_pre) {
Expand Down Expand Up @@ -302,7 +355,7 @@ static function iterateOverNode($node, $prevName = null, $in_pre = false, $is_of
// To fix this, for any p element with a className of `MsoNormal` (the standard
// classname in any Microsoft export or outlook for a paragraph that behaves
// like a line return) we skip the first line returns and set the name to br.
if ($is_office_document && $node->getAttribute('class') == 'MsoNormal') {
if ($options['is_office_document'] && $node->getAttribute('class') == 'MsoNormal') {
$output = "";
$name = 'br';
break;
Expand Down Expand Up @@ -349,7 +402,7 @@ static function iterateOverNode($node, $prevName = null, $in_pre = false, $is_of

while ($n != null) {

$text = static::iterateOverNode($n, $previousSiblingName, $in_pre || $name == 'pre', $is_office_document, $options);
$text = static::iterateOverNode($n, $previousSiblingName, $in_pre || $name == 'pre', $options);

// Pass current node name to next child, as previousSibling does not appear to get populated
if ($n instanceof \DOMDocumentType
Expand Down