From 4d49be6a28dcfddce7e9c6580996079d6a00fcc7 Mon Sep 17 00:00:00 2001 From: Bart Butler Date: Thu, 21 May 2020 17:34:25 -0700 Subject: [PATCH] Accept DOMDocument input --- src/Html2Text.php | 103 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 78 insertions(+), 25 deletions(-) diff --git a/src/Html2Text.php b/src/Html2Text.php index 5153cef..7f7d977 100644 --- a/src/Html2Text.php +++ b/src/Html2Text.php @@ -8,6 +8,7 @@ public static function defaultOptions() { return array( 'ignore_errors' => false, 'drop_links' => false, + 'is_office_document' => null, // auto-detect ); } @@ -22,29 +23,62 @@ public static function defaultOptions() { * * * @param string $html the input HTML - * @param boolean $ignore_error Ignore xml parsing errors + * @param array $options Parsing options * @return string the HTML converted, as best as possible, to text * @throws Html2TextException if the HTML could not be loaded as a {@link \DOMDocument} */ public static function convert($html, $options = array()) { - if ($options === false || $options === true) { - // Using old style (< 1.0) of passing in options - $options = array('ignore_errors' => $options); + $options = static::processOptions($options); + + if (!isset($options['is_office_document'])) { + $options['is_office_document'] = static::isOfficeDocument($html); } - $options = array_merge(static::defaultOptions(), $options); + $html = static::cleanHtml($html, $options); + $doc = static::getDocument($html, $options); + return static::convertDocument($doc, $options); + } - // check all options are valid - foreach ($options as $key => $value) { - if (!in_array($key, array_keys(static::defaultOptions()))) { - throw new \InvalidArgumentException("Unknown html2text option '$key'"); - } - } + /** + * Tries to convert the given DOMDocument into a plain text format - best suited for + * e-mail display, etc. + * + *

In particular, it tries to maintain the following features: + *

+ * + * @param \DOMDocument $doc the input DOMDocument + * @param array $options Parsing options + * @return string the document converted, as best as possible, to text + * @throws \InvalidArgumentException if $options contains an unknown option key + */ + public static function convertDocument($doc, $options = array()) { + + $options = static::processOptions($options); + + $output = static::iterateOverNode($doc, null, false, $options); + + // process output for whitespace/newlines + $output = static::processWhitespaceNewlines($output); + + return $output; + } + + /** + * HTML newline, entity, MSOffice namespace cleaning + * + * @param string $html + * @param array $options + * @return string cleaned HTML + */ + static function cleanHtml($html, $options = array()) { - $is_office_document = static::isOfficeDocument($html); + $options = static::processOptions($options); - if ($is_office_document) { + if ($options['is_office_document']) { // remove office namespace $html = str_replace(array("", ""), "", $html); } @@ -54,14 +88,31 @@ public static function convert($html, $options = array()) { $html = mb_convert_encoding($html, "HTML-ENTITIES", "UTF-8"); } - $doc = static::getDocument($html, $options['ignore_errors']); + return $html; + } + + /** + * Options pre-processing + * + * @param array $options + * @return array processed options array with defaults applied + */ + static function processOptions($options) { + if ($options === false || $options === true) { + // Using old style (< 1.0) of passing in options + $options = array('ignore_errors' => $options); + } - $output = static::iterateOverNode($doc, null, false, $is_office_document, $options); + $options = array_merge(static::defaultOptions(), $options); - // process output for whitespace/newlines - $output = static::processWhitespaceNewlines($output); + // check all options are valid + foreach ($options as $key => $value) { + if (!in_array($key, array_keys(static::defaultOptions()))) { + throw new 
\InvalidArgumentException("Unknown html2text option '$key'"); + } + } - return $output; + return $options; } /** @@ -124,7 +175,7 @@ static function processWhitespaceNewlines($text) { $text = preg_replace("/[ \t]*\n/im", "\n", $text); // unarmor pre blocks - $text = static::fixNewLines($text); + $text = static::fixNewlines($text); // remove unnecessary empty lines $text = preg_replace("/\n\n\n*/im", "\n\n", $text); @@ -136,10 +187,12 @@ static function processWhitespaceNewlines($text) { * Parse HTML into a DOMDocument * * @param string $html the input HTML - * @param boolean $ignore_error Ignore xml parsing errors + * @param array $options Parsing options * @return \DOMDocument the parsed document tree */ - static function getDocument($html, $ignore_error = false) { + static function getDocument($html, $options = array()) { + + $options = static::processOptions($options); $doc = new \DOMDocument(); @@ -159,7 +212,7 @@ static function getDocument($html, $ignore_error = false) { $html = '' . $html . ''; } - if ($ignore_error) { + if ($options['ignore_errors']) { $doc->strictErrorChecking = false; $doc->recover = true; $doc->xmlStandalone = true; @@ -228,7 +281,7 @@ static function nextChildName($node) { return $nextName; } - static function iterateOverNode($node, $prevName = null, $in_pre = false, $is_office_document = false, $options) { + static function iterateOverNode($node, $prevName = null, $in_pre = false, $options) { if ($node instanceof \DOMText) { // Replace whitespace characters with a space (equivilant to \s) if ($in_pre) { @@ -302,7 +355,7 @@ static function iterateOverNode($node, $prevName = null, $in_pre = false, $is_of // To fix this, for any p element with a className of `MsoNormal` (the standard // classname in any Microsoft export or outlook for a paragraph that behaves // like a line return) we skip the first line returns and set the name to br. 
- if ($is_office_document && $node->getAttribute('class') == 'MsoNormal') { + if ($options['is_office_document'] && $node->getAttribute('class') == 'MsoNormal') { $output = ""; $name = 'br'; break; @@ -349,7 +402,7 @@ static function iterateOverNode($node, $prevName = null, $in_pre = false, $is_of while ($n != null) { - $text = static::iterateOverNode($n, $previousSiblingName, $in_pre || $name == 'pre', $is_office_document, $options); + $text = static::iterateOverNode($n, $previousSiblingName, $in_pre || $name == 'pre', $options); // Pass current node name to next child, as previousSibling does not appear to get populated if ($n instanceof \DOMDocumentType