Skip to content

Commit

Permalink
Accept DOMDocument input
Browse files Browse the repository at this point in the history
  • Loading branch information
Bart Butler committed May 28, 2020
1 parent 3243a71 commit 4d49be6
Showing 1 changed file with 78 additions and 25 deletions.
103 changes: 78 additions & 25 deletions src/Html2Text.php
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ public static function defaultOptions() {
return array(
'ignore_errors' => false,
'drop_links' => false,
'is_office_document' => null, // auto-detect
);
}

Expand All @@ -22,29 +23,62 @@ public static function defaultOptions() {
* </ul>
*
* @param string $html the input HTML
* @param boolean $ignore_error Ignore xml parsing errors
* @param array $options Parsing options
* @return string the HTML converted, as best as possible, to text
* @throws Html2TextException if the HTML could not be loaded as a {@link \DOMDocument}
*/
public static function convert($html, $options = array()) {

if ($options === false || $options === true) {
// Using old style (< 1.0) of passing in options
$options = array('ignore_errors' => $options);
$options = static::processOptions($options);

if (!isset($options['is_office_document'])) {
$options['is_office_document'] = static::isOfficeDocument($html);
}

$options = array_merge(static::defaultOptions(), $options);
$html = static::cleanHtml($html, $options);
$doc = static::getDocument($html, $options);
return static::convertDocument($doc, $options);
}

// check all options are valid
foreach ($options as $key => $value) {
if (!in_array($key, array_keys(static::defaultOptions()))) {
throw new \InvalidArgumentException("Unknown html2text option '$key'");
}
}
/**
 * Converts an already-parsed DOMDocument into a plain text format - best suited
 * for e-mail display, etc.
 *
 * <p>In particular, the conversion tries to maintain the following features:
 * <ul>
 * <li>Links are maintained, with the 'href' copied over
 * <li>Information in the &lt;head&gt; is lost
 * </ul>
 *
 * <p>Unlike {@link convert()}, no HTML string is cleaned or parsed here; the
 * supplied document is walked as given.
 *
 * @param \DOMDocument $doc the input DOMDocument
 * @param array|boolean $options Parsing options; a bare boolean is treated as the
 *        old style (< 1.0) 'ignore_errors' flag
 * @return string the document converted, as best as possible, to text
 * @throws \InvalidArgumentException if $options contains an unknown option key
 */
public static function convertDocument($doc, $options = array()) {
    // Normalise the caller's options into a full, validated option set.
    $resolvedOptions = static::processOptions($options);

    // Walk the tree from the document itself: no previous sibling name, not inside <pre>.
    $rawText = static::iterateOverNode($doc, null, false, $resolvedOptions);

    // Tidy up whitespace and newlines in the collected text before returning it.
    return static::processWhitespaceNewlines($rawText);
}

/**
* HTML newline, entity, MSOffice namespace cleaning
*
* @param string $html
* @param array $options
* @return string cleaned HTML
*/
static function cleanHtml($html, $options = array()) {

$is_office_document = static::isOfficeDocument($html);
$options = static::processOptions($options);

if ($is_office_document) {
if ($options['is_office_document']) {
// remove office namespace
$html = str_replace(array("<o:p>", "</o:p>"), "", $html);
}
Expand All @@ -54,14 +88,31 @@ public static function convert($html, $options = array()) {
$html = mb_convert_encoding($html, "HTML-ENTITIES", "UTF-8");
}

$doc = static::getDocument($html, $options['ignore_errors']);
return $html;
}

/**
* Options pre-processing
*
* @param array $options
* @return array processed options array with defaults applied
*/
static function processOptions($options) {
if ($options === false || $options === true) {
// Using old style (< 1.0) of passing in options
$options = array('ignore_errors' => $options);
}

$output = static::iterateOverNode($doc, null, false, $is_office_document, $options);
$options = array_merge(static::defaultOptions(), $options);

// process output for whitespace/newlines
$output = static::processWhitespaceNewlines($output);
// check all options are valid
foreach ($options as $key => $value) {
if (!in_array($key, array_keys(static::defaultOptions()))) {
throw new \InvalidArgumentException("Unknown html2text option '$key'");
}
}

return $output;
return $options;
}

/**
Expand Down Expand Up @@ -124,7 +175,7 @@ static function processWhitespaceNewlines($text) {
$text = preg_replace("/[ \t]*\n/im", "\n", $text);

// unarmor pre blocks
$text = static::fixNewLines($text);
$text = static::fixNewlines($text);

// remove unnecessary empty lines
$text = preg_replace("/\n\n\n*/im", "\n\n", $text);
Expand All @@ -136,10 +187,12 @@ static function processWhitespaceNewlines($text) {
* Parse HTML into a DOMDocument
*
* @param string $html the input HTML
* @param boolean $ignore_error Ignore xml parsing errors
* @param array $options Parsing options
* @return \DOMDocument the parsed document tree
*/
static function getDocument($html, $ignore_error = false) {
static function getDocument($html, $options = array()) {

$options = static::processOptions($options);

$doc = new \DOMDocument();

Expand All @@ -159,7 +212,7 @@ static function getDocument($html, $ignore_error = false) {
$html = '<body>' . $html . '</body>';
}

if ($ignore_error) {
if ($options['ignore_errors']) {
$doc->strictErrorChecking = false;
$doc->recover = true;
$doc->xmlStandalone = true;
Expand Down Expand Up @@ -228,7 +281,7 @@ static function nextChildName($node) {
return $nextName;
}

static function iterateOverNode($node, $prevName = null, $in_pre = false, $is_office_document = false, $options) {
static function iterateOverNode($node, $prevName = null, $in_pre = false, $options) {
if ($node instanceof \DOMText) {
// Replace whitespace characters with a space (equivalent to \s)
if ($in_pre) {
Expand Down Expand Up @@ -302,7 +355,7 @@ static function iterateOverNode($node, $prevName = null, $in_pre = false, $is_of
// To fix this, for any p element with a className of `MsoNormal` (the standard
// classname in any Microsoft export or outlook for a paragraph that behaves
// like a line return) we skip the first line returns and set the name to br.
if ($is_office_document && $node->getAttribute('class') == 'MsoNormal') {
if ($options['is_office_document'] && $node->getAttribute('class') == 'MsoNormal') {
$output = "";
$name = 'br';
break;
Expand Down Expand Up @@ -349,7 +402,7 @@ static function iterateOverNode($node, $prevName = null, $in_pre = false, $is_of

while ($n != null) {

$text = static::iterateOverNode($n, $previousSiblingName, $in_pre || $name == 'pre', $is_office_document, $options);
$text = static::iterateOverNode($n, $previousSiblingName, $in_pre || $name == 'pre', $options);

// Pass current node name to next child, as previousSibling does not appear to get populated
if ($n instanceof \DOMDocumentType
Expand Down

0 comments on commit 4d49be6

Please sign in to comment.