diff --git a/src/Html2Text.php b/src/Html2Text.php
index 5153cef..7f7d977 100644
--- a/src/Html2Text.php
+++ b/src/Html2Text.php
@@ -8,6 +8,7 @@ public static function defaultOptions() {
return array(
'ignore_errors' => false,
'drop_links' => false,
+ 'is_office_document' => null, // auto-detect
);
}
@@ -22,29 +23,62 @@ public static function defaultOptions() {
*
*
* @param string $html the input HTML
- * @param boolean $ignore_error Ignore xml parsing errors
+ * @param array $options Parsing options
* @return string the HTML converted, as best as possible, to text
* @throws Html2TextException if the HTML could not be loaded as a {@link \DOMDocument}
*/
public static function convert($html, $options = array()) {
- if ($options === false || $options === true) {
- // Using old style (< 1.0) of passing in options
- $options = array('ignore_errors' => $options);
+ $options = static::processOptions($options);
+
+ if (!isset($options['is_office_document'])) {
+ $options['is_office_document'] = static::isOfficeDocument($html);
}
- $options = array_merge(static::defaultOptions(), $options);
+ $html = static::cleanHtml($html, $options);
+ $doc = static::getDocument($html, $options);
+ return static::convertDocument($doc, $options);
+ }
- // check all options are valid
- foreach ($options as $key => $value) {
- if (!in_array($key, array_keys(static::defaultOptions()))) {
- throw new \InvalidArgumentException("Unknown html2text option '$key'");
- }
- }
+ /**
+ * Tries to convert the given DOMDocument into a plain text format - best suited for
+ * e-mail display, etc.
+ *
+ *
+ * In particular, it tries to maintain the following features:
+ *
+ * - Links are maintained, with the 'href' copied over
+ *
+ * - Information in the <head> is lost
+ *
+ *
+ * @param \DOMDocument $doc the input DOMDocument
+ * @param array $options Parsing options
+ * @return string the HTML converted, as best as possible, to text
+ * @throws \InvalidArgumentException if $options contains an unknown option key
+ */
+ public static function convertDocument($doc, $options = array()) {
+
+ $options = static::processOptions($options);
+
+ $output = static::iterateOverNode($doc, null, false, $options);
+
+ // process output for whitespace/newlines
+ $output = static::processWhitespaceNewlines($output);
+
+ return $output;
+ }
+
+ /**
+ * HTML newline, entity, MSOffice namespace cleaning
+ *
+ * @param string $html
+ * @param array $options
+ * @return string cleaned HTML
+ */
+ static function cleanHtml($html, $options = array()) {
- $is_office_document = static::isOfficeDocument($html);
+ $options = static::processOptions($options);
- if ($is_office_document) {
+ if ($options['is_office_document']) {
// remove office namespace
$html = str_replace(array("", ""), "", $html);
}
@@ -54,14 +88,31 @@ public static function convert($html, $options = array()) {
$html = mb_convert_encoding($html, "HTML-ENTITIES", "UTF-8");
}
- $doc = static::getDocument($html, $options['ignore_errors']);
+ return $html;
+ }
+
+ /**
+ * Options pre-processing
+ *
+ * @param array $options
+ * @return array processed options array with defaults applied
+ */
+ static function processOptions($options) {
+ if ($options === false || $options === true) {
+ // Using old style (< 1.0) of passing in options
+ $options = array('ignore_errors' => $options);
+ }
- $output = static::iterateOverNode($doc, null, false, $is_office_document, $options);
+ $options = array_merge(static::defaultOptions(), $options);
- // process output for whitespace/newlines
- $output = static::processWhitespaceNewlines($output);
+ // check all options are valid
+ foreach ($options as $key => $value) {
+ if (!in_array($key, array_keys(static::defaultOptions()))) {
+ throw new \InvalidArgumentException("Unknown html2text option '$key'");
+ }
+ }
- return $output;
+ return $options;
}
/**
@@ -124,7 +175,7 @@ static function processWhitespaceNewlines($text) {
$text = preg_replace("/[ \t]*\n/im", "\n", $text);
// unarmor pre blocks
- $text = static::fixNewLines($text);
+ $text = static::fixNewlines($text);
// remove unnecessary empty lines
$text = preg_replace("/\n\n\n*/im", "\n\n", $text);
@@ -136,10 +187,12 @@ static function processWhitespaceNewlines($text) {
* Parse HTML into a DOMDocument
*
* @param string $html the input HTML
- * @param boolean $ignore_error Ignore xml parsing errors
+ * @param array $options Parsing options
* @return \DOMDocument the parsed document tree
*/
- static function getDocument($html, $ignore_error = false) {
+ static function getDocument($html, $options = array()) {
+
+ $options = static::processOptions($options);
$doc = new \DOMDocument();
@@ -159,7 +212,7 @@ static function getDocument($html, $ignore_error = false) {
$html = '' . $html . '';
}
- if ($ignore_error) {
+ if ($options['ignore_errors']) {
$doc->strictErrorChecking = false;
$doc->recover = true;
$doc->xmlStandalone = true;
@@ -228,7 +281,7 @@ static function nextChildName($node) {
return $nextName;
}
- static function iterateOverNode($node, $prevName = null, $in_pre = false, $is_office_document = false, $options) {
+ static function iterateOverNode($node, $prevName = null, $in_pre = false, $options) {
if ($node instanceof \DOMText) {
// Replace whitespace characters with a space (equivilant to \s)
if ($in_pre) {
@@ -302,7 +355,7 @@ static function iterateOverNode($node, $prevName = null, $in_pre = false, $is_of
// To fix this, for any p element with a className of `MsoNormal` (the standard
// classname in any Microsoft export or outlook for a paragraph that behaves
// like a line return) we skip the first line returns and set the name to br.
- if ($is_office_document && $node->getAttribute('class') == 'MsoNormal') {
+ if ($options['is_office_document'] && $node->getAttribute('class') == 'MsoNormal') {
$output = "";
$name = 'br';
break;
@@ -349,7 +402,7 @@ static function iterateOverNode($node, $prevName = null, $in_pre = false, $is_of
while ($n != null) {
- $text = static::iterateOverNode($n, $previousSiblingName, $in_pre || $name == 'pre', $is_office_document, $options);
+ $text = static::iterateOverNode($n, $previousSiblingName, $in_pre || $name == 'pre', $options);
// Pass current node name to next child, as previousSibling does not appear to get populated
if ($n instanceof \DOMDocumentType