From 67d7615ecca726adea37c9e44b51f3aecc098ce2 Mon Sep 17 00:00:00 2001 From: Nathan Gray Date: Tue, 6 Sep 2011 17:22:48 +0000 Subject: [PATCH] Fix config for HTML Tidy, fix stripped text in lists --- etemplate/inc/class.bo_merge.inc.php | 19 +++++++++++++------ etemplate/templates/default/msoffice.xslt | 18 ++++++++++++------ etemplate/templates/default/wordml.xslt | 12 +++++++++++- 3 files changed, 36 insertions(+), 13 deletions(-) diff --git a/etemplate/inc/class.bo_merge.inc.php b/etemplate/inc/class.bo_merge.inc.php index d10c7ea51b..51de612177 100644 --- a/etemplate/inc/class.bo_merge.inc.php +++ b/etemplate/inc/class.bo_merge.inc.php @@ -61,9 +61,12 @@ abstract class bo_merge * Configuration for HTML Tidy to clean up any HTML content that is kept */ public static $tidy_config = array( - 'clean' => true, - 'output-xhtml' => true, + 'output-xml' => true, // Entity encoding 'show-body-only' => true, + 'output-encoding' => 'utf8', + 'quote-ampersand' => false, // Prevent double encoding + 'quote-nbsp' => true, // XSLT can handle spaces easier + 'preserve-entities' => true, ); /** @@ -416,8 +419,8 @@ abstract class bo_merge '/<\/(ol|ul|table)>/' => '', // Fix for things other than text (newlines) inside table row '/<(td)( [^>]*)?>((?!))(.*?)<\/td>[\s]*?/' => '<$1$2>$4', - '/<(li)(.*?)>(.*?)<\/\1>/' => '<$1 $2>$3', // Remove extra whitespace + '/]*?)>[^:print:]*?(.*?)<\/li>/' => '$2', // This doesn't get it all '/[\s]+(.*?)<\/w:t>/' => '$1', // Remove spans with no attributes, linebreaks inside them cause problems '/(.*?)<\/span>/' => '$1' @@ -431,6 +434,7 @@ abstract class bo_merge $xslt->importStyleSheet($doc); break; } + // XSLT transform known tags if($xslt) { @@ -439,6 +443,7 @@ abstract class bo_merge $element = new SimpleXMLelement($content); $content = @$xslt->transformToXml($element); +//echo $content;die(); // Word 2003 needs two declarations, add extra declaration back in if($mimetype == 'application/xml' && $mso_application_progid == 'Word.Document' && 
strpos($content, ''.$content; @@ -734,9 +739,11 @@ abstract class bo_merge if (is_string($value) && (strpos($value,'<') !== false)) { // Clean HTML, if it's being kept - if($replace_tags && extension_loaded('tidy')) - { - $value = tidy_repair_string($value, self::$tidy_config, 'utf8'); + if($replace_tags && extension_loaded('tidy')) { + $value = tidy_repair_string($value, self::$tidy_config + + // Need to detect encoding to get special chars right + array('input-encoding'=>mb_detect_encoding($value)) + ); } // replace

</p> and <br /> with CRLF (remove <p> and CRLF) $value = str_replace(array("\r","\n",'<p>','</p>','<br />
'),array('','','',"\r\n","\r\n"),$value); diff --git a/etemplate/templates/default/msoffice.xslt b/etemplate/templates/default/msoffice.xslt index d39af698c1..0d7fa89688 100644 --- a/etemplate/templates/default/msoffice.xslt +++ b/etemplate/templates/default/msoffice.xslt @@ -86,11 +86,7 @@ Breakers --> - - - - - + @@ -218,7 +214,17 @@ Breakers - + + + + + + + + + + + diff --git a/etemplate/templates/default/wordml.xslt b/etemplate/templates/default/wordml.xslt index d4be265251..fb6c4a9ee6 100644 --- a/etemplate/templates/default/wordml.xslt +++ b/etemplate/templates/default/wordml.xslt @@ -212,7 +212,17 @@ Breakers - + + + + + + + + + + +