From 67d7615ecca726adea37c9e44b51f3aecc098ce2 Mon Sep 17 00:00:00 2001
From: Nathan Gray
Date: Tue, 6 Sep 2011 17:22:48 +0000
Subject: [PATCH] Fix config for HTML Tidy, fix stripped text in lists
---
etemplate/inc/class.bo_merge.inc.php | 19 +++++++++++++------
etemplate/templates/default/msoffice.xslt | 18 ++++++++++++------
etemplate/templates/default/wordml.xslt | 12 +++++++++++-
3 files changed, 36 insertions(+), 13 deletions(-)
diff --git a/etemplate/inc/class.bo_merge.inc.php b/etemplate/inc/class.bo_merge.inc.php
index d10c7ea51b..51de612177 100644
--- a/etemplate/inc/class.bo_merge.inc.php
+++ b/etemplate/inc/class.bo_merge.inc.php
@@ -61,9 +61,12 @@ abstract class bo_merge
* Configuration for HTML Tidy to clean up any HTML content that is kept
*/
public static $tidy_config = array(
- 'clean' => true,
- 'output-xhtml' => true,
+ 'output-xml' => true, // Entity encoding
'show-body-only' => true,
+ 'output-encoding' => 'utf8',
+ 'quote-ampersand' => false, // Prevent double encoding
+ 'quote-nbsp' => true, // XSLT can handle spaces easier
+ 'preserve-entities' => true,
);
/**
@@ -416,8 +419,8 @@ abstract class bo_merge
'/<\/(ol|ul|table)>/' => '$1>',
// Fix for things other than text (newlines) inside table row
'/<(td)( [^>]*)?>((?!))(.*?)<\/td>[\s]*?/' => '<$1$2>$4',
- '/<(li)(.*?)>(.*?)<\/\1>/' => '<$1 $2>$3$1>',
// Remove extra whitespace
+ '/]*?)>[^:print:]*?(.*?)<\/li>/' => '$2', // This doesn't get it all
'/[\s]+(.*?)<\/w:t>/' => '$1',
// Remove spans with no attributes, linebreaks inside them cause problems
'/(.*?)<\/span>/' => '$1'
@@ -431,6 +434,7 @@ abstract class bo_merge
$xslt->importStyleSheet($doc);
break;
}
+
// XSLT transform known tags
if($xslt)
{
@@ -439,6 +443,7 @@ abstract class bo_merge
$element = new SimpleXMLelement($content);
$content = @$xslt->transformToXml($element);
+//echo $content;die();
// Word 2003 needs two declarations, add extra declaration back in
if($mimetype == 'application/xml' && $mso_application_progid == 'Word.Document' && strpos($content, ''.$content;
@@ -734,9 +739,11 @@ abstract class bo_merge
if (is_string($value) && (strpos($value,'<') !== false))
{
// Clean HTML, if it's being kept
- if($replace_tags && extension_loaded('tidy'))
- {
- $value = tidy_repair_string($value, self::$tidy_config, 'utf8');
+ if($replace_tags && extension_loaded('tidy')) {
+ $value = tidy_repair_string($value, self::$tidy_config +
+ // Need to detect encoding to get special chars right
+ array('input-encoding'=>mb_detect_encoding($value))
+ );
}
// replace </p> and <br /> with CRLF (remove <p> and CRLF)
$value = str_replace(array("\r","\n",'
','
','
'),array('','','',"\r\n","\r\n"),$value);
diff --git a/etemplate/templates/default/msoffice.xslt b/etemplate/templates/default/msoffice.xslt
index d39af698c1..0d7fa89688 100644
--- a/etemplate/templates/default/msoffice.xslt
+++ b/etemplate/templates/default/msoffice.xslt
@@ -86,11 +86,7 @@ Breakers
-->
-
-
-
-
-
+
@@ -218,7 +214,17 @@ Breakers
-
+
+
+
+
+
+
+
+
+
+
+
diff --git a/etemplate/templates/default/wordml.xslt b/etemplate/templates/default/wordml.xslt
index d4be265251..fb6c4a9ee6 100644
--- a/etemplate/templates/default/wordml.xslt
+++ b/etemplate/templates/default/wordml.xslt
@@ -212,7 +212,17 @@ Breakers
-
+
+
+
+
+
+
+
+
+
+
+