Fix config for HTML Tidy, fix stripped text in lists

This commit is contained in:
Nathan Gray 2011-09-06 17:22:48 +00:00
parent 8d5b990826
commit 67d7615ecc
3 changed files with 36 additions and 13 deletions

View File

@ -61,9 +61,12 @@ abstract class bo_merge
* Configuration for HTML Tidy to clean up any HTML content that is kept * Configuration for HTML Tidy to clean up any HTML content that is kept
*/ */
public static $tidy_config = array( public static $tidy_config = array(
'clean' => true, 'output-xml' => true, // Entity encoding
'output-xhtml' => true,
'show-body-only' => true, 'show-body-only' => true,
'output-encoding' => 'utf8',
'quote-ampersand' => false, // Prevent double encoding
'quote-nbsp' => true, // XSLT can handle spaces easier
'preserve-entities' => true,
); );
/** /**
@ -416,8 +419,8 @@ abstract class bo_merge
'/<\/(ol|ul|table)>/' => '</$1><w:p><w:r><w:t>', '/<\/(ol|ul|table)>/' => '</$1><w:p><w:r><w:t>',
// Fix for things other than text (newlines) inside table row // Fix for things other than text (newlines) inside table row
'/<(td)( [^>]*)?>((?!<w:t>))(.*?)<\/td>[\s]*?/' => '<$1$2><w:t>$4</w:t></td>', '/<(td)( [^>]*)?>((?!<w:t>))(.*?)<\/td>[\s]*?/' => '<$1$2><w:t>$4</w:t></td>',
'/<(li)(.*?)>(.*?)<\/\1>/' => '<$1 $2>$3</$1>',
// Remove extra whitespace // Remove extra whitespace
'/<li([^>]*?)>[^:print:]*?(.*?)<\/li>/' => '<li$1>$2</li>', // This doesn't get it all
'/<w:t>[\s]+(.*?)<\/w:t>/' => '<w:t>$1</w:t>', '/<w:t>[\s]+(.*?)<\/w:t>/' => '<w:t>$1</w:t>',
// Remove spans with no attributes, linebreaks inside them cause problems // Remove spans with no attributes, linebreaks inside them cause problems
'/<span>(.*?)<\/span>/' => '$1' '/<span>(.*?)<\/span>/' => '$1'
@ -431,6 +434,7 @@ abstract class bo_merge
$xslt->importStyleSheet($doc); $xslt->importStyleSheet($doc);
break; break;
} }
// XSLT transform known tags // XSLT transform known tags
if($xslt) if($xslt)
{ {
@ -439,6 +443,7 @@ abstract class bo_merge
$element = new SimpleXMLelement($content); $element = new SimpleXMLelement($content);
$content = @$xslt->transformToXml($element); $content = @$xslt->transformToXml($element);
//echo $content;die();
// Word 2003 needs two declarations, add extra declaration back in // Word 2003 needs two declarations, add extra declaration back in
if($mimetype == 'application/xml' && $mso_application_progid == 'Word.Document' && strpos($content, '<?xml') !== 0) { if($mimetype == 'application/xml' && $mso_application_progid == 'Word.Document' && strpos($content, '<?xml') !== 0) {
$content = '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'.$content; $content = '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'.$content;
@ -734,9 +739,11 @@ abstract class bo_merge
if (is_string($value) && (strpos($value,'<') !== false)) if (is_string($value) && (strpos($value,'<') !== false))
{ {
// Clean HTML, if it's being kept // Clean HTML, if it's being kept
if($replace_tags && extension_loaded('tidy')) if($replace_tags && extension_loaded('tidy')) {
{ $value = tidy_repair_string($value, self::$tidy_config +
$value = tidy_repair_string($value, self::$tidy_config, 'utf8'); // Need to detect encoding to get special chars right
array('input-encoding'=>mb_detect_encoding($value))
);
} }
// replace </p> and <br /> with CRLF (remove <p> and CRLF) // replace </p> and <br /> with CRLF (remove <p> and CRLF)
$value = str_replace(array("\r","\n",'<p>','</p>','<br />'),array('','','',"\r\n","\r\n"),$value); $value = str_replace(array("\r","\n",'<p>','</p>','<br />'),array('','','',"\r\n","\r\n"),$value);

View File

@ -86,11 +86,7 @@ Breakers
</xsl:template> </xsl:template>
--> -->
<xsl:template match="w:r[descendant::strong|descendant::em|descendant::u|descendant::span]"> <xsl:template name="apply-styles" match="w:r[descendant::strong|descendant::em|descendant::u|descendant::span]">
<xsl:call-template name="apply-styles"/>
</xsl:template>
<xsl:template name="apply-styles">
<xsl:for-each select="node()|@*[not(w:rPr)]"> <xsl:for-each select="node()|@*[not(w:rPr)]">
<xsl:choose> <xsl:choose>
<xsl:when test="descendant::strong|descendant::em|descendant::u|descendant::span" > <xsl:when test="descendant::strong|descendant::em|descendant::u|descendant::span" >
@ -218,7 +214,17 @@ Breakers
<xsl:choose> <xsl:choose>
<xsl:when test="count(child::*)=0"> <xsl:when test="count(child::*)=0">
<xsl:variable name="text"> <xsl:variable name="text">
<xsl:value-of select="substring-after(text(),' ')"/> <xsl:choose>
<xsl:when test="starts-with(text(), ' ')">
<xsl:value-of select="substring-after(text(),' ')"/>
</xsl:when>
<xsl:when test="starts-with(text(),'&#160;')">
<xsl:value-of select="substring-after(text(),'&#160;')"/>
</xsl:when>
<xsl:otherwise>
<xsl:value-of select="text()"/>
</xsl:otherwise>
</xsl:choose>
</xsl:variable> </xsl:variable>
<w:r><w:t> <w:r><w:t>
<xsl:value-of select="normalize-space($text)"/> <xsl:value-of select="normalize-space($text)"/>

View File

@ -212,7 +212,17 @@ Breakers
<xsl:choose> <xsl:choose>
<xsl:when test="count(child::*)=0"> <xsl:when test="count(child::*)=0">
<xsl:variable name="text"> <xsl:variable name="text">
<xsl:value-of select="substring-after(text(),' ')"/> <xsl:choose>
<xsl:when test="starts-with(text(), ' ')">
<xsl:value-of select="substring-after(text(),' ')"/>
</xsl:when>
<xsl:when test="starts-with(text(),'&#160;')">
<xsl:value-of select="substring-after(text(),'&#160;')"/>
</xsl:when>
<xsl:otherwise>
<xsl:value-of select="text()"/>
</xsl:otherwise>
</xsl:choose>
</xsl:variable> </xsl:variable>
<w:r><w:t> <w:r><w:t>
<xsl:value-of select="normalize-space($text)"/> <xsl:value-of select="normalize-space($text)"/>