Reintroduce tidy in calls from the egw-mail-app where only getCleanHTML is called, as tidy corrects HTML structure issues for us

This commit is contained in:
Klaus Leithoff 2015-10-27 09:16:37 +00:00
parent 9ec5116d7e
commit 714c75ef6d
2 changed files with 59 additions and 3 deletions

View File

@ -2132,6 +2132,22 @@ class mail_compose
static $nonDisplayAbleCharacters = array('[\016]','[\017]', static $nonDisplayAbleCharacters = array('[\016]','[\017]',
'[\020]','[\021]','[\022]','[\023]','[\024]','[\025]','[\026]','[\027]', '[\020]','[\021]','[\022]','[\023]','[\024]','[\025]','[\026]','[\027]',
'[\030]','[\031]','[\032]','[\033]','[\034]','[\035]','[\036]','[\037]'); '[\030]','[\031]','[\032]','[\033]','[\034]','[\035]','[\036]','[\037]');
if (extension_loaded('tidy'))
{
$tidy = new tidy();
$cleaned = $tidy->repairString($_body, mail_bo::$tidy_config,'utf8');
// tidy reported errors (status 2): only log them and keep the original, unrepaired body
if($tidy->getStatus() == 2)
{
error_log(__METHOD__.' ('.__LINE__.') '.' ->'.$tidy->errorBuffer);
}
else
{
$_body = $cleaned;
}
}
mail_bo::getCleanHTML($_body); mail_bo::getCleanHTML($_body);
return preg_replace($nonDisplayAbleCharacters, '', $_body); return preg_replace($nonDisplayAbleCharacters, '', $_body);
} }

View File

@ -2801,10 +2801,50 @@ class mail_ui
$newBody = $singleBodyPart['body']; $newBody = $singleBodyPart['body'];
//TODO:$newBody = $this->highlightQuotes($newBody); //TODO:$newBody = $this->highlightQuotes($newBody);
#error_log(print_r($newBody,true)); #error_log(print_r($newBody,true));
if (extension_loaded('tidy'))
{
$tidy = new tidy();
$cleaned = $tidy->repairString($newBody, mail_bo::$tidy_config,'utf8');
// tidy reported errors (status 2): only log them and keep the original, unrepaired body
if($tidy->getStatus() == 2)
{
error_log(__METHOD__.' ('.__LINE__.') '.' ->'.$tidy->errorBuffer);
}
else
{
$newBody = $cleaned;
}
if (!$preserveHTML)
{
// filter only the 'body', as we only want that part, if we throw away the html
preg_match('`(<htm.+?<body[^>]*>)(.+?)(</body>.*?</html>)`ims', $newBody, $matches=array());
if ($matches[2])
{
$hasOther = true;
$newBody = $matches[2];
}
}
}
else
{
// htmLawed filter only the 'body'
preg_match('`(<htm.+?<body[^>]*>)(.+?)(</body>.*?</html>)`ims', $newBody, $matches=array());
if ($matches[2])
{
$hasOther = true;
$newBody = $matches[2];
}
$htmLawed = new egw_htmLawed();
// the next line should not be needed, but produces better results on HTML 2 Text conversion,
// as we switched off HTMLaweds tidy functionality
$newBody = str_replace(array('&amp;amp;','<DIV><BR></DIV>',"<DIV>&nbsp;</DIV>",'<div>&nbsp;</div>'),array('&amp;','<BR>','<BR>','<BR>'),$newBody);
$newBody = $htmLawed->egw_htmLawed($newBody);
if ($hasOther && $preserveHTML) $newBody = $matches[1]. $newBody. $matches[3];
}
// do the cleanup, set for the use of purifier // do the cleanup, set for the use of purifier
$newBodyBuff = $newBody; //$newBodyBuff = $newBody;
mail_bo::getCleanHTML($newBody); mail_bo::getCleanHTML($newBody);
/*
// in a way, this tests if we are having real utf-8 (the displayCharset) by now; we should if charsets reported (or detected) are correct // in a way, this tests if we are having real utf-8 (the displayCharset) by now; we should if charsets reported (or detected) are correct
if (strtoupper(mail_bo::$displayCharset) == 'UTF-8') if (strtoupper(mail_bo::$displayCharset) == 'UTF-8')
{ {
@ -2819,7 +2859,7 @@ class mail_ui
mail_bo::$htmLawed_config['tidy'] = $tv; mail_bo::$htmLawed_config['tidy'] = $tv;
} }
} }
*/
// removes stuff between http and ?http // removes stuff between http and ?http
$Protocol = '(http:\/\/|(ftp:\/\/|https:\/\/))'; // only http:// gets removed, other protocolls are shown $Protocol = '(http:\/\/|(ftp:\/\/|https:\/\/))'; // only http:// gets removed, other protocolls are shown
$newBody = preg_replace('~'.$Protocol.'[^>]*\?'.$Protocol.'~sim','$1',$newBody); // removes stuff between http:// and ?http:// $newBody = preg_replace('~'.$Protocol.'[^>]*\?'.$Protocol.'~sim','$1',$newBody); // removes stuff between http:// and ?http://