From 714c75ef6daf31eeec36f1d88cb8819839e938c7 Mon Sep 17 00:00:00 2001 From: Klaus Leithoff Date: Tue, 27 Oct 2015 09:16:37 +0000 Subject: [PATCH] reintroduce tidy in calls from egw-mail-app when only getCleanHTML is called, as it is correcting html structure issues for us --- mail/inc/class.mail_compose.inc.php | 16 ++++++++++ mail/inc/class.mail_ui.inc.php | 46 +++++++++++++++++++++++++++-- 2 files changed, 59 insertions(+), 3 deletions(-) diff --git a/mail/inc/class.mail_compose.inc.php b/mail/inc/class.mail_compose.inc.php index b98a44606e..fadbf0851a 100644 --- a/mail/inc/class.mail_compose.inc.php +++ b/mail/inc/class.mail_compose.inc.php @@ -2132,6 +2132,22 @@ class mail_compose static $nonDisplayAbleCharacters = array('[\016]','[\017]', '[\020]','[\021]','[\022]','[\023]','[\024]','[\025]','[\026]','[\027]', '[\030]','[\031]','[\032]','[\033]','[\034]','[\035]','[\036]','[\037]'); + + if (extension_loaded('tidy')) + { + $tidy = new tidy(); + $cleaned = $tidy->repairString($_body, mail_bo::$tidy_config,'utf8'); + // Found errors. Strip it all so there's some output + if($tidy->getStatus() == 2) + { + error_log(__METHOD__.' ('.__LINE__.') '.' ->'.$tidy->errorBuffer); + } + else + { + $_body = $cleaned; + } + } + mail_bo::getCleanHTML($_body); return preg_replace($nonDisplayAbleCharacters, '', $_body); } diff --git a/mail/inc/class.mail_ui.inc.php b/mail/inc/class.mail_ui.inc.php index 589d14efda..9f733fc1bf 100644 --- a/mail/inc/class.mail_ui.inc.php +++ b/mail/inc/class.mail_ui.inc.php @@ -2801,10 +2801,50 @@ class mail_ui $newBody = $singleBodyPart['body']; //TODO:$newBody = $this->highlightQuotes($newBody); #error_log(print_r($newBody,true)); - + if (extension_loaded('tidy')) + { + $tidy = new tidy(); + $cleaned = $tidy->repairString($newBody, mail_bo::$tidy_config,'utf8'); + // Found errors. Strip it all so there's some output + if($tidy->getStatus() == 2) + { + error_log(__METHOD__.' ('.__LINE__.') '.' ->'.$tidy->errorBuffer); + } + else + { + $newBody = $cleaned; + } + if (!$preserveHTML) + { + // filter only the 'body', as we only want that part, if we throw away the html + preg_match('`(]*>)(.+?)(.*?)`ims', $newBody, $matches=array()); + if ($matches[2]) + { + $hasOther = true; + $newBody = $matches[2]; + } + } + } + else + { + // htmLawed filter only the 'body' + preg_match('`(]*>)(.+?)(.*?)`ims', $newBody, $matches=array()); + if ($matches[2]) + { + $hasOther = true; + $newBody = $matches[2]; + } + $htmLawed = new egw_htmLawed(); + // the next line should not be needed, but produces better results on HTML 2 Text conversion, + // as we switched off HTMLaweds tidy functionality + $newBody = str_replace(array('&','

',"
 
",'
 
'),array('&','
','
','
'),$newBody); + $newBody = $htmLawed->egw_htmLawed($newBody); + if ($hasOther && $preserveHTML) $newBody = $matches[1]. $newBody. $matches[3]; + } // do the cleanup, set for the use of purifier - $newBodyBuff = $newBody; + //$newBodyBuff = $newBody; mail_bo::getCleanHTML($newBody); +/* // in a way, this tests if we are having real utf-8 (the displayCharset) by now; we should if charsets reported (or detected) are correct if (strtoupper(mail_bo::$displayCharset) == 'UTF-8') { @@ -2819,7 +2859,7 @@ class mail_ui mail_bo::$htmLawed_config['tidy'] = $tv; } } - +*/ // removes stuff between http and ?http $Protocol = '(http:\/\/|(ftp:\/\/|https:\/\/))'; // only http:// gets removed, other protocolls are shown $newBody = preg_replace('~'.$Protocol.'[^>]*\?'.$Protocol.'~sim','$1',$newBody); // removes stuff between http:// and ?http://