From f06cf7bb0147389930fda2b6e61e2e4c807f3a14 Mon Sep 17 00:00:00 2001 From: Klaus Leithoff Date: Tue, 27 Oct 2015 09:22:12 +0000 Subject: [PATCH] reintroduce tidy in calls from egw-mail-app when only getCleanHTML is called, as it is correcting html structure issues for us --- .../inc/class.emailadmin_imapbase.inc.php | 17 +------ mail/inc/class.mail_compose.inc.php | 16 +++++++ mail/inc/class.mail_ui.inc.php | 46 +++++++++++++++++-- 3 files changed, 60 insertions(+), 19 deletions(-) diff --git a/emailadmin/inc/class.emailadmin_imapbase.inc.php b/emailadmin/inc/class.emailadmin_imapbase.inc.php index 36f738a271..893e974b44 100644 --- a/emailadmin/inc/class.emailadmin_imapbase.inc.php +++ b/emailadmin/inc/class.emailadmin_imapbase.inc.php @@ -4687,23 +4687,8 @@ class emailadmin_imapbase if ($bodyParts[$i]['charSet']===false) $bodyParts[$i]['charSet'] = translation::detect_encoding($bodyParts[$i]['body']); // add line breaks to $bodyParts //error_log(__METHOD__.' ('.__LINE__.') '.' Charset:'.$bodyParts[$i]['charSet'].'->'.$bodyParts[$i]['body']); - $newBody = translation::convert($bodyParts[$i]['body'], $bodyParts[$i]['charSet']); + $newBody = translation::convert_jsonsafe($bodyParts[$i]['body'], $bodyParts[$i]['charSet']); //error_log(__METHOD__.' ('.__LINE__.') '.' MimeType:'.$bodyParts[$i]['mimeType'].'->'.$newBody); - /* - // in a way, this tests if we are having real utf-8 (the displayCharset) by now; we should if charsets reported (or detected) are correct - if (strtoupper(self::$displayCharset) == 'UTF-8') - { - $test = json_encode($newBody); - //error_log(__METHOD__.' ('.__LINE__.') '.'#'.$test.'# ->'.strlen($newBody).' Error:'.json_last_error()); - if (json_last_error() != JSON_ERROR_NONE && strlen($newBody)>0) - { - // this should not be needed, unless something fails with charset detection/ wrong charset passed - error_log(__METHOD__.' ('.__LINE__.') '.' Charset Reported:'.$bodyParts[$i]['charSet'].' Carset Detected:'.translation::detect_encoding($bodyParts[$i]['body'])); - $newBody = utf8_encode($newBody); - } - } - */ - //error_log(__METHOD__.' ('.__LINE__.') '.' before purify:'.$newBody); $mailClass->activeMimeType = 'text/plain'; if ($bodyParts[$i]['mimeType'] == 'text/html') { $mailClass->activeMimeType = $bodyParts[$i]['mimeType']; diff --git a/mail/inc/class.mail_compose.inc.php b/mail/inc/class.mail_compose.inc.php index ecc83c998c..bbf2e20023 100644 --- a/mail/inc/class.mail_compose.inc.php +++ b/mail/inc/class.mail_compose.inc.php @@ -2123,6 +2123,22 @@ class mail_compose static $nonDisplayAbleCharacters = array('[\016]','[\017]', '[\020]','[\021]','[\022]','[\023]','[\024]','[\025]','[\026]','[\027]', '[\030]','[\031]','[\032]','[\033]','[\034]','[\035]','[\036]','[\037]'); + + if (extension_loaded('tidy')) + { + $tidy = new tidy(); + $cleaned = $tidy->repairString($_body, mail_bo::$tidy_config,'utf8'); + // Found errors. Strip it all so there's some output + if($tidy->getStatus() == 2) + { + error_log(__METHOD__.' ('.__LINE__.') '.' ->'.$tidy->errorBuffer); + } + else + { + $_body = $cleaned; + } + } + mail_bo::getCleanHTML($_body); return preg_replace($nonDisplayAbleCharacters, '', $_body); } diff --git a/mail/inc/class.mail_ui.inc.php b/mail/inc/class.mail_ui.inc.php index a057922db7..52964f5751 100644 --- a/mail/inc/class.mail_ui.inc.php +++ b/mail/inc/class.mail_ui.inc.php @@ -2778,10 +2778,50 @@ class mail_ui $newBody = $singleBodyPart['body']; //TODO:$newBody = $this->highlightQuotes($newBody); #error_log(print_r($newBody,true)); - + if (extension_loaded('tidy')) + { + $tidy = new tidy(); + $cleaned = $tidy->repairString($newBody, mail_bo::$tidy_config,'utf8'); + // Found errors. Strip it all so there's some output + if($tidy->getStatus() == 2) + { + error_log(__METHOD__.' ('.__LINE__.') '.' ->'.$tidy->errorBuffer); + } + else + { + $newBody = $cleaned; + } + if (!$preserveHTML) + { + // filter only the 'body', as we only want that part, if we throw away the html + preg_match('`(]*>)(.+?)(.*?)`ims', $newBody, $matches=array()); + if ($matches[2]) + { + $hasOther = true; + $newBody = $matches[2]; + } + } + } + else + { + // htmLawed filter only the 'body' + preg_match('`(]*>)(.+?)(.*?)`ims', $newBody, $matches=array()); + if ($matches[2]) + { + $hasOther = true; + $newBody = $matches[2]; + } + $htmLawed = new egw_htmLawed(); + // the next line should not be needed, but produces better results on HTML 2 Text conversion, + // as we switched off HTMLaweds tidy functionality + $newBody = str_replace(array('&','

',"
 
",'
 
'),array('&','
','
','
'),$newBody); + $newBody = $htmLawed->egw_htmLawed($newBody); + if ($hasOther && $preserveHTML) $newBody = $matches[1]. $newBody. $matches[3]; + } // do the cleanup, set for the use of purifier - $newBodyBuff = $newBody; + //$newBodyBuff = $newBody; mail_bo::getCleanHTML($newBody); +/* // in a way, this tests if we are having real utf-8 (the displayCharset) by now; we should if charsets reported (or detected) are correct if (strtoupper(mail_bo::$displayCharset) == 'UTF-8') { @@ -2796,7 +2836,7 @@ class mail_ui mail_bo::$htmLawed_config['tidy'] = $tv; } } - +*/ // removes stuff between http and ?http $Protocol = '(http:\/\/|(ftp:\/\/|https:\/\/))'; // only http:// gets removed, other protocolls are shown $newBody = preg_replace('~'.$Protocol.'[^>]*\?'.$Protocol.'~sim','$1',$newBody); // removes stuff between http:// and ?http://