Backport of Rev28983: improve the parsing / cleaning of html messages

This commit is contained in:
Klaus Leithoff 2010-02-01 15:35:53 +00:00
parent 8c6881db7c
commit cbcc0a38e6
5 changed files with 38 additions and 23 deletions

View File

@ -803,7 +803,7 @@
// but they matter in <pre>, so we rather don't
//$_html = str_replace("\r\n",' ',$_html);
//$_html = str_replace("\t",' ',$_html);
//error_log($_html);
self::replaceTagsCompletley($_html,'style'); // clean out empty or pagewide style definitions / left over tags
self::replaceTagsCompletley($_html,'head'); // Strip out stuff in head
self::replaceTagsCompletley($_html,'!\[if','<!\[endif\]>',false); // Strip out stuff in ifs
@ -870,7 +870,7 @@
}
else
{
#echo $_html;exit;
//echo $_html;exit;
$kses = new kses();
$kses->AddProtocol('cid');
// since the protocol check is called for every value associated with an attribute, we have to add color and background-color to the valid protocols

View File

@ -173,7 +173,7 @@
$sbody = substr($sbody, $start);
}
$llink='';
#_debug_array($addresses);
//_debug_array($addresses);
if (is_array($addresses)) ksort($addresses);
foreach ((array)$addresses as $text => $link) {
if (empty($link)) continue;
@ -1073,8 +1073,11 @@
// removes stuff between http and ?http
$Protocol = '(http:\/\/|(ftp:\/\/|https:\/\/))'; // only http:// gets removed, other protocolls are shown
$newBody = preg_replace('~'.$Protocol.'[^>]*\?'.$Protocol.'~sim','$1',$newBody); // removes stuff between http:// and ?http://
// create links for websites
$newBody = html::activate_links($newBody);
// spam-safe (obfuscated) email addresses; needed to be able to apply email compose links later
$newBody = preg_replace('/'.'(?<!"|href=|href\s=\s|href=\s|href\s=)'.'mailto:([a-z0-9._-]+)@([a-z0-9_-]+)\.([a-z0-9._-]+)/i',
'<a href="#" onclick="document.location=\'mai\'+\'lto:\\1\'+unescape(\'%40\')+\'\\2.\\3\'; return false;">\\1 AT \\2 DOT \\3</a>',
$newBody);
// redirect links for websites if you use no cookies
#if (!($GLOBALS['egw_info']['server']['usecookies'])) { //do it all the time, since it does mask the mailadresses in urls
$this->parseHREF($newBody);

View File

@ -367,11 +367,11 @@ function refreshFolderStatus(_nodeID,mode) {
}
var activeFolders = getTreeNodeOpenItems(nodeToRefresh,mode2use);
xajax_doXMLHTTP('felamimail.ajaxfelamimail.refreshFolderList', activeFolders);
if (fm_previewMessageID>0)
{
//setStatusMessage('<span style="font-weight: bold;">'+ lang_updating_view +'</span>');
//xajax_doXMLHTTP("felamimail.ajaxfelamimail.refreshMessagePreview",fm_previewMessageID,fm_previewMessageFolderType);
}
// if (fm_previewMessageID>0)
// {
// //setStatusMessage('<span style="font-weight: bold;">'+ lang_updating_view +'</span>');
// //xajax_doXMLHTTP("felamimail.ajaxfelamimail.refreshMessagePreview",fm_previewMessageID,fm_previewMessageFolderType);
// }
}
function refreshView() {

View File

@ -321,6 +321,7 @@
}
# Split it
//_debug_array($attr);
$attrarr = $this->_hair($attr);
# Go through $attrarr, and save the allowed attributes for this element
@ -377,6 +378,7 @@
###############################################################################
function _hair($attr)
{
//echo __METHOD__.'called<br>';
$attrarr = array();
$mode = 0;
$attrname = '';
@ -393,7 +395,9 @@
case 0: # attribute name, href for instance
if (preg_match('/^([-a-zA-Z]+)/', $attr, $match))
{
//echo 'mode 0:'.$match[0].'<br>';
$attrname = $match[1];
//echo 'mode 0 -> attrname:'.$attrname.'<br>';
$working = $mode = 1;
$attr = preg_replace('/^[-a-zA-Z]+/', '', $attr);
}
@ -404,6 +408,7 @@
$working = 1;
$mode = 2;
$attr = preg_replace('/^\s*=\s*/', '', $attr);
//echo 'mode 1:'.$attr.'<br>';
break;
}
if (preg_match('/^\s+/', $attr)) # valueless
@ -420,9 +425,10 @@
}
break;
case 2: # attribute value, a URL after href= for instance
//echo 'mode 2 Attrname:'.$attrname.'<br>';
if (preg_match('/^"([^"]*)"(\s+|$)/', $attr, $match)) # "value"
{
$thisval = $this->_bad_protocol($match[1]);
$thisval = ($attrname == 'name' ? $match[1] : $this->_bad_protocol($match[1]));
$attrarr[] = array(
'name' => $attrname,
'value' => $thisval,
@ -432,11 +438,12 @@
$working = 1;
$mode = 0;
$attr = preg_replace('/^"[^"]*"(\s+|$)/', '', $attr);
//echo 'mode 2:'.$attr.'<br>';
break;
}
if (preg_match("/^'([^']*)'(\s+|$)/", $attr, $match)) # 'value'
{
$thisval = $this->_bad_protocol($match[1]);
$thisval = ($attrname == 'name' ? $match[1] : $this->_bad_protocol($match[1]));
$attrarr[] = array(
'name' => $attrname,
'value' => $thisval,
@ -446,11 +453,12 @@
$working = 1;
$mode = 0;
$attr = preg_replace("/^'[^']*'(\s+|$)/", '', $attr);
//echo 'mode 2:'.$attr.'<br>';
break;
}
if (preg_match("%^([^\s\"']+)(\s+|$)%", $attr, $match)) # value
{
$thisval = $this->_bad_protocol($match[1]);
$thisval = ($attrname == 'name' ? $match[1] : $this->_bad_protocol($match[1]));
$attrarr[] = array(
'name' => $attrname,
'value' => $thisval,
@ -513,6 +521,7 @@
###############################################################################
function _bad_protocol_once($string)
{
if ($string[0]=='#') return $string; // its an anchor, dont check for protocol any further
$string2 = preg_split('/:|&#58;|&#x3a;/i', $string, 2);
if(isset($string2[1]) && !preg_match('%/\?%',$string2[0]))
{
@ -535,21 +544,24 @@
###############################################################################
function _bad_protocol_once2($string)
{
$string2 = $this->_decode_entities($string2);
$string2 = preg_replace('/\s/', '', $string);
$string2 = $this->_decode_entities($string);
$string2 = preg_replace('/\s/', '', $string2);
$string2 = $this->_no_null($string2);
$string2 = preg_replace('/\xad+/', '', $string2); # deals with Opera "feature"
$string2 = strtolower($string2);
$allowed = false;
foreach ($this->allowed_protocols as $one_protocol)
if(is_array($this->allowed_protocols) && count($this->allowed_protocols) > 0)
{
if (strtolower($one_protocol) == $string2)
foreach ($this->allowed_protocols as $one_protocol)
{
$allowed = true;
break;
if (strtolower($one_protocol) == $string2)
{
$allowed = true;
break;
}
}
}
if ($allowed)
{
return "$string2:";

View File

@ -1072,13 +1072,13 @@ class translation
if ($_body) {
if ($addbracesforendtag === true )
{
$_body = preg_replace('~<'.$tag.'[^>]*?>(.*)</'.$endtag.'>~sim','',$_body);
$_body = preg_replace('~<'.$tag.'[^>]*?>(.*)</'.$endtag.'[\s]*>~simU','',$_body);
// remove left over tags, unfinished ones, and so on
$_body = preg_replace('~<'.$tag.'[^>]*?>~si','',$_body);
}
if ($addbracesforendtag === false )
{
$_body = preg_replace('~<'.$tag.'[^>]*?>(.*)'.$endtag.'~sim','',$_body);
$_body = preg_replace('~<'.$tag.'[^>]*?>(.*)'.$endtag.'~simU','',$_body);
// remove left over tags, unfinished ones, and so on
$_body = preg_replace('~<'.$tag.'[^>]*?>~si','',$_body);
$_body = preg_replace('~'.$endtag.'~','',$_body);
@ -1103,7 +1103,7 @@ class translation
#print "</pre>";
#print "<hr>";
self::replaceTagsCompletley($_html,'style');
$Rules = array ('@<script[^>]*?>.*?</script>@si', // Strip out javascript
$Rules = array ('@<script[^>]*?>.*?</script>@siU', // Strip out javascript
'@&(quot|#34);@i', // Replace HTML entities
'@&(amp|#38);@i', // Ampersand &
'@&(lt|#60);@i', // Less Than <