Backport of Rev28983: improve the parsing/cleaning of HTML messages

2025-02-17 19:01:04 +01:00 · 2010-02-01 15:35:53 +00:00 · 2010-02-01 15:35:53 +00:00 · cbcc0a38e6
commit cbcc0a38e6
parent 8c6881db7c
5 changed files with 38 additions and 23 deletions
--- a/felamimail/inc/class.bofelamimail.inc.php
+++ b/felamimail/inc/class.bofelamimail.inc.php
@ -803,7 +803,7 @@
 			// but they matter in <pre>, so we rather don't
 			//$_html = str_replace("\r\n",' ',$_html);
 			//$_html = str_replace("\t",' ',$_html);
-
+			//error_log($_html);
 			self::replaceTagsCompletley($_html,'style'); // clean out empty or pagewide style definitions / left over tags
 			self::replaceTagsCompletley($_html,'head'); // Strip out stuff in head	
 			self::replaceTagsCompletley($_html,'!\[if','<!\[endif\]>',false); // Strip out stuff in ifs	
@ -870,7 +870,7 @@
 			}
 			else
 			{
-				#echo $_html;exit;
+				//echo $_html;exit;
 				$kses = new kses();
 				$kses->AddProtocol('cid');
 				// since check protocoll is called for every value associated to an attribute we have to add color and background-color to the valid protocolls
--- a/felamimail/inc/class.uidisplay.inc.php
+++ b/felamimail/inc/class.uidisplay.inc.php
@ -173,7 +173,7 @@
 				$sbody = substr($sbody, $start);
 			}
 			$llink='';
-			#_debug_array($addresses);
+			//_debug_array($addresses);
 			if (is_array($addresses)) ksort($addresses);
 			foreach ((array)$addresses as $text => $link) {
 				if (empty($link)) continue;
@ -1073,8 +1073,11 @@
 					// removes stuff between http and ?http
 					$Protocol = '(http:\/\/|(ftp:\/\/|https:\/\/))';    // only http:// gets removed, other protocolls are shown
 					$newBody = preg_replace('~'.$Protocol.'[^>]*\?'.$Protocol.'~sim','$1',$newBody); // removes stuff between http:// and ?http://
-					// create links for websites
-					$newBody = html::activate_links($newBody);
+					// spamsaver emailaddress, needed to be able to apply email compose links later
+					$newBody = preg_replace('/'.'(?<!"|href=|href\s=\s|href=\s|href\s=)'.'mailto:([a-z0-9._-]+)@([a-z0-9_-]+)\.([a-z0-9._-]+)/i',
+						'<a href="#" onclick="document.location=\'mai\'+\'lto:\\1\'+unescape(\'%40\')+\'\\2.\\3\'; return false;">\\1 AT \\2 DOT \\3</a>',
+						$newBody);
+
 					// redirect links for websites if you use no cookies
 					#if (!($GLOBALS['egw_info']['server']['usecookies'])) { //do it all the time, since it does mask the mailadresses in urls
 						$this->parseHREF($newBody);
--- a/felamimail/js/jscode/viewMainScreen.js
+++ b/felamimail/js/jscode/viewMainScreen.js
@ -367,11 +367,11 @@ function refreshFolderStatus(_nodeID,mode) {
 	}
 	var activeFolders = getTreeNodeOpenItems(nodeToRefresh,mode2use);
 	xajax_doXMLHTTP('felamimail.ajaxfelamimail.refreshFolderList', activeFolders);
-	if (fm_previewMessageID>0)
-	{
-		//setStatusMessage('<span style="font-weight: bold;">'+ lang_updating_view +'</span>');
-		//xajax_doXMLHTTP("felamimail.ajaxfelamimail.refreshMessagePreview",fm_previewMessageID,fm_previewMessageFolderType);
-	}
+//	if (fm_previewMessageID>0)
+//	{
+//		//setStatusMessage('<span style="font-weight: bold;">'+ lang_updating_view +'</span>');
+//		//xajax_doXMLHTTP("felamimail.ajaxfelamimail.refreshMessagePreview",fm_previewMessageID,fm_previewMessageFolderType);
+//	}
 }

 function refreshView() {
--- a/phpgwapi/inc/class.kses.inc.php
+++ b/phpgwapi/inc/class.kses.inc.php
@ -321,6 +321,7 @@
 			}

 			# Split it
+			//_debug_array($attr);
 			$attrarr = $this->_hair($attr);
 			
 			# Go through $attrarr, and save the allowed attributes for this element
@ -377,6 +378,7 @@
 		###############################################################################
 		function _hair($attr)
 		{
+			//echo __METHOD__.'called<br>';
 			$attrarr  = array();
 			$mode     = 0;
 			$attrname = '';
@ -393,7 +395,9 @@
 					case 0:	# attribute name, href for instance
 						if (preg_match('/^([-a-zA-Z]+)/', $attr, $match))
 						{
+							//echo 'mode 0:'.$match[0].'<br>';
 							$attrname = $match[1];
+							//echo 'mode 0 -> attrname:'.$attrname.'<br>';
 							$working = $mode = 1;
 							$attr = preg_replace('/^[-a-zA-Z]+/', '', $attr);
 						}
@ -404,6 +408,7 @@
 							$working = 1;
 							$mode    = 2;
 							$attr    = preg_replace('/^\s*=\s*/', '', $attr);
+							//echo 'mode 1:'.$attr.'<br>';
 							break;
 						}
 						if (preg_match('/^\s+/', $attr)) # valueless
@ -420,9 +425,10 @@
 						}
 						break;
 					case 2: # attribute value, a URL after href= for instance
+							//echo 'mode 2 Attrname:'.$attrname.'<br>';
 						if (preg_match('/^"([^"]*)"(\s+|$)/', $attr, $match)) # "value"
 						{
-							$thisval   = $this->_bad_protocol($match[1]);
+							$thisval   = ($attrname == 'name' ? $match[1] : $this->_bad_protocol($match[1]));
 							$attrarr[] = array(
 								'name'  => $attrname,
 								'value' => $thisval,
@ -432,11 +438,12 @@
 							$working   = 1;
 							$mode      = 0;
 							$attr      = preg_replace('/^"[^"]*"(\s+|$)/', '', $attr);
+							//echo 'mode 2:'.$attr.'<br>';
 							break;
 						}
 						if (preg_match("/^'([^']*)'(\s+|$)/", $attr, $match)) # 'value'
 						{
-							$thisval   = $this->_bad_protocol($match[1]);
+							$thisval   = ($attrname == 'name' ? $match[1] : $this->_bad_protocol($match[1]));
 							$attrarr[] = array(
 								'name'  => $attrname,
 								'value' => $thisval,
@ -446,11 +453,12 @@
 							$working   = 1;
 							$mode      = 0;
 							$attr      = preg_replace("/^'[^']*'(\s+|$)/", '', $attr);
+							//echo 'mode 2:'.$attr.'<br>';
 							break;
 						}
 						if (preg_match("%^([^\s\"']+)(\s+|$)%", $attr, $match)) # value
 						{
-							$thisval   = $this->_bad_protocol($match[1]);
+							$thisval   = ($attrname == 'name' ? $match[1] : $this->_bad_protocol($match[1]));
 							$attrarr[] = array(
 								'name'  => $attrname,
 								'value' => $thisval,
@ -513,6 +521,7 @@
 		###############################################################################
 		function _bad_protocol_once($string)
 		{
+			if ($string[0]=='#') return $string; // its an anchor, dont check for protocol any further
 			$string2 = preg_split('/:|&#58;|&#x3a;/i', $string, 2);
 			if(isset($string2[1]) && !preg_match('%/\?%',$string2[0]))
 			{
@ -535,21 +544,24 @@
 		###############################################################################
 		function _bad_protocol_once2($string)
 		{
-			$string2 = $this->_decode_entities($string2);
-			$string2 = preg_replace('/\s/', '', $string);
+			$string2 = $this->_decode_entities($string);
+			$string2 = preg_replace('/\s/', '', $string2);
 			$string2 = $this->_no_null($string2);
+			$string2 = preg_replace('/\xad+/', '', $string2); # deals with Opera "feature"
 			$string2 = strtolower($string2);

 			$allowed = false;
-			foreach ($this->allowed_protocols as $one_protocol)
+			if(is_array($this->allowed_protocols) && count($this->allowed_protocols) > 0)
 			{
-				if (strtolower($one_protocol) == $string2)
+				foreach ($this->allowed_protocols as $one_protocol)
 				{
-					$allowed = true;
-					break;
+					if (strtolower($one_protocol) == $string2)
+					{
+						$allowed = true;
+						break;
+					}
 				}
 			}
-
 			if ($allowed)
 			{
 				return "$string2:";
--- a/phpgwapi/inc/class.translation.inc.php
+++ b/phpgwapi/inc/class.translation.inc.php
@ -1072,13 +1072,13 @@ class translation
 		if ($_body) {
 			if ($addbracesforendtag === true )
 			{
-				$_body = preg_replace('~<'.$tag.'[^>]*?>(.*)</'.$endtag.'>~sim','',$_body);
+				$_body = preg_replace('~<'.$tag.'[^>]*?>(.*)</'.$endtag.'[\s]*>~simU','',$_body);
 				// remove left over tags, unfinished ones, and so on
 				$_body = preg_replace('~<'.$tag.'[^>]*?>~si','',$_body);
 			}
 			if ($addbracesforendtag === false )
 			{
-				$_body = preg_replace('~<'.$tag.'[^>]*?>(.*)'.$endtag.'~sim','',$_body);
+				$_body = preg_replace('~<'.$tag.'[^>]*?>(.*)'.$endtag.'~simU','',$_body);
 				// remove left over tags, unfinished ones, and so on
 				$_body = preg_replace('~<'.$tag.'[^>]*?>~si','',$_body);
 				$_body = preg_replace('~'.$endtag.'~','',$_body);
@ -1103,7 +1103,7 @@ class translation
 		#print "</pre>";
 		#print "<hr>";
 		self::replaceTagsCompletley($_html,'style');
-		$Rules = array ('@<script[^>]*?>.*?</script>@si', // Strip out javascript
+		$Rules = array ('@<script[^>]*?>.*?</script>@siU', // Strip out javascript
 			'@&(quot|#34);@i',                // Replace HTML entities
 			'@&(amp|#38);@i',                 //   Ampersand &
 			'@&(lt|#60);@i',                  //   Less Than <