* @license http://opensource.org/licenses/lgpl-license.php LGPL - GNU Lesser General Public License
 * @package api
 * @version $Id$
 */

namespace EGroupware\Api\Mail;

use EGroupware\Api;

/**
 * Html mail handling
 *
 * @todo clean up IDE warnings, mostly multiple assigments
 */
class Html
{
	/**
	 * Return the decoded string meeting some additional requirements for mailheaders
	 *
	 * Uses the first available decoder: imap_mime_header_decode, then mb_decode_mimeheader,
	 * then iconv_mime_decode. Without any of them the input is returned undecoded.
	 * In every case a set of unprintable control characters is removed from the result.
	 *
	 * @param string $_string part of a mailheader
	 * @param string $displayCharset charset used for the result of nested decoding and by iconv_mime_decode
	 * @param integer $reclevel current recursion depth (guards against endless recursion)
	 * @return string
	 */
	public static function decodeMailHeader($_string, $displayCharset='utf-8', $reclevel=0)
	{
		$maxreclevel = 25;
		if ($reclevel > $maxreclevel)
		{
			error_log( __METHOD__.__LINE__." Recursion Level Exeeded ($reclevel) while decoding $_string ");
			return $_string;
		}
		$reclevel++;

		if (function_exists('imap_mime_header_decode'))
		{
			// some characterreplacements, as they fail to translate
			$sar = array(
				'@(\x84|\x93|\x94)@',
				'@(\x96|\x97|\x1a)@',
				'@(\x91|\x92)@',
				'@(\x85)@',
				'@(\x86)@',
			);
			$rar = array(
				'"',
				'-',
				'\'',
				'...',
				'+',
			);

			// collapse any whitespace between adjacent encoded words to a single blank before decoding
			$elements = imap_mime_header_decode(preg_replace('/\?=\s+=\?/', '?= =?', $_string));
			$convertAtEnd = false;
			$string = '';
			foreach ((array)$elements as $element)
			{
				if ($element->charset == 'default')
				{
					$element->charset = Api\Translation::detect_encoding($element->text);
				}
				if ($element->charset != 'x-unknown')
				{
					if (strtoupper($element->charset) != 'UTF-8')
					{
						$element->text = preg_replace($sar, $rar, $element->text);
					}
					// check if there is a possible nested encoding; make sure that the inputstring
					// and the decoded result are different to avoid loops
					$openTags = substr_count($element->text, '?=');
					if (preg_match('/\?=.+=\?/', $element->text) && $openTags>0 &&
						$openTags==substr_count($element->text, '=?') &&
						$element->text != $_string)
					{
						$element->text = self::decodeMailHeader($element->text, $element->charset, $reclevel);
						$element->charset = $displayCharset;
					}
					$string .= Api\Translation::convert($element->text, $element->charset);
				}
				else
				{
					// unknown charset: keep the raw text and retry decoding of the assembled string
					$string .= $element->text;
					$convertAtEnd = true;
				}
			}
			if ($convertAtEnd)
			{
				$string = self::decodeMailHeader($string, $displayCharset, $reclevel);
			}
		}
		// function_exists() needs the name as a string; a bare word is an undefined constant (Error in PHP 8)
		elseif (function_exists('mb_decode_mimeheader'))
		{
			$string = $_string;
			$matches = null;
			if (preg_match_all('/=\?.*\?Q\?.*\?=/iU', $string, $matches))
			{
				// in Q-encoded words an underscore stands for a blank
				foreach ($matches[0] as $match)
				{
					$fixedMatch = str_replace('_', ' ', $match);
					$string = str_replace($match, $fixedMatch, $string);
				}
				$string = str_replace('=?ISO8859-','=?ISO-8859-', str_replace('=?windows-1258','=?ISO-8859-1',$string));
			}
			$string = mb_decode_mimeheader($string);
		}
		elseif (function_exists('iconv_mime_decode'))
		{
			// mode 2: continue decoding also if an error occurs
			$string = @iconv_mime_decode($_string, 2, $displayCharset);
		}
		else
		{
			// no decoding function available
			$string = $_string;
		}
		// remove unprintable chars
		return preg_replace('/([\000-\012\015\016\020-\037])/', '', $string);
	}

	/**
	 * Replace decorated email addresses (bracketed, mailto: prefixed, inside links or broken links)
	 * with the plain address followed by a blank
	 *
	 * NOTE(review): several alternations below contain identical branches, e.g. "(>|>)", and the
	 * callback pattern starts with "]+href"; they look as if markup/entities were lost from the
	 * literals -- confirm against upstream before relying on them.
	 *
	 * @param string &$text the text to process, modified in place
	 * @return int always 1
	 */
	public static function replaceEmailAdresses(&$text)
	{
		// replace CRLF with a placeholder to preserve it, as CRLF seems to vanish via preg_replace
		$protected = str_replace("\r\n", '<#cr-lf#>', $text);

		// mailto: addresses, optionally wrapped in brackets or a link start
		$step1 = preg_replace("/(<|<a href=\")*(mailto:([\w\.,-.,_.,0-9.]+)(@)([\w\.,-.,_.,0-9.]+))(>|>)*/i","$2 ", $protected);

		// mailto links with a link text; array callable instead of the deprecated 'self::method' string
		$step2 = preg_replace_callback('~]+href=\"(mailto:)+([^"]+)\"[^>]*>([ @\w\.,-.,_.,0-9.]+)<\/a>~si', array(__CLASS__, 'transform_mailto2text'), $step1);

		// addresses followed by left over link ends / closing brackets
		$step3 = preg_replace("/(([\w\.,-.,_.,0-9.]+)(@)([\w\.,-.,_.,0-9.]+))( |\s)*(<\/a>)*( |\s)*(>|>)*/i","$1 ", $step2);

		// addresses enclosed in brackets
		$step4 = preg_replace("/(<|<)*(([\w\.,-.,_.,0-9.]+)@([\w\.,-.,_.,0-9.]+))(>|>)*/i","$2 ", $step3);

		$text = str_replace('<#cr-lf#>', "\r\n", $step4);
		return 1;
	}

	/**
	 * Strip tags out of the message completely with their content
	 *
	 * NOTE(review): the patterns in this method look truncated (the closing-tag part of the
	 * content patterns and the start of the singleton pattern seem to be missing) -- confirm
	 * against upstream.
	 *
	 * @param string &$_body the text to be processed, modified in place
	 * @param string $tag name of the tag to remove, without the enclosing brackets
	 * @param string $endtag only to be given if begin and end tag differ; '/>' selects singleton handling,
	 *	an empty value means: same as $tag
	 * @param bool $addbracesforendtag true: $endtag is a tag name, false: $endtag is matched as is
	 * @return void the modified text is passed via reference
	 */
	public static function replaceTagsCompletley(&$_body,$tag,$endtag='',$addbracesforendtag=true)
	{
		if ($tag) $tag = strtolower($tag);

		// must be evaluated before $endtag gets normalised
		$singleton = ($endtag == '/>');

		if (empty($endtag))
		{
			$endtag = $tag;
		}
		else
		{
			$endtag = strtolower($endtag);
		}

		if (!$_body)
		{
			return;
		}

		if ($singleton)
		{
			// we are in Ungreedy mode, so we expect * to be ungreedy without specifying ?
			$_body = preg_replace('~].* '.$endtag.'~simU','',$_body);
			return;
		}

		$found = null;
		if ($addbracesforendtag === true)
		{
			// $ct stays 0 if the tag does not occur at all (was undefined before)
			$ct = 0;
			if (stripos($_body,'<'.$tag)!==false)
			{
				$ct = preg_match_all('#<'.$tag.'(?:\s.*)?>(.+)#isU', $_body, $found);
			}
			if ($ct>0)
			{
				// only replace what we have found
				$_body = str_ireplace($found[0],'',$_body);
			}
			// remove left over tags, unfinished ones, and so on
			$_body = preg_replace('~<'.$tag.'[^>]*?>~si','',$_body);
		}
		if ($addbracesforendtag === false)
		{
			$ct = 0;
			if (stripos($_body,'<'.$tag)!==false)
			{
				$ct = preg_match_all('#<'.$tag.'(?:\s.*)?>(.+)'.$endtag.'#isU', $_body, $found);
			}
			if ($ct>0)
			{
				// only replace what we have found
				$_body = str_ireplace($found[0],'',$_body);
			}
			// remove left over tags, unfinished ones, and so on
			$_body = preg_replace(array('~<'.$tag.'[^>]*?>~si', '~'.$endtag.'~'), '', $_body);
		}
	}

	/**
	 * preg_replace_callback handler: render a mailto link as text
	 *
	 * @param array $matches 1: prefix, 2: address taken from the href, 3: link text
	 * @return string prefix and address, followed by ' -> linktext ' if the link text differs from the address
	 */
	public static function transform_mailto2text($matches)
	{
		$decode = array('%40' => '@', '%20' => ' ');

		// this is the actual url
		$address = str_replace(array_keys($decode), array_values($decode), trim(strip_tags($matches[2])));
		$label = str_replace(array_keys($decode), array_values($decode), trim(strip_tags($matches[3])));

		return $matches[1].$address.($address==$label ? ' ' : ' -> '.$label.' ');
	}

	/**
	 * preg_replace_callback handler: render a link as text
	 *
	 * @param array $matches 1: url taken from the href, 2: link text
	 * @return string ' url ' if the link text equals the url, otherwise '[ url -> linktext ]'
	 */
	public static function transform_url2text($matches)
	{
		$label = trim(strip_tags($matches[2]));
		// compare before blanks in the url get encoded
		$linkTextislink = ($label == $matches[1]);
		$url = str_replace(' ','%20',$matches[1]);

		if ($linkTextislink)
		{
			return ' '.$url.' ';
		}
		return '[ '.$url.' -> '.$label.' ]';
	}

	/**
	 * convertHTMLToText
	 *
	 * NOTE(review): many pattern and needle literals in this method look truncated (markup seems
	 * to have been lost from them); apart from the one unparseable literal they are kept with
	 * their found runtime value -- confirm each against upstream.
	 *
	 * @param string $_html : Text to be stripped down
	 * @param string $displayCharset : charset to use; should be a valid charset, false: use Api\Translation::charset()
	 * @param bool $stripcrl : flag to indicate for the removal of all crlf \r\n
	 * @param bool $stripalltags : flag to indicate wether or not to strip $_html from all remaining tags
	 * @param bool $noRepEmailAddr = false, if true email addresses will be intact
	 * @return string the modified text
	 */
	public static function convertHTMLToText($_html,$displayCharset=false,$stripcrl=false,$stripalltags=true,$noRepEmailAddr = false)
	{
		// assume input isHTML, but test the input anyway, because,
		// if it is not, we may not want to strip whitespace
		$isHTML = (strlen(strip_tags($_html)) != strlen($_html));

		if ($displayCharset === false) $displayCharset = Api\Translation::charset();

		// clean out empty or pagewide style definitions / left over tags
		if (stripos($_html,'style')!==false) self::replaceTagsCompletley($_html,'style');
		// Strip out stuff in head
		if (stripos($_html,'head')!==false) self::replaceTagsCompletley($_html,'head');
		// Strip out stuff in ifs
		// NOTE(review): second needle and end-tag argument are empty strings; presumably truncated -- confirm
		if (stripos($_html,'![if')!==false && stripos($_html,'')!==false) self::replaceTagsCompletley($_html,'!\[if','',false);
		if (stripos($_html,'!--[if')!==false && stripos($_html,'')!==false) self::replaceTagsCompletley($_html,'!--\[if','',false);

		$Rules = array (
			'@]*?>.*?@siU',			// Strip out javascript; NOTE(review): pattern looks truncated
			'@&(quot|#34);@i',		// Replace HTML entities
			'@&(amp|#38);@i',		// Ampersand &
			'@&(lt|#60);@i',		// Less Than <
			'@&(gt|#62);@i',		// Greater Than >
			'@&(nbsp|#160);@i',		// Non Breaking Space
			'@&(iexcl|#161);@i',	// Inverted Exclamation point
			'@&(cent|#162);@i',		// Cent
			'@&(pound|#163);@i',	// Pound
			'@&(copy|#169);@i',		// Copyright
			'@&(reg|#174);@i',		// Registered
			'@&(trade|#8482);@i',	// trade
			// singleQuote; the literal as found held a bare quote and could not be parsed,
			// written as the numeric entity like the sibling rules -- NOTE(review): confirm
			'@&#39;@i',
			'@(\xc2\xa0)@',			// nbsp or tab (encoded windows-style)
			'@(\xe2\x80\x8b)@',		// ZERO WIDTH SPACE
		);
		$Replace = array (
			'',
			'"',
			'#amper#sand#',	// placeholder, restored to & at the end
			'<',
			'>',
			' ',
			chr(161),
			chr(162),
			chr(163),
			'(C)',	// copyright
			'(R)',	// registered
			'(TM)',	// trade
			"'",
			' ',
			'',
		);
		$_html = preg_replace($Rules, $Replace, $_html);

		// removing carriage return linefeeds, preserve those inside PRE sections
		if ($stripcrl === true )
		{
			// NOTE(review): the needle is a bare line break as found; presumably a PRE marker -- confirm
			if (stripos($_html,"\n")!==false)
			{
				$contentArr = self::splithtmlByPRE($_html);
				foreach ($contentArr as &$elem)
				{
					if (stripos($elem,"\n")===false)
					{
						$elem = str_replace(array("\r\n","\n"),($isHTML?'':' '),$elem);
					}
				}
				unset($elem);	// do not leave a dangling reference behind
				$_html = implode('',$contentArr);
			}
			else
			{
				$_html = str_replace(array("\r\n","\n"),($isHTML?'':' '),$_html);
			}
		}

		// NOTE(review): most of these patterns look truncated (tag names missing) -- confirm
		$tags = array (
			0 => '~]*>\r*\n*~si',
			1 => '~]*>\r*\n*~si',
			2 => '~]*>\r*\n*~si',
			3 => '~]*>\r*\n*~si',
			4 => '~]*>\r*\n*~si',
			5 => '~]*>\r*\n*~si',
			6 => '~]*>~si',
			7 => '~<\/p[^>r]*>\r*\n*~si',
			8 => '~]*>\r*\n*~si',
			9 => '~]*>\r*\n*~si',
			10 => '~]*>\r*\n*~si',
			11 => "/\n/",
			12 => "/\n/",
			13 => "~\n~si",
			14 => '~]*>~si',
			15 => '/<=\s([1234567890])/',
			16 => '/>=\s([1234567890])/',
			17 => '/<\s([1234567890])/',
			18 => '/>\s([1234567890])/',
		);
		$Replace = array (
			0 => "\r\n",
			1 => "\r\n",
			2 => "\r\n",
			3 => "\r\n",
			4 => "\r\n",
			5 => "\r\n",
			6 => "\r\n",
			7 => "\r\n",
			8 => "\r\n",
			9 => "",
			10 => "\r\n__________________________________________________\r\n",
			11 => '#blockquote#type#cite#',
			12 => '#blockquote#type#cite#',
			13 => '#blockquote#end#cite#',
			14 => '#blockquote#type#cite#',
			// comparison operators in front of digits get placeholders, restored at the end
			15 => '#lowerorequal#than#$1',
			16 => '#greaterorequal#than#$1',
			17 => '#lower#than#$1',
			18 => '#greater#than#$1',
		);
		$_html = preg_replace($tags,$Replace,$_html);
		$_html = preg_replace('~\s*]*>~si',' - ',$_html);
		$_html = preg_replace('~]+>~s','',$_html);

		// replace decorated emailaddresses with the emailaddress only (e.g: me@you.de)
		if (!$noRepEmailAddr) self::replaceEmailAdresses($_html);

		// convert hrefs to description -> URL
		// array callable instead of the deprecated 'self::method' string
		$_html = preg_replace_callback('~]+href=\"([^"]+)\"[^>]*>(.*?)~si', array(__CLASS__, 'transform_url2text'), $_html);

		// reducing double \r\n to single ones, dont mess with pre sections
		if ($stripcrl === true && $isHTML)
		{
			// NOTE(review): the needle is a bare line break as found; presumably a PRE marker -- confirm
			if (stripos($_html,"\n")!==false)
			{
				$contentArr = self::splithtmlByPRE($_html);
				foreach ($contentArr as &$elem)
				{
					if (stripos($elem,"\n")===false)
					{
						// this is supposed to strip out all remaining stuff in tags, this is sometimes taking out whole sections off content
						if ( $stripalltags )
						{
							// operate on the element: assigning to $_html here was lost by the implode() below
							$elem = preg_replace('~<[^>^@]+>~s','',$elem);
						}
						// strip out whitespace inbetween CR/LF
						$elem = preg_replace('~\r\n\s+\r\n~si', "\r\n\r\n", $elem);
						// strip out / reduce exess CR/LF; group the pair so the quantifier covers \r\n, not only \n
						$elem = preg_replace('~(\r\n){3,}~si',"\r\n\r\n",$elem);
					}
				}
				unset($elem);	// do not leave a dangling reference behind
				$_html = implode('',$contentArr);
			}
			else
			{
				// this is supposed to strip out all remaining stuff in tags, this is sometimes taking out whole sections off content
				if ( $stripalltags )
				{
					$_html = preg_replace('~<[^>^@]+>~s','',$_html);
				}
				// strip out whitespace inbetween CR/LF
				$_html = preg_replace('~\r\n\s+\r\n~si', "\r\n\r\n", $_html);
				// strip out / reduce exess CR/LF
				$_html = preg_replace('~(\r\n){3,}~si',"\r\n\r\n",$_html);
			}
		}
		// this is supposed to strip out all remaining stuff in tags, this is sometimes taking out whole sections off content
		if ( $stripalltags )
		{
			$_html = preg_replace('~<[^>^@]+>~s','',$_html);
		}
		// reducing spaces (not for input that was plain text from the beginning)
		if ($isHTML) $_html = preg_replace('~ +~s',' ',$_html);
		// restoring ampersands
		$_html = str_replace('#amper#sand#','&',$_html);
		// restoring lower|greater[or equal] than
		$_html = str_replace('#lowerorequal#than#','<=',$_html);
		$_html = str_replace('#greaterorequal#than#','>=',$_html);
		$_html = str_replace('#lower#than#','<',$_html);
		$_html = str_replace('#greater#than#','>',$_html);

		$_html = Api\Translation::convert($_html, $displayCharset, 'utf-8');
		$_html = html_entity_decode($_html, ENT_COMPAT, 'utf-8');

		if (strpos($_html, 'blockquote') === false)
		{
			return $_html;
		}

		// render quote levels as leading '>' and wrap lines that are not already indented
		$indent = 0;
		$indentString = '';
		$asciiTextBuff = array();

		$quoteParts = preg_split('/#blockquote#type#cite#/', $_html, -1, PREG_SPLIT_OFFSET_CAPTURE);
		foreach ($quoteParts as $quotePart)
		{
			// every part but the one at offset 0 follows a start marker
			if ($quotePart[1] > 0)
			{
				$indent++;
				$indentString .= '>';
			}
			$quoteParts2 = preg_split('/#blockquote#end#cite#/', $quotePart[0], -1, PREG_SPLIT_OFFSET_CAPTURE);

			foreach ($quoteParts2 as $quotePart2)
			{
				// every part but the one at offset 0 follows an end marker
				if ($quotePart2[1] > 0)
				{
					// never go below 0 on unbalanced end markers, or later levels would be lost
					$indent = max(0, $indent-1);
					$indentString = substr($indentString, 0, $indent);
				}

				$quoteParts3 = explode("\r\n", $quotePart2[0]);

				foreach ($quoteParts3 as $quotePart3)
				{
					$allowedLength = 76-strlen("\r\n$indentString");
					// only break lines, if not already indented
					if (substr($quotePart3,0,strlen($indentString)) != $indentString &&
						strlen($quotePart3) > $allowedLength)
					{
						$words = explode(" ", $quotePart3);
						$quotePart3 = "";
						$linecnt = 0;
						foreach ($words as $v)
						{
							$cnt = strlen($v);
							// only break long words within the wordboundaries,
							// but it may destroy links, so we check for href and dont do it if we find it
							if ($cnt > $allowedLength && stripos($v,'href=')===false)
							{
								$v = wordwrap($v, $allowedLength, "\r\n$indentString", true);
							}
							// the rest should be broken at the start of the new word that exceeds the limit
							if ($linecnt+$cnt > $allowedLength)
							{
								$v = "\r\n$indentString$v";
								$linecnt = 0;
							}
							else
							{
								$linecnt += $cnt;
							}
							if (strlen($v)) $quotePart3 .= (strlen($quotePart3) ? " " : "").$v;
						}
					}
					$asciiTextBuff[] = $indentString . $quotePart3 ;
				}
			}
		}
		return implode("\r\n",$asciiTextBuff);
	}

	/**
	 * split html by PRE tag, return array with all content pre-sections isolated in array elements
	 * @author Leithoff, Klaus
	 * @param string html
	 * @return mixed array of parts or unaffected html
	 */
	static function splithtmlByPRE($html)
	{
		$searchFor = '
',$pos);
			$length = $endofpre-$pos+6;
			$html2ret[] = substr($html,$pos,$length);
			$searchFor = '