]+href=\"(mailto:)+([^"]+)\"[^>]*>([ @\w\.,-.,_.,0-9.]+)<\/a>~si','self::transform_mailto2text',$text3);
$text5 = preg_replace("/(([\w\.,-.,_.,0-9.]+)(@)([\w\.,-.,_.,0-9.]+))( |\s)*(<\/a>)*( |\s)*(>|>)*/i","$1 ", $text4);
$text6 = preg_replace("/(<|<)*(([\w\.,-.,_.,0-9.]+)@([\w\.,-.,_.,0-9.]+))(>|>)*/i","$2 ", $text5);
$text = str_replace('<#cr-lf#>',"\r\n",$text6);
return 1;
/**
 * Strip a tag out of the message completely, together with its content.
 *
 * @param string $_body the text to be processed; it is modified in place (passed by reference)
 * @param string $tag name of the tag to remove. Pass only the tag name, without the enclosing
 *  angle brackets
 * @param string $endtag may differ from $tag, but should only be given when the begin and end
 *  markers are known to be different (e.g. the markers of a conditional comment)
 * @param bool $addbracesforendtag if $endtag is given, decides whether the closing braces
 *  (the leading </ and the trailing >) are added around it, or whether the string is matched as is
 * @return void the modified text is passed back via the $_body reference
 */
// Removes occurrences of a tag together with the content it encloses from $_body; the result is
// written back through the reference, nothing is returned.
// NOTE(review): the block delimiters of this method are not visible in this copy of the file, so
// the exact nesting of the branches below (in particular whether the non-singleton handling is an
// else branch of the singleton check) cannot be confirmed from here.
// NOTE(review): $tag and $endtag are interpolated into the patterns without preg_quote, so they are
// treated as regular-expression fragments - callers must pass them already escaped.
static function replaceTagsCompletley(&$_body,$tag,$endtag='',$addbracesforendtag=true)
// normalise the tag name to lower case
if ($tag) $tag = strtolower($tag);
$singleton = false;
// an end marker of /> selects the singleton (self-closing element) pattern further down
if ($endtag=='/>') $singleton =true;
// without an explicit end marker the tag name itself is used as the end marker
if ($endtag == '' || empty($endtag) || !isset($endtag))
$endtag = $tag;
} else {
$endtag = strtolower($endtag);
//error_log(__METHOD__.' Using EndTag:'.$endtag);
// strip tags out of the message completely with their content
if ($_body) {
if ($singleton)
//$_body = preg_replace('~<'.$tag.'[^>].*? '.$endtag.'~simU','',$_body);
$_body = preg_replace('~'.$tag.'[^>].* '.$endtag.'~simU','',$_body); // we are in Ungreedy mode, so we expect * to be ungreedy without specifying ?
// variant 1: the end marker is completed with a trailing > by this method
if ($addbracesforendtag === true )
// NOTE(review): $ct is only assigned when the stripos guard matches; otherwise the check on the
// next statement reads a variable that was never set - consider initialising it to 0.
if (stripos($_body,'<'.$tag)!==false) $ct = preg_match_all('#<'.$tag.'(?:\s.*)?>(.+)'.$endtag.'>#isU', $_body, $found);
if ($ct>0)
// only replace what we have found
$_body = str_ireplace($found[0],'',$_body);
// remove left over tags, unfinished ones, and so on
$_body = preg_replace('~<'.$tag.'[^>]*?>~si','',$_body);
// variant 2: the end marker is matched exactly as passed in
if ($addbracesforendtag === false )
// NOTE(review): same possibly unset $ct as in variant 1; in addition stripos looks for the tag
// literally, so a tag passed with regex escapes will not be found by this guard.
if (stripos($_body,'<'.$tag)!==false) $ct = preg_match_all('#<'.$tag.'(?:\s.*)?>(.+)'.$endtag.'#isU', $_body, $found);
if ($ct>0)
// only replace what we have found
$_body = str_ireplace($found[0],'',$_body);
$_body = preg_replace('~<'.$tag.'[^>]*?>(.*?)'.$endtag.'~simU','',$_body);
// remove left over tags, unfinished ones, and so on
// NOTE(review): the end-marker pattern below carries no i modifier, so it only matches the
// lower-cased form of $endtag - confirm whether upper-case end markers should survive.
$_body = preg_replace(array('~<'.$tag.'[^>]*?>~si', '~'.$endtag.'~'), '', $_body);
/**
 * preg_replace_callback handler that renders a matched mailto link as plain text.
 *
 * Reads three capture groups: $matches[1] is emitted unchanged as prefix,
 * $matches[2] is the address part and $matches[3] the visible link text
 * (presumably the scheme, the href remainder and the anchor text of the link).
 * Tags are stripped from both values, surrounding whitespace is trimmed and the
 * URL escapes %40 and %20 are decoded to @ and a blank.
 *
 * @param array $matches capture groups handed over by preg_replace_callback
 * @return string "<prefix><address> " when the link text equals the address,
 *  otherwise "<prefix><address> -> <link text> "
 */
static function transform_mailto2text($matches)
{
    $encoded = array('%40', '%20');
    $decoded = array('@', ' ');

    $address = str_replace($encoded, $decoded, trim(strip_tags($matches[2])));
    $label = str_replace($encoded, $decoded, trim(strip_tags($matches[3])));

    // Strict comparison on purpose: a loose == compares numeric-looking strings
    // by value ('1e3' == '1000'), which would silently drop a differing link text.
    if ($address === $label) {
        return $matches[1].$address.' ';
    }

    return $matches[1].$address.' -> '.$label.' ';
}
/**
 * preg_replace_callback handler that renders a matched hyperlink as plain text.
 *
 * $matches[1] is used as the URL and $matches[2] as the visible link text.
 * Tags are stripped from the link text and it is trimmed; blanks inside the
 * URL are encoded as %20.
 *
 * @param array $matches capture groups handed over by preg_replace_callback
 * @return string " <url> " when the link text is identical to the URL,
 *  otherwise "[ <url> -> <link text> ]"
 */
static function transform_url2text($matches)
{
    $label = trim(strip_tags($matches[2]));

    // The comparison uses the URL as captured, i.e. before blanks are encoded.
    // Strict comparison on purpose: a loose == treats numeric-looking strings
    // such as '1.0' and '1' as equal and would drop the link text.
    $textIsUrl = ($label === $matches[1]);

    $url = str_replace(' ', '%20', $matches[1]);

    if ($textIsUrl) {
        return ' '.$url.' ';
    }

    return '[ '.$url.' -> '.$label.' ]';
}
/**
 * convertHTMLToText: convert HTML markup to plain text
 *
 * @param string $_html text to be stripped down
 * @param string|false $displayCharset charset to use; should be a valid charset. When false
 *  (the default), the charset reported by Api\Translation::charset() is used
 * @param bool $stripcrl flag indicating whether all CRLF (\r\n) are to be removed
 * @param bool $stripalltags flag indicating whether or not to strip all remaining tags from $_html
 * @param bool $noRepEmailAddr defaults to false; if true, email addresses are left intact
 * @return string the modified text
 */
// Converts an HTML string to plain text and returns the result.
// Visible stages, in order: detect whether the input contains tags at all (strip_tags length
// comparison), drop style/head/conditional sections via replaceTagsCompletley, apply the
// $Rules/$Replace table, optionally remove line breaks ($stripcrl), map structural tags to CRLF
// or to #...# placeholders, replace email addresses unless $noRepEmailAddr is set, rewrite links
// through transform_url2text, strip remaining tags when $stripalltags is set, collapse runs of
// spaces for HTML input, restore the placeholders, convert the charset and decode entities.
// If the text then contains the substring blockquote, it is split on the blockquote placeholders,
// lines are prefixed with an indent string built from > characters and long lines are re-wrapped
// against a budget of 76 columns; otherwise the text is returned as is.
// NOTE(review): block delimiters and several string literals (stripos needles, most $tags
// patterns) look stripped or truncated in this copy of the file; restore them from version
// control before changing behaviour.
// NOTE(review): in the per-element cleanup the pattern \r\n{3,} quantifies only the \n, unlike
// the grouped (\r\n){3,} applied to the whole text further down - presumably a missing group.
// NOTE(review): the tag-stripping preg_replace next to the per-element cleanup assigns $_html,
// which the following implode of $contentArr overwrites - confirm the intended target.
// NOTE(review): $indent is set to 0 and not modified anywhere in the visible code, so the substr
// on $indentString always yields an empty string - confirm against the complete source.
static function convertHTMLToText($_html,$displayCharset=false,$stripcrl=false,$stripalltags=true,$noRepEmailAddr = false)
// assume input isHTML, but test the input anyway, because,
// if it is not, we may not want to strip whitespace
$isHTML = true;
if (strlen(strip_tags($_html)) == strlen($_html))
$isHTML = false;
// return $_html; // maybe we should not proceed at all
// no charset requested: fall back to the one reported by Api\Translation
if ($displayCharset === false) $displayCharset = Api\Translation::charset();
#print '
#print ""; print htmlspecialchars($_html);
#print "
#print "
// remove whole sections (tag plus content) before any further processing; the cheap stripos
// checks avoid running the regular expressions when the keyword does not occur at all
if (stripos($_html,'style')!==false) self::replaceTagsCompletley($_html,'style'); // clean out empty or pagewide style definitions / left over tags
if (stripos($_html,'head')!==false) self::replaceTagsCompletley($_html,'head'); // Strip out stuff in head
if (stripos($_html,'![if')!==false && stripos($_html,'')!==false) self::replaceTagsCompletley($_html,'!\[if','',false); // Strip out stuff in ifs
if (stripos($_html,'!--[if')!==false && stripos($_html,'')!==false) self::replaceTagsCompletley($_html,'!--\[if','',false); // Strip out stuff in ifs
// pattern list applied in a single preg_replace call together with the $Replace list below
$Rules = array ('@@siU', // Strip out javascript
'@&(quot|#34);@i', // Replace HTML entities
'@&(amp|#38);@i', // Ampersand &
'@&(lt|#60);@i', // Less Than <
'@&(gt|#62);@i', // Greater Than >
'@&(nbsp|#160);@i', // Non Breaking Space
'@&(iexcl|#161);@i', // Inverted Exclamation point
'@&(cent|#162);@i', // Cent
'@&(pound|#163);@i', // Pound
'@&(copy|#169);@i', // Copyright
'@&(reg|#174);@i', // Registered
'@&(trade|#8482);@i', // trade
'@'@i', // singleQuote
'@(\xc2\xa0)@', // nbsp or tab (encoded windows-style)
'@(\xe2\x80\x8b)@', // ZERO WIDTH SPACE
$Replace = array ('',
' ',
'(C)',//chr(169),// copyrighgt
'(R)',//chr(174),// registered
'(TM)',// trade
' ',
$_html = preg_replace($Rules, $Replace, $_html);
// removing carriage return linefeeds, preserve those enclosed in
if ($stripcrl === true )
if (stripos($_html,'')!==false)
$contentArr = self::splithtmlByPRE($_html);
foreach ($contentArr as $k =>&$elem)
if (stripos($elem,'')===false)
//$elem = str_replace('@(\r\n)@i',' ',$elem);
$elem = str_replace(array("\r\n","\n"),($isHTML?'':' '),$elem);
$_html = implode('',$contentArr);
$_html = str_replace(array("\r\n","\n"),($isHTML?'':' '),$_html);
$tags = array (
0 => '~]*>\r*\n*~si',
1 => '~]*>\r*\n*~si',
2 => '~]*>\r*\n*~si',
3 => '~]*>\r*\n*~si',
4 => '~]*>\r*\n*~si',
5 => '~
6 => '~
7 => '~]*>\r*\n*~si',
8 => '~
9 => '~
10 => '~
11 => '//',
12 => '//',
13 => '~
14 => '~]*>~si',
15 => '/<=\s([1234567890])/',
16 => '/>=\s([1234567890])/',
17 => '/<\s([1234567890])/',
18 => '/>\s([1234567890])/',
$Replace = array (
0 => "\r\n",
1 => "\r\n",
2 => "\r\n",
3 => "\r\n",
4 => "\r\n",
5 => "\r\n",
6 => "\r\n",
7 => "\r\n",
8 => "\r\n",
9 => "",
10 => "\r\n__________________________________________________\r\n",
11 => '#blockquote#type#cite#',
12 => '#blockquote#type#cite#',
13 => '#blockquote#end#cite#',
14 => '#blockquote#type#cite#',
15 => '#lowerorequal#than#$1',
16 => '#greaterorequal#than#$1',
17 => '#lower#than#$1',
18 => '#greater#than#$1',
$_html = preg_replace($tags,$Replace,$_html);
$_html = preg_replace('~\s*]*>~si',' - ',$_html);
$_html = preg_replace('~
// replace emailaddresses eclosed in <> (eg.: ) with the emailaddress only (e.g: me@you.de)
if (!$noRepEmailAddr) self::replaceEmailAdresses($_html);
//convert hrefs to description -> URL
//$_html = preg_replace('~]+href=\"([^"]+)\"[^>]*>(.*)~si','[$2 -> $1]',$_html);
$_html = preg_replace_callback('~]+href=\"([^"]+)\"[^>]*>(.*?)~si','self::transform_url2text',$_html);
// reducing double \r\n to single ones, dont mess with pre sections
if ($stripcrl === true && $isHTML)
if (stripos($_html,'')!==false)
$contentArr = self::splithtmlByPRE($_html);
foreach ($contentArr as $k =>&$elem)
if (stripos($elem,'')===false)
//this is supposed to strip out all remaining stuff in tags, this is sometimes taking out whole sections off content
if ( $stripalltags ) {
$_html = preg_replace('~<[^>^@]+>~s','',$_html);
// strip out whitespace inbetween CR/LF
$elem = preg_replace('~\r\n\s+\r\n~si', "\r\n\r\n", $elem);
// strip out / reduce exess CR/LF
$elem = preg_replace('~\r\n{3,}~si',"\r\n\r\n",$elem);
$_html = implode('',$contentArr);
//this is supposed to strip out all remaining stuff in tags, this is sometimes taking out whole sections off content
if ( $stripalltags ) {
$_html = preg_replace('~<[^>^@]+>~s','',$_html);
// strip out whitespace inbetween CR/LF
$_html = preg_replace('~\r\n\s+\r\n~si', "\r\n\r\n", $_html);
// strip out / reduce exess CR/LF
$_html = preg_replace('~(\r\n){3,}~si',"\r\n\r\n",$_html);
//this is supposed to strip out all remaining stuff in tags, this is sometimes taking out whole sections off content
if ( $stripalltags ) {
$_html = preg_replace('~<[^>^@]+>~s','',$_html);
//$_html = strip_tags($_html, '');
// reducing spaces (not for input that was plain text from the beginning)
if ($isHTML) $_html = preg_replace('~ +~s',' ',$_html);
// restoring ampersands
$_html = str_replace('#amper#sand#','&',$_html);
// restoring lower|greater[or equal] than
$_html = str_replace('#lowerorequal#than#','<=',$_html);
$_html = str_replace('#greaterorequal#than#','>=',$_html);
$_html = str_replace('#lower#than#','<',$_html);
$_html = str_replace('#greater#than#','>',$_html);
//error_log(__METHOD__.__LINE__.' Charset:'.$displayCharset.' -> '.$_html);
$_html = Api\Translation::convert($_html, $displayCharset, 'utf-8');
$_html = html_entity_decode($_html, ENT_COMPAT, 'utf-8');
//error_log(__METHOD__.__LINE__.' Charset:'.$displayCharset.' After html_entity_decode: -> '.$_html);
$pos = strpos($_html, 'blockquote');
//error_log("convert HTML2Text: $_html");
if($pos === false) {
return $_html;
} else {
$indent = 0;
$indentString = '';
$quoteParts = preg_split('/#blockquote#type#cite#/', $_html, -1, PREG_SPLIT_OFFSET_CAPTURE);
foreach($quoteParts as $quotePart) {
if($quotePart[1] > 0) {
$indentString .= '>';
$quoteParts2 = preg_split('/#blockquote#end#cite#/', $quotePart[0], -1, PREG_SPLIT_OFFSET_CAPTURE);
foreach($quoteParts2 as $quotePart2) {
if($quotePart2[1] > 0) {
$indentString = substr($indentString, 0, $indent);
$quoteParts3 = explode("\r\n", $quotePart2[0]);
foreach($quoteParts3 as $quotePart3) {
$allowedLength = 76-strlen("\r\n$indentString");
// only break lines, if not already indented
if (substr($quotePart3,0,strlen($indentString)) != $indentString)
if (strlen($quotePart3) > $allowedLength) {
$s=explode(" ", $quotePart3);
$quotePart3 = "";
$linecnt = 0;
foreach ($s as $k=>$v) {
$cnt = strlen($v);
// only break long words within the wordboundaries,
// but it may destroy links, so we check for href and dont do it if we find it
if($cnt > $allowedLength && stripos($v,'href=')===false) {
$v=wordwrap($v, $allowedLength, "\r\n$indentString", true);
// the rest should be broken at the start of the new word that exceeds the limit
if ($linecnt+$cnt > $allowedLength) {
//error_log(__METHOD__.__LINE__.'breaking here:'.$v);
$linecnt = 0;
} else {
$linecnt += $cnt;
if (strlen($v)) $quotePart3 .= (strlen($quotePart3) ? " " : "").$v;
//error_log(__METHOD__.__LINE__.'partString to return:'.$indentString . $quotePart3);
$asciiTextBuff[] = $indentString . $quotePart3 ;
return implode("\r\n",$asciiTextBuff);
/**
 * Split HTML by PRE tag; returns an array in which every pre-section is isolated in an
 * array element of its own
 *
 * @author Leithoff, Klaus
 * @param string $html the HTML to split
 * @return string|array array of parts, or the unaffected HTML string if it contains no pre-section
 */
// Splits $html into consecutive substrings: the text before the first located section, then
// alternately each located section and the text that follows it up to the next section.
// Returns the input string unchanged when the opening needle is not found, otherwise the array
// of parts; concatenating the parts is how the caller in this file rebuilds the text.
// NOTE(review): the needle literals and the end-marker lookup look emptied or truncated in this
// copy of the file; judging by the method name and the fixed offset of 6 they presumably are the
// opening and closing pre tags - confirm against version control.
// NOTE(review): the result of the end-marker stripos is used in arithmetic without a check for
// false, so an unterminated section is presumably not handled - confirm.
static function splithtmlByPRE($html)
$searchFor = '';
$pos = stripos($html,$searchFor);
// nothing to split: hand back the plain string instead of an array
if ($pos === false)
return $html;
// everything in front of the first located section becomes the first part
$html2ret[] = substr($html,0,$pos);
while ($pos!==false)
$endofpre = stripos($html,'
$length = $endofpre-$pos+6;
$html2ret[] = substr($html,$pos,$length);
$searchFor = '';
$pos = stripos($html,$searchFor, $endofpre+6);
$html2ret[] = ($pos ? substr($html,$endofpre+6,$pos-($endofpre+6)): substr($html,$endofpre+6));
return $html2ret;